# Test Code


In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import ijson
import time
start = time.process_time()

In [2]:

papers = []

with open('dblp_v12.json', "rb") as file:
    for i, element in enumerate(ijson.items(file, "item")):

        # Extracting the most valuable data
        paper = {}
        paper['id'] = element['id']
        paper['title'] = element['title']
        paper['authors'] = element.get('authors',np.nan)
        paper['year'] = element.get('year', np.nan)
        paper['n_citation'] = element.get('n_citation', 0)
        paper['doc_type'] = element.get('doc_type', np.nan)
        paper['references'] = element.get('references', np.nan)
        paper['doi'] = f"https://doi.org/{element['doi']}" if 'doi' in element else np.nan
        papers.append(paper)
        if i%100000 ==0:
            print(f"{i}:{round((time.process_time() - start),2)}s ",end="")

0:0.02s 100000:8.19s 200000:16.4s 300000:26.97s 400000:38.46s 500000:50.5s 600000:61.21s 700000:74.05s 800000:86.53s 900000:98.62s 1000000:112.79s 1100000:125.19s 1200000:137.66s 1300000:149.19s 1400000:164.21s 1500000:175.56s 1600000:189.42s 1700000:201.11s 1800000:213.03s 1900000:226.79s 2000000:239.1s 2100000:251.24s 2200000:264.36s 2300000:277.3s 2400000:291.99s 2500000:304.89s 2600000:318.11s 2700000:331.33s 2800000:344.59s 2900000:357.56s 3000000:367.16s 3100000:380.54s 3200000:391.78s 3300000:401.39s 3400000:415.23s 3500000:428.42s 3600000:441.89s 3700000:451.44s 3800000:464.96s 3900000:479.58s 4000000:493.25s 4100000:503.02s 4200000:516.42s 4300000:525.83s 4400000:539.47s 4500000:546.57s 4600000:558.09s 4700000:567.12s 4800000:576.01s 

In [3]:
csvread = pd.DataFrame(papers)


In [4]:
first_10k_rows = csvread.sort_values(by="n_citation",ascending=False).iloc[:10000]

In [65]:
testrows = pd.read_csv("first10k.csv")

In [33]:
first_10k_rows

Unnamed: 0,id,title,authors,year,n_citation,doc_type,references,doi
4696136,2041404167,The Mathematical Theory of Communication,"[{'name': 'C. E. Shannon', 'org': 'External Or...",1949,48327,Book,,https://doi.org/
4630907,1639032689,"Genetic algorithms in search, optimization, an...","[{'name': 'David E. Goldberg', 'id': 2102678951}]",1989,44175,Book,,https://doi.org/
4092588,2912565176,Fuzzy sets,"[{'name': 'Lotfi A. Zadeh', 'id': 2252586558}]",1996,42437,,,https://doi.org/
2937610,2151103935,Distinctive Image Features from Scale-Invarian...,"[{'name': 'David G. Lowe', 'org': 'Computer Sc...",2004,35541,Journal,"[19720318, 1541642243, 1560959218, 1676552347,...",https://doi.org/10.1023/B:VISI.0000029664.9961...
4088311,2911964244,Random Forests,"[{'name': 'Leo Breiman', 'org': 'Statistics De...",2001,34741,,"[1507255258, 1580948147, 1605688901, 197584664...",https://doi.org/10.1023/A:1010933404324
...,...,...,...,...,...,...,...,...
794854,1964830323,An overview of JML tools and applications,"[{'name': 'Lilian Burdy', 'org': 'INRIA Sophia...",2005,596,Conference,"[1486696980, 1489778371, 1492315860, 149894653...",https://doi.org/10.1007/s10009-004-0167-4
1705406,2060553764,Trust and e-commerce: a study of consumer perc...,"[{'name': 'Brian J. Corbitt', 'org': 'School o...",2003,596,Journal,"[1492586516, 1515344919, 1572145800, 170474358...",https://doi.org/10.1016/S1567-4223(03)00024-3
1782398,2068691410,Geographic routing in city scenarios,"[{'name': 'Christian Lochert', 'org': 'Heinric...",2005,596,Journal,"[1554193878, 2101963262, 2151800518, 2156689181]",https://doi.org/10.1145/1055959.1055970
2621703,2147343704,EVENODD: an efficient scheme for tolerating do...,"[{'name': 'M. Blaum', 'org': 'IBM Almaden Rese...",1995,596,Journal,"[1530042190, 1531975040, 1820898047, 182954746...",https://doi.org/10.1109/12.364531


In [8]:
def create_citations_graph(dataframe: pd.DataFrame):
    # Creating a graph
    citation_graph = nx.DiGraph()

    # Adding a node to the graph for each paper in the Dataframe in input
    citation_graph.add_nodes_from(dataframe['id'])

    # Adding directed edges based on citation relationships
    for _, row in dataframe.iterrows():
        paper_id = row['id']
        references = row['references']

        # Only add if the list of references has not NaN value
        if references is not np.nan: 
            # Only adding edges if both of the nodes are already in the graph
           # print([ref for ref in references])
            links = [(paper_id, ref) for ref in references if citation_graph.has_node(paper_id) and citation_graph.has_node(ref)]
            #print(links)
            citation_graph.add_edges_from(links)
           # print(citation_graph)
    
    return citation_graph


In [11]:
def create_collaboration_graph(dataframe: pd.DataFrame):
    # Creating a graph
    collaboration_graph = nx.Graph()

    # Iterating through papers in the DataFrame and adding as nodes all the authors for each paper
    for _, row in dataframe.iterrows():
        authors = row['authors']

        # Adding the author to the graph if there's only a single author without collaborators
        if authors is not np.nan and len(authors) == 1:
            author = authors[0]['id']
            # Adding the author only if they are not already present in the graph
            if not collaboration_graph.has_node(author):
                collaboration_graph.add_node(author)

        # Adding edges if there are least two authors
        if authors is not np.nan and len(authors) > 1:
            # Iterating over all pairs of authors
            for i in range(len(authors)):
                for j in range(i + 1, len(authors)):
                    author1 = authors[i]['id']
                    author2 = authors[j]['id']

                    # Adding the edge or increasing weight of the edge between authors
                    if author1 != author2: 
                        if collaboration_graph.has_edge(author1, author2):
                            # If the edge already exists, increase the weight by 1
                            collaboration_graph[author1][author2]['weight'] += 1
                        else:
                            # If the edge doesn't exist, create a new edge with weight 1. Nodes are automatically added if not already present in the graph
                            collaboration_graph.add_edge(author1, author2, weight=1)

    return collaboration_graph


In [9]:
citation_graph = create_citations_graph(first_10k_rows)


In [13]:
collaboration_graph = create_collaboration_graph(first_10k_rows)


In [10]:
print(citation_graph.number_of_nodes())
print(citation_graph.number_of_edges())


10000
52252


In [14]:
print(collaboration_graph.number_of_nodes())
print(collaboration_graph.number_of_edges())


21235
136478


In [427]:
def funct_1(graph,graph_name):

    if graph_name == "Citation Graph":

        ncit_nodes = graph.number_of_nodes() #nodes
        ncit_edges = graph.number_of_edges() #edge
        cit_graph_density = ncit_edges/(ncit_nodes*(ncit_nodes-1))#since the graph is directed we can compute the graph density using this formula
        print("The number of nodes is",ncit_nodes)
        print("The number of edges is",ncit_edges)
        print("The graph density is",cit_graph_density)
        max_theoretical_edges = int(ncit_nodes*(ncit_nodes-1)) #max edges for an undirected graph, to check if the grapgh is sparse
        nodeinfo = []
        indegreelist = []
        outdegreelist = []
        for node in graph:
            in_degree_node = graph.in_degree(node) 
            out_degree_node = graph.out_degree(node)
            total_degree_node =  graph.degree(node) #in_degree + out_degree
            indegreelist.append(in_degree_node)
            outdegreelist.append(out_degree_node)
            nodeinfo.append(total_degree_node)

        print("The (sorted) distribution of in_degrees is",sorted((indegreelist),reverse=True)) #plot the in_degree distrib
        print("The (sorted) distribution of out_degrees is",sorted((outdegreelist),reverse=True)) #plot the out_degree distrib
        print("The (sorted) distribution of degrees is",sorted((nodeinfo),reverse=True)) #we can plot 3 distribution, the in_degree, the out_degree and the total_degree
        sum_deg = sum(nodeinfo) 
        print("The average graph degree is",sum_deg/ncit_nodes)
        p = np.percentile(nodeinfo, 95)  # return 95th percentile
        node_hubs = [(node,graph.degree(node)) for node in graph if graph.degree(node) > p]
        print("The hubs are",node_hubs)
        threshold = ncit_edges/max_theoretical_edges
        if threshold > 0.7:
            print("The ratio between the number of edges and the theoretical maximum is",threshold,"hence the number of edges is close to the maximum theoretical value: The graph is DENSE")
        else: print("The ratio between the number of edges and the theoretical maximum is",threshold,"hence the number of edges is lover than the maximum theoretical value: The graph is SPARSE")

    if graph_name == "Collaboration Graph": 
        ncoll_nodes = graph.number_of_nodes() #number nodes
        ncoll_edges = graph.number_of_edges() #number edges
        coll_graph_density = 2*ncoll_edges/(ncoll_nodes*(ncoll_nodes-1)) #since the graph is undirected we can compute the graph density using this formula
        print("The number of nodes is",ncoll_nodes)
        print("The number of edges is",ncoll_edges)
        print("The graph density is",coll_graph_density)
        max_theoretical_edges = int(ncoll_nodes*(ncoll_nodes-1)/2) #max edges for an undirected graph, to check if the grapgh is sparse
        nodeinfo = []
        for node in graph:
            node_degree = graph.degree(node)
            nodeinfo.append(node_degree)
        degree_sequence = sorted(nodeinfo, reverse=True) #list of degrees of nodes, to get the distribution we can use a Counter (how many times the degree appears)
        print("The (sorted) distribution of degrees is",degree_sequence)
        sum_deg = sum(degree_sequence) #Handshaking Lemma, so the sum of each degree is just 2*number of edges
        print("The average graph degree is",sum_deg/ncoll_nodes)
        p = np.percentile(degree_sequence, 95)  # return 95th percentile
        node_hubs = [(node,graph.degree(node)) for node in graph if graph.degree(node) > p]
        print("The hubs are",node_hubs)
        threshold = ncoll_edges/max_theoretical_edges
        if threshold > 0.7:
            print("The ratio between the number of edges and the theoretical maximum is",threshold,"hence the number of edges is close to the maximum theoretical value: The graph is DENSE")
        else: print("The ratio between the number of edges and the theoretical maximum is",threshold,"hence the number of edges is lover than the maximum theoretical value: The graph is SPARSE")


    return 



In [428]:
funct_1(citation_graph,"Citation Graph")

The number of nodes is 10000
The number of edges is 52252
The graph density is 0.0005225722572257226
The (sorted) distribution of in_degrees is [131, 120, 117, 104, 95, 94, 89, 87, 78, 76, 76, 74, 71, 71, 70, 69, 68, 68, 68, 68, 67, 67, 66, 66, 65, 64, 64, 63, 63, 63, 62, 62, 61, 61, 61, 60, 60, 59, 59, 59, 58, 58, 57, 56, 56, 55, 55, 55, 54, 54, 54, 54, 53, 53, 53, 53, 52, 51, 49, 48, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 44, 44, 44, 44, 44, 44, 43, 43, 43, 43, 43, 43, 43, 43, 43, 42, 42, 42, 41, 40, 40, 40, 40, 40, 40, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 37, 37, 37, 37, 37, 37, 37, 37, 36, 36, 36, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27,

In [429]:
funct_1(collaboration_graph,"Collaboration Graph")

The number of nodes is 21235
The number of edges is 136478
The graph density is 0.0006053528157956642
The (sorted) distribution of degrees is [347, 326, 304, 296, 284, 284, 282, 279, 278, 276, 273, 272, 272, 271, 270, 268, 267, 267, 267, 266, 266, 265, 263, 262, 262, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 26

In [379]:
def funct_2(graph,select_node_id,graph_name):

    if graph_name == "Citation Graph":
        selected_node_subgraph = graph.subgraph(graph[select_node_id]) #create a subgraph to analyze 
        print(nx.in_degree_centrality(selected_node_subgraph)) #we are interested in in-degree, since in-links are given by other nodes in the network, while out-links are determined by the node itself
        print(nx.betweenness_centrality(selected_node_subgraph))
        print(nx.closeness_centrality(selected_node_subgraph))
        print(nx.pagerank(selected_node_subgraph))  

    if graph_name == "Collaboration Graph":
        selected_node_subgraph = graph.subgraph(graph[select_node_id])
        print(nx.degree_centrality(selected_node_subgraph))
        print(nx.betweenness_centrality(selected_node_subgraph))
        print(nx.closeness_centrality(selected_node_subgraph))
        print(nx.pagerank(selected_node_subgraph))    


    return
    

In [380]:
funct_2(citation_graph,1981025032,"Citation Graph") #selection of the id and graph should be left to the user

{2149723649: 0.0, 2020246210: 0.23076923076923078, 2054658115: 0.0, 2165758113: 0.3076923076923077, 1580142630: 0.07692307692307693, 2084544490: 0.3076923076923077, 2137983211: 0.3076923076923077, 1971735090: 0.23076923076923078, 2103496339: 0.23076923076923078, 2166116275: 0.0, 1990517717: 0.0, 2171277043: 0.15384615384615385, 1570802136: 0.07692307692307693, 2076118331: 0.0}
{2149723649: 0.0, 2020246210: 0.14743589743589744, 2054658115: 0.0, 2165758113: 0.038461538461538464, 1580142630: 0.0, 2084544490: 0.1346153846153846, 2137983211: 0.01282051282051282, 1971735090: 0.05769230769230769, 2103496339: 0.04487179487179487, 2166116275: 0.0, 1990517717: 0.0, 2171277043: 0.04487179487179487, 1570802136: 0.0, 2076118331: 0.0}
{2149723649: 0.0, 2020246210: 0.3282051282051282, 2054658115: 0.0, 2165758113: 0.37869822485207105, 1580142630: 0.26525198938992045, 2084544490: 0.3282051282051282, 2137983211: 0.3282051282051282, 1971735090: 0.32793522267206476, 2103496339: 0.3076923076923077, 2166116

In [381]:
funct_2(collaboration_graph,2104401652,"Collaboration Graph") #selection of the id and graph should be left to the user

{2362074882: 0.06666666666666667, 2141511558: 0.016666666666666666, 2100217226: 0.03333333333333333, 2149153931: 0.18333333333333332, 2616052238: 0.05, 1699311759: 0.03333333333333333, 2493665552: 0.06666666666666667, 2079795601: 0.016666666666666666, 2127568146: 0.016666666666666666, 2119321235: 0.06666666666666667, 2117954069: 0.21666666666666667, 2614855417: 0.03333333333333333, 2162130455: 0.06666666666666667, 2235360923: 0.16666666666666666, 2677226523: 0.08333333333333333, 2146330397: 0.16666666666666666, 2342552992: 0.03333333333333333, 2079584417: 0.16666666666666666, 2397079202: 0.08333333333333333, 2517910439: 0.05, 2136811176: 0.06666666666666667, 2613931434: 0.08333333333333333, 2429370538: 0.16666666666666666, 2289542319: 0.03333333333333333, 2169116592: 0.16666666666666666, 2105326385: 0.06666666666666667, 2114426036: 0.15, 2143201973: 0.08333333333333333, 2112851132: 0.03333333333333333, 2128362942: 0.05, 1994222016: 0.16666666666666666, 2367911745: 0.1, 2141588545: 0.03