In [1]:
import pandas as pd
import networkx as nx

In [2]:
def create_citation_graph(dataframe, references_column=7):
    """Build a directed citation graph from a table of papers.

    Every distinct value of the ``"id"`` column becomes a node (as a string).
    For each paper, an edge ``(paper_id, reference_id)`` is added for every
    entry of its reference list that is itself a node, so references to
    papers outside the table are dropped.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain an ``"id"`` column and a column holding, per row, an
        iterable of referenced paper ids.
    references_column : int or str, default 7
        Where to find the reference lists: an integer is treated as a
        column *position*, a string as a column *label*. The default keeps
        the positional lookup the function always used.

    Returns
    -------
    networkx.DiGraph
        Graph whose nodes are stringified paper ids and whose edges point
        from the citing paper to the cited paper.

    Notes
    -----
    When an id occurs on several rows, only the first row is used.
    Progress is printed on a single, carriage-return-refreshed line.
    """
    if isinstance(references_column, str):
        references = dataframe[references_column]
    else:
        references = dataframe.iloc[:, references_column]

    # Keep the first row of every id; a positional mask avoids any index
    # alignment issues when the frame has a non-unique index.
    first_rows = (~dataframe["id"].duplicated()).to_numpy()
    paper_ids = [str(paper_id) for paper_id in dataframe.loc[first_rows, "id"]]
    reference_lists = list(references[first_rows])

    nodes_list = set(paper_ids)
    edges_list = set()

    # Single pass over the rows instead of one full-frame scan per paper.
    total = len(paper_ids)
    for current, (paper_id, paper_references) in enumerate(
        zip(paper_ids, reference_lists), start=1
    ):
        percentage = current / total * 100
        print(f"Current percentage: {percentage: .2f}%", end = "\r")

        try:
            reference_iterator = iter(paper_references)
        except TypeError:
            # Missing reference list (None / NaN): the paper cites nothing.
            continue

        for reference_id in reference_iterator:
            # Nodes are stored as strings, so compare like with like.
            reference_id = str(reference_id)
            if reference_id in nodes_list:
                edges_list.add((paper_id, reference_id))

    graph = nx.DiGraph()
    graph.add_nodes_from(nodes_list)
    graph.add_edges_from(edges_list)

    return graph

In [3]:
# Load the paper records (ids are forced to strings) and build the citation graph.
paper_dataframe = pd.read_json(
    "data/nlp_papers.json",
    dtype={'id': str},
)
citation_graph = create_citation_graph(dataframe=paper_dataframe)

Current percentage:  100.00%

In [4]:
# Run PageRank on the citation graph, then order the (paper_id, score)
# pairs from the highest score to the lowest.
scores = nx.pagerank(citation_graph)
sorted_scores = list(scores.items())
sorted_scores.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_scores)

[('85125701306', 0.005894494408741687), ('85121413226', 0.005886671043025948), ('85145328435', 0.001774451874363547), ('85147846465', 0.0017732750234690936), ('85159639846', 0.0012942947369399716), ('85127267136', 0.0012875203147661711), ('85132156284', 0.0010689866033970835), ('85112567461', 0.0009231468117233715), ('85134345980', 0.0009079644936013541), ('85173572846', 0.0008408345533906672), ('85130398198', 0.0008397407439034611), ('85124138738', 0.0008181920056999742), ('85149104894', 0.0006861388765218111), ('85122429680', 0.0006637955904789957), ('85125362171', 0.0006535381032285002), ('85146493821', 0.0006460922753620506), ('85150379710', 0.000633814876795465), ('85150335084', 0.0006240343872830823), ('85130499422', 0.0006083577715898508), ('85122122541', 0.0005912084329113428), ('85122317264', 0.0005880046085909452), ('85121220398', 0.0005566736240350281), ('85126433074', 0.00055098785874592), ('85128157225', 0.000545914499987097), ('85148212303', 0.0005430355841254193), ('8513

In [5]:
# One column per statistic, rows kept in the PageRank order of sorted_scores.
dataframe_dict = {
    "paper_id": [paper_id for paper_id, _score in sorted_scores],
    # Filled with 0 for every paper in this cell.
    "total_citations": [0 for _pair in sorted_scores],
    # Number of incoming edges the paper has inside the citation graph.
    "graph_citations": [
        len(citation_graph.in_edges(paper_id)) for paper_id, _score in sorted_scores
    ],
    "pagerank": [score for _paper_id, score in sorted_scores],
}

df = pd.DataFrame(dataframe_dict)
df.to_csv("data/stats.csv")

In [5]:
def create_subgraph(graph, start_node):
    """Return the part of ``graph`` that leads into ``start_node``.

    The result contains ``start_node``, every ancestor of it (every node
    from which ``start_node`` can be reached), and every edge of ``graph``
    that points at one of those nodes.

    Parameters
    ----------
    graph : networkx.DiGraph
        The full graph to extract from.
    start_node : hashable
        Node whose ancestors are collected; must be a node of ``graph``.

    Returns
    -------
    networkx.DiGraph
        A new graph holding only node ids and edges (node and edge
        attributes are not copied).
    """
    # nx.ancestors excludes the source itself, so add it explicitly;
    # otherwise a start node without incoming edges would be missing
    # from the result entirely.
    nodes_list = set(nx.ancestors(graph, start_node))
    nodes_list.add(start_node)

    edges_list = []
    for node in nodes_list:
        edges_list.extend(graph.in_edges(node))

    sub = nx.DiGraph()
    sub.add_nodes_from(nodes_list)
    sub.add_edges_from(edges_list)

    return sub

In [None]:
# List every ancestor of the paper with the highest PageRank score, one id per line.
ancestors = nx.ancestors(citation_graph, sorted_scores[0][0])
for item in ancestors:
    print(item)

In [4]:
def get_authors_name(input_list):
    """Look up a display name for each author id in ``input_list``.

    Reads ``data/nlp_authors.json`` on every call. For each id, the column
    labelled ``int(id)`` is selected and its first two values are joined
    with a single space (presumably first and last name; verify against
    the JSON file).

    Returns a list of names in the same order as ``input_list``.
    """
    authors_table = pd.read_json("data/nlp_authors.json")

    def full_name(author_id):
        # Column labels are integers, so ids given as strings are converted.
        name_parts = tuple(authors_table.loc[:, int(author_id)])
        return name_parts[0] + " " + name_parts[1]

    return [full_name(author_id) for author_id in input_list]

In [6]:
# Extract the citation subgraph leading into one specific paper.
sub = create_subgraph(
    graph=citation_graph,
    start_node="85115610435",
)

In [32]:
def get_node_colors(graph):
    pagerank_subgraph_scores = []
    color_gradients = []
    
    for node in graph.nodes:
        for couple in sorted_scores:
            if couple[0] == node: pagerank_subgraph_scores.append(couple)
            
    pagerank_subgraph_scores.sort(key = lambda x : x[1], reverse = True)
    print(pagerank_subgraph_scores)
    
    scores_set = set([x[1] for x in pagerank_subgraph_scores])
    max_score = max(scores_set)
    min_score = min(scores_set)
    
    steps = len(scores_set) - 1
    count = 1
    r_step = (124 - 69) / steps
    g_step = (223 - 145) / steps
    b_step = (218 - 80) / steps 
    
    for item in pagerank_subgraph_scores:
        
        if item[1] == max_score: color_gradients.append( (item[0], item[1], (124, 223, 80)) )
        elif item[1] == min_score: color_gradients.append( (item[0], item[1], (69, 145, 218)) )
        else:
            color_gradients.append( (item[0], item[1], (round(124 - r_step*count, 0), round(223 - g_step*count, 0), round(80 + b_step*count, 0))) )
            count += 1