In [1]:
import pandas as pd
import networkx as nx

In [2]:
def create_citation_graph(dataframe, references_column=7):
    """Build a directed citation graph from a table of papers.

    Every distinct value of the ``id`` column becomes a node, stored as a
    string. For each paper an edge ``paper -> reference`` is added for every
    entry of its reference list that is itself a paper in the table;
    references to papers outside the table are ignored.

    When an id occurs on several rows, only the first such row is used.
    Progress is printed to stdout as a percentage on a single,
    carriage-return-refreshed line.

    Args:
        dataframe: pandas DataFrame with an ``id`` column and a column whose
            cells hold an iterable of referenced paper ids.
        references_column: Where to find the references. A built-in ``int``
            is always treated as a column position, anything else as a
            column label. Defaults to position 7.

    Returns:
        networkx.DiGraph whose nodes are the stringified paper ids and whose
        edges point from the citing paper to the cited paper.

    Raises:
        KeyError: If ``dataframe`` has no ``id`` column, or the given
            references label does not exist.
        IndexError: If a positional ``references_column`` is out of bounds.
    """
    graph = nx.DiGraph()

    # One row per paper id (first occurrence wins), so the frame is walked
    # once instead of being re-filtered for every single id.
    unique_rows = dataframe.drop_duplicates(subset="id", keep="first")
    total = len(unique_rows)
    if total == 0:
        return graph

    if isinstance(references_column, int):
        references_series = unique_rows.iloc[:, references_column]
    else:
        references_series = unique_rows[references_column]

    nodes_list = {str(paper_id) for paper_id in unique_rows["id"]}
    edges_list = set()

    rows = zip(unique_rows["id"], references_series)
    for current, (paper_id, references) in enumerate(rows, start=1):
        # The denominator is the number of papers actually iterated, so the
        # counter ends at exactly 100% even when ids are duplicated.
        percentage = current / total * 100
        print(f"Current percentage: {percentage: .2f}%", end = "\r")

        try:
            reference_ids = iter(references)
        except TypeError:
            # Missing reference list (None / NaN): the paper cites nothing.
            continue

        source = str(paper_id)
        for reference_id in reference_ids:
            # Node ids are strings, so normalise before the membership test;
            # otherwise non-string reference ids could never match.
            target = str(reference_id)
            if target in nodes_list:
                edges_list.add((source, target))

    graph.add_nodes_from(nodes_list)
    graph.add_edges_from(edges_list)

    return graph

In [3]:
# Load the paper records from JSON and build the directed citation graph.
# create_citation_graph prints a progress percentage while it runs.
paper_dataframe = pd.read_json("data/nlp_papers.json")
citation_graph = create_citation_graph(paper_dataframe)

Current percentage:  100.00%

In [4]:
# Score every paper with PageRank over the citation graph, then order the
# (paper_id, score) pairs from the highest score to the lowest.
scores = nx.pagerank(citation_graph)
sorted_scores = sorted(
    scores.items(),
    key=lambda id_and_score: id_and_score[1],
    reverse=True,
)

In [26]:
# Build the per-paper statistics table in PageRank order (highest first)
# and write it to disk as CSV.
ranked_paper_ids = [entry[0] for entry in sorted_scores]

dataframe_dict = {
    "paper_id": ranked_paper_ids,
    # Placeholder column: every paper is recorded with 0 here.
    "total_citations": [0] * len(sorted_scores),
    # Number of citations a paper receives from papers inside the graph.
    "graph_citations": [
        len(citation_graph.in_edges(paper_id)) for paper_id in ranked_paper_ids
    ],
    "pagerank": [entry[1] for entry in sorted_scores],
}

df = pd.DataFrame(dataframe_dict)
df.to_csv("data/stats.csv")

In [12]:
def create_subgraph(graph, start_node):
    """Extract the part of ``graph`` that leads into ``start_node``.

    The result contains ``start_node`` together with all of its ancestors
    (every node that has a directed path to it) and every edge of ``graph``
    that ends in one of those nodes. Node and edge attributes are not
    copied.

    Args:
        graph: Directed networkx graph to extract from.
        start_node: Node of ``graph`` whose ancestry is wanted.

    Returns:
        A new networkx.DiGraph. ``start_node`` is always present, even when
        it has no incoming edges.
    """
    # nx.ancestors leaves the start node itself out, so add it explicitly;
    # otherwise a start node without incoming edges would yield an empty graph.
    members = set(nx.ancestors(graph, start_node))
    members.add(start_node)

    sub = nx.DiGraph()
    sub.add_nodes_from(members)

    # Any predecessor of a member can itself reach start_node, so these
    # edges never introduce a node outside ``members``.
    for node in members:
        sub.add_edges_from(graph.in_edges(node))

    return sub

In [None]:
# Take the paper with the highest PageRank and extract the subgraph of
# papers that cite it, directly or through a chain of citations.
top_ranked_paper = sorted_scores[0][0]
subgraph = create_subgraph(citation_graph, top_ranked_paper)
print(subgraph)