In [1]:
import pandas as pd
import networkx as nx

In [2]:
def create_citation_graph(dataframe, references_column=7):
    """Build a directed citation graph from a table of papers.

    Every value of ``dataframe["id"]`` becomes a node (as ``str``). For each
    paper, an edge ``paper -> reference`` is added for every entry of its
    reference list that is itself a node of the graph; references to papers
    outside the table are ignored.

    Progress is printed on a single line (carriage-return terminated).

    Args:
        dataframe: pandas DataFrame with an ``"id"`` column and a column
            holding, for each paper, an iterable of cited ids.
            Assumes the cited ids are strings comparable with ``str(id)``
            -- TODO confirm against the JSON schema.
        references_column: position (int) or label (str) of the references
            column. Defaults to position 7, the position this notebook has
            always read.

    Returns:
        networkx.DiGraph with one node per unique paper id and one edge per
        in-table citation.
    """
    graph = nx.DiGraph()

    # One row per id, keeping the first occurrence: this is what the former
    # per-paper boolean-mask lookup followed by ``.iloc[0]`` selected, but in a
    # single pass instead of one full scan of the frame per paper.
    unique_papers = dataframe.drop_duplicates(subset="id", keep="first")
    if unique_papers.empty:
        return graph

    paper_ids = [str(paper_id) for paper_id in unique_papers["id"]]
    if isinstance(references_column, str):
        reference_lists = unique_papers[references_column]
    else:
        reference_lists = unique_papers.iloc[:, references_column]

    known_ids = set(paper_ids)
    total = len(paper_ids)  # unique papers, so the counter really ends at 100%
    edges = []

    for current, (paper_id, references) in enumerate(zip(paper_ids, reference_lists), start=1):
        percentage = current / total * 100
        print(f"Current percentage: {percentage: .2f}%", end="\r")

        # A missing reference list shows up as None or a float NaN; neither is
        # iterable, so treat both as "cites nothing" instead of crashing.
        if references is None or isinstance(references, float):
            continue

        for reference_id in references:
            if reference_id in known_ids:
                edges.append((paper_id, reference_id))

    # Lists (not sets) keep node/edge insertion order tied to the input order,
    # so the graph is built identically on every run; DiGraph itself collapses
    # duplicate edges.
    graph.add_nodes_from(paper_ids)
    graph.add_edges_from(edges)
    return graph


def create_weighted_citation_graph(papers, similarities, references_column=7, weight_column=1):
    """Build a directed citation graph whose edges carry a similarity weight.

    Nodes and edges are selected as in the unweighted graph: every value of
    ``papers["id"]`` is a node (as ``str``) and an edge ``paper -> reference``
    exists for every cited id that is also a node. The weight of each edge is
    looked up in ``similarities`` under the key ``"<paper id>_<reference id>"``.

    Progress is printed on a single line (carriage-return terminated).

    Args:
        papers: pandas DataFrame with an ``"id"`` column and a column holding,
            for each paper, an iterable of cited ids (assumed to be strings
            -- TODO confirm against the JSON schema).
        similarities: pandas DataFrame with an ``"id"`` column holding the
            ``"<citing>_<cited>"`` keys and a column holding the weight.
        references_column: position (int) or label (str) of the references
            column in ``papers``. Defaults to position 7.
        weight_column: position (int) or label (str) of the weight column in
            ``similarities``. Defaults to position 1.

    Returns:
        networkx.DiGraph with a ``weight`` attribute on every edge.

    Raises:
        IndexError: if a citation has no entry in ``similarities`` (same
            exception type as before, now naming the missing key).
    """
    graph = nx.DiGraph()

    # One row per id, keeping the first occurrence (what the former
    # boolean-mask lookup + ``.iloc[0]`` selected), without rescanning the
    # whole frame for every paper.
    unique_papers = papers.drop_duplicates(subset="id", keep="first")
    if unique_papers.empty:
        return graph

    paper_ids = [str(paper_id) for paper_id in unique_papers["id"]]
    if isinstance(references_column, str):
        reference_lists = unique_papers[references_column]
    else:
        reference_lists = unique_papers.iloc[:, references_column]

    if isinstance(weight_column, str):
        weights = similarities[weight_column]
    else:
        weights = similarities.iloc[:, weight_column]

    # Index the similarities once instead of scanning the table for every
    # edge. setdefault keeps the first row for a repeated key, as .iloc[0] did.
    weight_by_key = {}
    for key, weight in zip(similarities["id"], weights):
        weight_by_key.setdefault(key, weight)

    known_ids = set(paper_ids)
    total = len(paper_ids)  # unique papers, so the counter really ends at 100%
    weighted_edges = []

    for current, (paper_id, references) in enumerate(zip(paper_ids, reference_lists), start=1):
        percentage = current / total * 100
        print(f"Current percentage: {percentage: .2f}%", end="\r")

        # A missing reference list shows up as None or a float NaN; neither is
        # iterable, so treat both as "cites nothing" instead of crashing.
        if references is None or isinstance(references, float):
            continue

        for reference_id in references:
            if reference_id not in known_ids:
                continue
            key = f"{paper_id}_{reference_id}"
            if key not in weight_by_key:
                raise IndexError(f"no similarity entry found for citation {key!r}")
            weighted_edges.append((paper_id, reference_id, weight_by_key[key]))

    graph.add_nodes_from(paper_ids)
    graph.add_weighted_edges_from(weighted_edges)
    return graph

In [3]:
# Load the paper table (ids forced to str) and build the unweighted citation graph from it.
paper_dataframe = pd.read_json("data/nlp_papers.json", dtype={'id': str})
citation_graph = create_citation_graph(paper_dataframe)

Current percentage:  100.00%

In [8]:
# PageRank of every node in the unweighted graph, then (node, score) pairs
# ordered from the highest score to the lowest.
scores = nx.pagerank(citation_graph)
sorted_scores = sorted(
    scores.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

In [6]:
# Load the paper table and the similarity table (ids forced to str in both),
# then build the citation graph with similarity-weighted edges.
paper_dataframe = pd.read_json("data/nlp_papers.json", dtype={'id': str})
similarity_dataframe = pd.read_json("data/similarities.json", dtype={'id': str})
weighted_citation_graph = create_weighted_citation_graph(paper_dataframe, similarity_dataframe)

Current percentage:  100.00%

In [9]:
# PageRank of every node in the weighted graph, then (node, score) pairs
# ordered from the highest score to the lowest.
weighted_scores = nx.pagerank(weighted_citation_graph)
weighted_sorted_scores = sorted(
    weighted_scores.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

In [5]:
def get_nodes_stats(graph, scores, weighted, output_path=None):
    """Write per-paper citation statistics to a CSV file.

    One row is written per entry of ``scores``, in the given order, with the
    columns ``paper_id``, ``total_citations``, ``graph_citations`` and
    ``pagerank``. ``graph_citations`` is the number of incoming edges of the
    paper in ``graph``. ``total_citations`` is always written as 0 (the
    visible code never computes it).

    Args:
        graph: graph object exposing ``in_edges(node)``.
        scores: iterable of ``(paper_id, pagerank)`` pairs.
        weighted: selects the default output file when ``output_path`` is not
            given: ``data/weighted_stats.csv`` if true, otherwise
            ``data/regular_stats.csv``.
        output_path: optional destination overriding the default chosen by
            ``weighted``.

    Returns:
        None. The table is only written to disk.
    """
    # Materialise once so one-shot iterables (generators) are supported too.
    entries = list(scores)

    stats = pd.DataFrame({
        "paper_id": [entry[0] for entry in entries],
        "total_citations": [0] * len(entries),
        "graph_citations": [len(graph.in_edges(entry[0])) for entry in entries],
        "pagerank": [entry[1] for entry in entries],
    })

    if output_path is None:
        output_path = "data/weighted_stats.csv" if weighted else "data/regular_stats.csv"
    stats.to_csv(output_path)

In [7]:
# Export the statistics of both graphs; the last argument selects the weighted or regular CSV file.
get_nodes_stats(citation_graph, sorted_scores, False)
get_nodes_stats(weighted_citation_graph, weighted_sorted_scores, True)

In [34]:
def create_subgraph(graph, start_node):
    """Extract the part of ``graph`` that leads into ``start_node``.

    The result contains ``start_node``, every ancestor of it (every node with
    a directed path to ``start_node``) and every edge of ``graph`` that points
    into one of those nodes, together with the edge's data.

    Args:
        graph: directed graph to extract from.
        start_node: node whose ancestors are collected.

    Returns:
        A new networkx.DiGraph; ``graph`` itself is not modified.
    """
    ancestors = nx.ancestors(graph, start_node)

    sub = nx.DiGraph()
    # nx.ancestors does not include the start node itself. Add it explicitly:
    # previously it only entered the subgraph as an edge endpoint, so a start
    # node without incoming edges produced an empty result.
    sub.add_nodes_from([*ancestors, start_node])

    # Incoming edges of the start node first, then of each ancestor.
    sub.add_edges_from(graph.in_edges(start_node, data=True))
    for node in ancestors:
        sub.add_edges_from(graph.in_edges(node, data=True))
    return sub

In [37]:
# Build the subgraph leading into one specific paper and list its edges with their data.
sub = create_subgraph(citation_graph, "85173572846")
for edge in sub.edges(data = True):
    print(edge)

('85173568550', '85173572846', {})
('85173568550', '85173566625', {})
('85173568550', '85173556632', {})
('85173572600', '85173572846', {})
('85173563565', '85173572846', {})
('85173557933', '85173572846', {})
('85175331754', '85173559430', {})
('85175346912', '85173559430', {})
('85173571982', '85173572846', {})
('85173558408', '85173572846', {})
('85173563813', '85173572846', {})
('85173559671', '85173572846', {})
('85177180081', '85173572846', {})
('85177180081', '85173566625', {})
('85173557366', '85173572846', {})
('85173563873', '85173572846', {})
('85173563873', '85173568550', {})
('85173563873', '85173572600', {})
('85175300943', '85173559430', {})
('85173561082', '85173572846', {})
('85173572746', '85173572846', {})
('85173557488', '85173572846', {})
('85173572983', '85173572846', {})
('85173572983', '85173556632', {})
('85173566625', '85173572846', {})
('85173566625', '85173568550', {})
('85173563969', '85173572846', {})
('85173556632', '85173572846', {})
('85173556632', '851

In [4]:
def get_authors_name(input_list, authors_path="data/nlp_authors.json"):
    """Resolve author ids to display names.

    The authors file is loaded with ``pd.read_json`` and indexed by column:
    each id in ``input_list`` is converted with ``int`` and used as a column
    label, and the first two values of that column are joined with a space.
    Presumably these are the first name and the surname -- TODO confirm
    against the authors JSON file.

    Args:
        input_list: iterable of author ids (anything ``int()`` accepts).
        authors_path: location of the authors JSON file. Defaults to the
            path this notebook has always used.

    Returns:
        List of ``"<first value> <second value>"`` strings, in input order.

    Raises:
        KeyError: if an id has no column in the authors file.
    """
    authors_table = pd.read_json(authors_path)

    names = []
    for author_id in input_list:
        name_parts = tuple(authors_table.loc[:, int(author_id)])
        names.append(name_parts[0] + " " + name_parts[1])
    return names

In [1]:
def get_node_colors(graph):
    pagerank_subgraph_scores = []
    color_gradients = []
    
    for node in graph.nodes:
        for couple in sorted_scores:
            if couple[0] == node: pagerank_subgraph_scores.append(couple)
            
    pagerank_subgraph_scores.sort(key = lambda x : x[1], reverse = True)

    
    scores_set = set([x[1] for x in pagerank_subgraph_scores])
    max_score = max(scores_set)
    min_score = min(scores_set)
    
    steps = len(scores_set) - 1
    count = 1
    r_step = (124 - 69) / steps
    g_step = (223 - 145) / steps
    b_step = (218 - 80) / steps 
    
    for item in pagerank_subgraph_scores:
        if item[1] == max_score: color_gradients.append( (item[0], item[1], (124, 223, 80)) )
        elif item[1] == min_score: color_gradients.append( (item[0], item[1], (69, 145, 218)) )
        else:
            color_gradients.append( (item[0], item[1], (round(124 - r_step*count, 0), 
                                                        round(223 - g_step*count, 0), 
                                                        round(80 + b_step*count, 0))
                                    ) 
                                   )
            count += 1