In [None]:
from graph import *
from sknetwork.ranking import PageRank, top_k
from sknetwork.visualization import svg_graph
import numpy_ml.utils.graphs as ngr
from scipy import sparse
import pandas as pd
import numpy as np

In [2]:
def find_empty_references(dataset, references_column=7):
    """Return the ids of the papers whose references entry is empty.

    Every distinct id is inspected once, using the first row that carries it,
    and ids are reported in the order in which they first appear in
    ``dataset``. While scanning, a progress percentage is printed and
    overwritten in place with a carriage return.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with an ``"id"`` column and, at position ``references_column``,
        a column whose cells support ``len()``.
    references_column : int, optional
        Zero-based position of the references column. Defaults to 7, the
        position that used to be hard-coded here.

    Returns
    -------
    list
        The ids whose references cell has length 0.

    Raises
    ------
    TypeError
        If a references cell does not support ``len()``.
    """
    paper_ids = dataset["id"]
    if len(paper_ids) == 0:
        # Nothing to scan; also avoids touching the references column of an
        # empty frame that may not have that many columns.
        return []

    # Progress is measured against the ids actually visited, so it reaches
    # 100% even when the frame contains repeated ids.
    total = len(set(paper_ids))

    # Walk the id and references columns in lockstep instead of re-filtering
    # the whole frame for every id (which made the scan quadratic).
    reference_cells = dataset.iloc[:, references_column]

    empty_list = []
    seen = set()
    for paper_id, references in zip(paper_ids, reference_cells):
        if paper_id in seen:
            continue  # only the first row of a repeated id is considered
        seen.add(paper_id)

        percentage = len(seen) / total * 100
        print(f"Current percentage: {percentage: .2f}%", end = "\r")

        if len(references) == 0:
            empty_list.append(paper_id)
    return empty_list

def find_citations(dataset, list, references_column=7):
    """Return the entries of ``list`` that no paper in ``dataset`` references.

    The previous implementation cleared its flag as soon as a single paper
    did not reference the item, so it reported every item unless *all*
    papers cited it. An item is now reported only when it appears in no
    references cell at all.

    Ids are compared by their string form, the same normalisation
    ``create_citation_graph`` applies to node ids, so integer ids match
    string references and vice versa. For repeated ids only the first row is
    considered. A progress percentage is printed and overwritten in place
    while the items are checked.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with an ``"id"`` column and, at position ``references_column``,
        a column of iterables of referenced ids.
    list : sequence
        Candidate ids to check. The name shadows the builtin inside this
        function; it is kept so existing keyword callers keep working.
    references_column : int, optional
        Zero-based position of the references column. Defaults to 7, the
        position that used to be hard-coded here.

    Returns
    -------
    list
        The items of ``list``, in their original order, that are not
        referenced by any paper.

    Raises
    ------
    TypeError
        If a references cell is not iterable.
    """
    # Collect every referenced id once, instead of rescanning the whole frame
    # for each (item, paper) pair.
    cited = set()
    paper_ids = dataset["id"]
    if len(paper_ids) > 0:
        seen = set()
        reference_cells = dataset.iloc[:, references_column]
        for paper_id, references in zip(paper_ids, reference_cells):
            if paper_id in seen:
                continue  # only the first row of a repeated id is considered
            seen.add(paper_id)
            cited.update(str(reference_id) for reference_id in references)

    empty_list = []
    total = len(list)
    for current, item in enumerate(list, start=1):
        percentage = current / total * 100
        print(f"Current percentage: {percentage: .2f}%", end = "\r")

        if str(item) not in cited:
            empty_list.append(item)
    return empty_list
    
def remove_from_dataset(dataset, list):
    """Return ``dataset`` without the rows whose ``"id"`` appears in ``list``.

    The row count is printed before and after the removal. The input frame
    is not modified; a filtered frame is returned.

    Rows are selected with a single boolean mask on the ``"id"`` column.
    The previous per-id ``drop`` by index label copied the frame once per id
    and, when index labels were repeated, also removed unrelated rows that
    happened to share a label with a matching row.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with an ``"id"`` column.
    list : iterable
        Ids to remove. The name shadows the builtin inside this function; it
        is kept so existing keyword callers keep working.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with their original index labels.
    """
    print(f"Dataset row before the elimination: {len(dataset.axes[0])}")
    # ``list`` is the parameter here, so ``tuple`` is used to materialise any
    # iterable into something ``isin`` accepts.
    unwanted = tuple(list)
    dataset = dataset[~dataset["id"].isin(unwanted)]
    print(f"Dataset row after the elimination: {len(dataset.axes[0])}")
    return dataset

In [3]:
def create_citation_graph(dataset, references_column=7):
    """Build a citation graph from ``dataset``.

    One node is added per distinct paper id (as a string). For every paper,
    an edge ``paper -> reference`` is added for each reference whose string
    form is the id of a paper in ``dataset``; references to papers outside
    the dataset are ignored. For repeated ids only the first row is used.

    Nodes and edges are inserted in the order the ids first appear in
    ``dataset``. The previous version iterated a ``set`` of ids, whose order
    is arbitrary and, for string ids, changes between interpreter runs.
    A progress percentage is printed and overwritten in place while edges
    are added.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with an ``"id"`` column and, at position ``references_column``,
        a column of iterables of referenced ids.
    references_column : int, optional
        Zero-based position of the references column. Defaults to 7, the
        position that used to be hard-coded here.

    Returns
    -------
    Graph
        The populated graph (nodes and edges are string ids).

    Raises
    ------
    TypeError
        If a references cell is not iterable.
    """
    graph = Graph()

    paper_ids = dataset["id"]
    if len(paper_ids) == 0:
        # Nothing to add; also avoids touching the references column of an
        # empty frame that may not have that many columns.
        return graph

    # Single pass over the frame: remember the references of the first row
    # seen for each id. Dict insertion order keeps the dataset order.
    references_by_id = {}
    for paper_id, references in zip(paper_ids, dataset.iloc[:, references_column]):
        references_by_id.setdefault(paper_id, references)

    paper_in_dataset = set()
    for paper_id in references_by_id:
        node = str(paper_id)
        paper_in_dataset.add(node)
        graph.add_node(node)

    # Progress is measured against the ids actually visited, so it reaches
    # 100% even when the frame contains repeated ids.
    total = len(references_by_id)
    for current, (paper_id, references) in enumerate(references_by_id.items(), start=1):
        percentage = current / total * 100
        print(f"Current percentage: {percentage: .2f}%", end = "\r")

        source = str(paper_id)
        for reference_id in references:
            # Node ids are strings, so the reference is normalised the same
            # way before the membership test and before becoming an edge end.
            target = str(reference_id)
            if target in paper_in_dataset:
                graph.add_edge(source, target)

    return graph

In [None]:
# Load the paper records from the JSON file into a DataFrame.
paper_dataframe = pd.read_json("data/nlp_papers.json")
# Ids of the papers whose references entry has length 0.
lista = find_empty_references(paper_dataframe)
# Rebind the name to the frame with those papers removed.
paper_dataframe = remove_from_dataset(paper_dataframe, lista)

In [4]:
# NOTE(review): this reloads the raw file, so any rows removed from
# paper_dataframe by an earlier remove_from_dataset call are present again
# here — confirm the graph is meant to include papers without references.
paper_dataframe = pd.read_json("data/nlp_papers.json")
# One node per paper id; edges come from each paper's references.
citation_graph = create_citation_graph(paper_dataframe)

Current percentage:  100.00%

In [None]:
# For every key of the graph dictionary, print how many entries are stored
# under it, then print the total over all keys.
# NOTE(review): whether those entries are outgoing references or incoming
# citations depends on Graph.get_dict() — confirm the "Citation counts" label.
dictionary = citation_graph.get_dict()

for key, value in dictionary.items():
    print(f"Paper with id -> {key}\t\tCitation counts -> {len(value)}")

citation_counts = sum(len(value) for _, value in dictionary.items())
print(citation_counts)

In [5]:
def pagerank_results(graph: ngr.DiGraph):
    """Rank the vertices of ``graph`` by PageRank score, highest first.

    The graph's matrix form is passed to a default-configured ``PageRank``
    and the i-th score is paired with the i-th entry of ``graph.vertices``
    (assumes ``to_matrix()`` orders rows like ``vertices`` — TODO confirm).

    Returns a list of ``(vertex, score)`` tuples sorted by descending score;
    vertices with equal scores keep their order from ``graph.vertices``.
    """
    scores = PageRank().fit_transform(graph.to_matrix())

    ranking = [
        (vertex, scores[position])
        for position, vertex in enumerate(graph.vertices)
    ]

    # sorted() is stable, so ties stay in vertex order even with reverse=True.
    return sorted(ranking, key = lambda pair: pair[1], reverse = True)

In [None]:
# Re-express the citation graph as a numpy_ml DiGraph: the vertices are the
# keys of the graph dictionary and every edge is created with w=None.
dictionary = citation_graph.get_dict()
nodes = list(dictionary.keys())

edges = [
    ngr.Edge(fr = couple[0], to = couple[1], w = None)
    for couple in citation_graph.get_edges()
]

numpy_graph = ngr.DiGraph(V = nodes, E = edges)

# Score the vertices with PageRank on the graph's matrix form.
pagerank = PageRank()
scores = pagerank.fit_transform(numpy_graph.to_matrix())

# For a (vertex, score) listing sorted by descending score, call
# pagerank_results(numpy_graph) and print its items.