In [1]:
import ijson
import heapq
import networkx as nx
import pandas as pd
import itertools
from itertools import combinations
from collections import Counter

import pickle
import heapq


In [2]:
# Load the pre-built graph data for the citation and collaboration graphs.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load files that come from a trusted source.
# `with` guarantees each file is closed even if unpickling raises.
with open("citation.pkl",'rb') as file:
    citation_data = pickle.load(file)

with open("collaboration.pkl",'rb') as file:
    collaboration_data = pickle.load(file)

Define the `graph` class used by the functionalities below.

In [3]:
class graph:
    """Minimal edge-list graph with named edge attributes.

    Attributes:
        nodes: the node collection, stored exactly as passed in.
        edges: list of edges; only ``edge[0]`` and ``edge[1]`` are read.
            Edges are used as dictionary keys and set members, so they
            must be hashable (e.g. tuples).
        edges_attributes: ``{attribute_name: {edge: value}}``.
        isdirected: True when ``edge[0] -> edge[1]`` is a one-way edge.
    """

    def __init__(self, V: list, E: list, attributes: dict, directed: bool):
        self.nodes = V
        self.edges = E
        self.edges_attributes = attributes

        # Flag to indicate if the graph is directed
        self.isdirected = directed

    def set_edges_attribute(self, edges, values, name=None):
        """Register (or replace) the edge attribute ``name``.

        Every edge of the graph starts at ``None``; the pairs obtained by
        zipping ``edges`` with ``values`` then overwrite those defaults.
        """
        attribute_dict = dict.fromkeys(self.edges, None)
        attribute_dict.update(zip(edges, values))

        # Register once, after the dictionary is complete, so the attribute
        # exists even when ``edges`` is empty.
        self.edges_attributes[name] = attribute_dict

    def extract_subgraph(self, subgraph_nodes):
        """Return the subgraph induced by ``subgraph_nodes``.

        The subgraph is directed iff the original graph is directed.
        Raises KeyError if an attribute has no value for a retained edge.
        """
        # Set membership keeps the edge filter linear in the number of edges
        wanted = set(subgraph_nodes)

        # Keep the edges with both ends among the nodes of the subgraph
        subgraph_edges = [edge for edge in self.edges if edge[0] in wanted and edge[1] in wanted]

        # Restrict every attribute to the retained edges
        subgraph_attributes = {
            attribute: {edge: values[edge] for edge in subgraph_edges}
            for attribute, values in self.edges_attributes.items()
        }

        return graph(subgraph_nodes, subgraph_edges, subgraph_attributes, self.isdirected)

    def remove_edges(self, edges_todel_list):
        """Drop every edge listed in ``edges_todel_list`` from ``self.edges``.

        ``self.edges`` is rebound to a new list; ``edges_attributes`` is left
        untouched, so attribute values of removed edges remain stored.
        """
        edges_todel = set(edges_todel_list)
        self.edges = [edge for edge in self.edges if edge not in edges_todel]

    def get_neighborhood(self, vertex):
        """Return the neighbors of ``vertex`` as a list.

        Directed graph: the heads of the edges leaving ``vertex`` (duplicates
        are kept). Undirected graph: the distinct nodes sharing an edge with
        ``vertex``, in no particular order.
        """
        successors = [edge[1] for edge in self.edges if edge[0] == vertex]
        if self.isdirected:
            return successors

        # not directed case: edges count in both directions
        predecessors = [edge[0] for edge in self.edges if edge[1] == vertex]
        return list(set(successors + predecessors))

    def neighborhood_withedges_onlyundirected(self, vertex):
        """Return ``{neighbor: connecting edge}`` for an undirected graph.

        When both ``(vertex, u)`` and ``(u, vertex)`` are stored, the edge
        ending in ``vertex`` is the one reported. Returns None when the graph
        is directed.
        """
        if self.isdirected:
            return None

        neigborhood_withedges = {edge[1]: edge for edge in self.edges if edge[0] == vertex}
        neigborhood_withedges.update({edge[0]: edge for edge in self.edges if edge[1] == vertex})
        return neigborhood_withedges

    def indegree(self, node):
        """Return the number of edges ending in ``node``.

        Returns None when the graph is undirected.
        """
        if not self.isdirected:
            return None
        return sum(1 for edge in self.edges if edge[1] == node)

    def to_directed(self):
        """Turn the graph into a directed one, in place, and return ``self``.

        Each edge gets its reversed twin, and the twin receives the attribute
        values of the edge it was built from. A graph that is already
        directed is returned unchanged.
        """
        if self.isdirected:
            return self

        new_edges = set(edge[::-1] for edge in self.edges)
        total_edges = list(set(self.edges).union(new_edges))
        self.edges = total_edges

        new_edges = list(new_edges)
        updated_attributes = dict()
        for attribute in self.edges_attributes:
            attr_dict = self.edges_attributes[attribute]
            # Reversed edges inherit the value of their original edge ...
            new_edges_attr = dict({new_edge: attr_dict[new_edge[::-1]] for new_edge in new_edges})
            # ... while values already stored for an edge take precedence
            new_edges_attr.update(attr_dict)
            updated_attributes.update( { attribute: new_edges_attr } )

        self.edges_attributes = updated_attributes
        self.isdirected = True
        return self

    def copy(self):
        """Return a copy whose containers are independent of this graph.

        The node list, the edge list and every attribute dictionary are
        duplicated, so mutating the copy (e.g. ``set_edges_attribute``) does
        not change the original. Nodes, edges and values are not deep-copied.
        """
        attributes_copy = {
            attribute: dict(values)
            for attribute, values in self.edges_attributes.items()
        }
        return graph(list(self.nodes), list(self.edges), attributes_copy, self.isdirected)
    

Define functions to extract top N papers and top N authors.

- Top-N papers: the N papers with the most citations
- Top-N authors: the N authors with the most publications

How to retrieve this information:
- In the collaboration graph (authors' graph), the total number of publications of each author can be retrieved as the sum of the weights of the edges adjacent to the author's node; we store these values, retrieve the top-N authors with a heap structure, then induce a subgraph and work on it.
- In the citation graph (papers' graph), the number of citations of each paper can be retrieved as the in-degree of the node corresponding to it; then heap + top-N + subgraph.

In [4]:
def extract_topN(G, N, flag):
    """Return the N best-ranked nodes of ``G`` according to ``flag``.

    * ``flag == 'papers'``: nodes are ranked by ``G.indegree(node)``
      (number of citations).
    * ``flag == 'authors'``: nodes are ranked by the sum of the 'weigths'
      attribute over the edges reported by
      ``G.neighborhood_withedges_onlyundirected(node)``; an edge without a
      stored weight contributes 0.
    * any other flag: the string ``'invalid flag'`` is returned.

    The result is a list of nodes only, best first.
    """

    def keep_best(scored_nodes):
        # scored_nodes holds (node, score) pairs; keep the N highest scores
        # and strip the scores from the answer
        best_pairs = heapq.nlargest(N, scored_nodes, key=lambda pair: pair[1])
        return [pair[0] for pair in best_pairs]

    def rank_papers():
        # score of a paper = its indegree
        return keep_best([(paper, G.indegree(paper)) for paper in G.nodes])

    def rank_authors():
        # weights of all the edges of the graph
        edge_weights = G.edges_attributes['weigths']

        # {author: {neighbor: edge shared with that neighbor}}
        incident_edges = {
            author: G.neighborhood_withedges_onlyundirected(author)
            for author in G.nodes
        }

        totals = []
        for author in G.nodes:
            shared_edges = incident_edges[author].values()
            # edges missing from the weights dictionary simply add nothing
            publications = sum(edge_weights[e] for e in shared_edges if e in edge_weights)
            totals.append((author, publications))

        return keep_best(totals)

    if flag == 'authors':
        return rank_authors()
    if flag == 'papers':
        return rank_papers()
    return 'invalid flag'


2.3 Shortest path

- On the collaboration graph
- start from the source, end at the sink and pass through all the nodes in the sequence, in order

IDEA: the shortest path that connects all the nodes in the sequence in order is the one that minimizes the number of steps needed to reach each intermediate stop. Indeed, we want to minimize $\sum_P \sum_{e \in E}{\mathbb{1}_{ \left( e \in P \right)}(e)} = \sum_{i=1}^{n-1}  \sum_{e \in E}{\mathbb{1}_{\left( e \in P_i \right)}(e)}$ where $P$ is a path from source to sink that goes through the sequence and $P_i$ is a path that goes from node $a_i$ to $a_{i+1}$ for each $i$; the ordered concatenation of the $P_i$ gives exactly the path $P$ (note that such a decomposition is always possible).

The minimization problem is separable (is that actually the right term?) precisely because the order of the nodes in the sequence must be preserved.

The intermediate shortest paths are computed with Dijkstra's algorithm.

In [5]:
def shortest_path(G, source, sink):
    """Find a minimum-weight path from ``source`` to ``sink`` (Dijkstra).

    The cost of a path is the sum of the 'weigths' attribute of its edges.
    Neighbors and connecting edges are obtained from
    ``G.neighborhood_withedges_onlyundirected``, so ``G`` must be undirected.

    NOTE(review): Dijkstra assumes non-negative edge weights; the weights
    are presumably collaboration counts - confirm against the data.

    Args:
        G: graph exposing ``edges_attributes['weigths']`` and
            ``neighborhood_withedges_onlyundirected(node)``.
        source: node the path starts from.
        sink: node the path must reach.

    Returns:
        A tuple ``(connected, cost, path)``:
        * ``connected`` - True iff ``sink`` is reachable from ``source``;
        * ``cost`` - total weight of the path, ``float('inf')`` if none;
        * ``path`` - list of ``(parent, child)`` steps from ``source`` to
          ``sink``; empty when ``source == sink`` or no path exists.

    Raises:
        KeyError: if a traversed edge has no entry in the 'weigths' attribute.
    """
    unreachable = float('inf')

    # Best known distance from the source; absent nodes are still at infinity
    distances = {source: 0}
    # predecessor[u] = node preceding u on the best path found so far
    predecessor = {}
    # nodes whose distance is final
    settled = set()

    # Heap entries are (distance, insertion counter, node); the counter breaks
    # ties so that nodes themselves never need to be comparable.
    counter = 0
    frontier = [(0, counter, source)]

    while frontier:
        parent_distance, _, parent = heapq.heappop(frontier)

        # Stale entry: the node was already settled with a shorter distance
        if parent in settled:
            continue
        settled.add(parent)

        # The distance of the sink is final as soon as it leaves the heap
        if parent == sink:
            break

        # Retrieve neighborhood and "connecting edges"
        neighborhood_edges = G.neighborhood_withedges_onlyundirected(parent)
        for u, edge in neighborhood_edges.items():
            candidate = parent_distance + G.edges_attributes['weigths'][edge]

            # Relax the edge when it yields a strictly shorter route to u
            if candidate < distances.get(u, unreachable):
                distances[u] = candidate
                predecessor[u] = parent
                counter += 1
                heapq.heappush(frontier, (candidate, counter, u))

    # Sink never reached (also covers a sink that is not part of the graph)
    if sink not in distances:
        return False, unreachable, []

    # Walk the predecessor chain backwards, then put the steps in order
    path = []
    node = sink
    while node != source:
        parent = predecessor[node]
        path.append((parent, node))
        node = parent
    path.reverse()

    return True, distances[sink], path

def shortestpath_sequence(sequence: list, first_node, last_node, N, G_data=collaboration_data):
    """Shortest ordered walk through ``sequence`` among the top-N authors.

    Builds the graph described by ``G_data``, restricts it to the N authors
    selected by ``extract_topN(..., 'authors')`` and chains the shortest
    paths first_node -> sequence[0] -> ... -> sequence[-1] -> last_node.

    Args:
        sequence: intermediate nodes, to be visited in this order.
        first_node: node the walk starts from.
        last_node: node the walk ends at.
        N: number of top authors that induce the subgraph.
        G_data: mapping with keys 'nodes', 'edges', 'weigths' and 'dir'.

    Returns:
        ``(total_cost, total_path)`` where ``total_path`` is the list of
        ``(parent, child)`` steps, or the string "There is no such path."
        when a stop is not among the top-N authors or two consecutive stops
        are not connected in the subgraph.
    """
    # extract graph class instance from the data in input
    G = graph(G_data['nodes'], G_data['edges'], {'weigths': G_data['weigths']}, G_data['dir'])

    # extract top N authors
    topN_list = extract_topN(G, N, 'authors')

    # every stop of the walk: start, intermediate nodes in order, end
    stops = [first_node, *sequence, last_node]

    # A stop outside the top-N subgraph cannot be reached inside it; report
    # it as a missing path instead of failing further down with a KeyError.
    top_authors = set(topN_list)
    if any(stop not in top_authors for stop in stops):
        return "There is no such path."

    # subgraph of the top N authors
    G_N = G.extract_subgraph(topN_list)

    # Initialize path and path cost
    total_path = []
    total_cost = 0

    # Chain the shortest path of each leg (stop_i -> stop_{i+1})
    for start, end in zip(stops, stops[1:]):
        connected, path_cost, path = shortest_path(G_N, start, end)
        if not connected:
            return "There is no such path."

        # Update path and path's cost with the current leg
        total_path += path
        total_cost += path_cost

    return total_cost, total_path

In [6]:
# top 5 authors
top1 = 332422508
top2 = 173839695
top3 = 2111642879
top4 = 2146468246
top5 = 2133227394

In [7]:
# Intermediate stops, to be visited in this order
seq = [top2, top3]

# Walk from top5 to top1 through seq, using N = 5 top authors
shortestpath_sequence(seq, top5, top1, 5)

(22,
 [(2133227394, 173839695), (173839695, 2111642879), (2111642879, 332422508)])