In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import operator
from tqdm import tqdm
import math

# import graph data into a pandas dataframe
df_dblp = pd.read_csv(r"com-dblp.ungraph.txt", sep='\t')

# rename columns
df_dblp.rename(columns={'# FromNodeId': 'source', 'ToNodeId': 'target'}, inplace=True)

# create networkx graph
g_dblp = nx.from_pandas_edgelist(df_dblp, "source", "target", create_using = nx.Graph)

In [2]:
def get_graph_statistics(g):
    n_nodes = g.number_of_nodes()
    n_edges = len(list(g.edges()))
    density = nx.density(g)
    avg_clustering = nx.average_clustering(g)

    
    stats = pd.DataFrame()
    stats["name"] = ["n_nodes", "n_edges","density","avg_clustering"]
    stats["values"] = [n_nodes, n_edges, density, avg_clustering]
    
    return stats
    
stats = get_graph_statistics(g_dblp)

## Strategy 1 - Iteratively removing covered nodes

In [3]:
def strategy1(graph, alpha = 0.00005):
    """
    This function selects landmarks and computes the distances from each landmark to every reachable node in the graph.
    
    :param graph (nx.Graph): the networkx graph created from the edgelist
    :param alpha (float): landmark scaling factor. This variable controls the number of landmarks selected at every iteration.
                          The number of landmarks selected is computed as following: N = int(num_remaining_nodes * alpha)

    :returns:
      - distance_mapping (pandas.core.frame.DataFrame): a pandas DataFrame having as index column the list of all nodes in
                                                        the graph. Each column other than the index column is denoted by a
                                                        landmark and contains the distance from the landmark to every other node.
                                                        If a node is not reachable by a landmark, the value in the cell will 
                                                        be NaN.
                                
    """
    
    # get the nodelist
    nodelist = np.array(list(graph.nodes())) # [CODE REVIEW] You don't use this variable anywhere?
    
    # initialize set of landmarks
    landmark_nodes = []

    # rank the nodes based on degree centrality
    ranking = dict(sorted(nx.degree_centrality(graph).items(), key=operator.itemgetter(1),reverse=True))
    
    # [CODE REVIEW] 
    # Why do you convert it to a cit here and then the next line convert it 
    # to a list since you dont use the dict anywhere?

    # create a list of nodes ordered by their degree centrality
    ranking_list = list(ranking)
    

    current_n_landmarks = 0
    
    # this variable will contain the distances between landmarks and all other nodes
    # if a node is not reachable, the distance will be set to NaN
    distance_mapping = pd.DataFrame()
    distance_mapping["vertices"] = list(graph.nodes())
    distance_mapping = distance_mapping.set_index("vertices")
    
    while len(ranking_list) > 0:
        # [CODE REVIEW] What is "u_list"? Variable name isnt very clear, or could clarify with a comment
        u_list = ranking_list[:int(len(ranking_list)*alpha)]
        
        for u in tqdm(u_list):
            # add u to the list of landmark nodes
            landmark_nodes.append(u)
            
            # get the distance from "u" to all other nodes
            shortest_path_lengths = nx.single_source_shortest_path_length(graph, u)
            sp_array = np.array(list(shortest_path_lengths.items()))

            # [CODE REVIEW]
            # If runtime on larger datasets becomes an issue you can try looking at using
            # igraph to do this. It has a faster implementation (in C) of single source shortest path
            # https://igraph.org/python/doc/api/igraph._igraph.GraphBase.html#get_shortest_paths

            
            # get an array of reached nodes
            reached_nodes = list(shortest_path_lengths.keys())
            recorded_distances  = list(shortest_path_lengths.values())
            
            df_u = pd.DataFrame()
            df_u["vertices"] = reached_nodes
            df_u[str(u)] = recorded_distances
            
            distance_mapping = distance_mapping.join(df_u.set_index("vertices"))
            
            # compute the average distance
            average_distance = sp_array[:, 1].mean()

            # get the nodes within average distance
            nodes_in_range = list(sp_array[np.where(sp_array[:,1] <= average_distance)[0], :][:,0])

            # remove the nodes that are within average distance
            updated_ranking_list = set(ranking_list) - set(nodes_in_range)
            ranking_list = list(updated_ranking_list)
                

        new_n_landmarks = len(landmark_nodes)
        
        if new_n_landmarks == current_n_landmarks:
            break
        else:
            current_n_landmarks = new_n_landmarks

    return distance_mapping

In [4]:
distance_mapping = strategy1(g_dblp, alpha = 0.00008)

100%|██████████████████████████████████████████████████████████████████████████████████| 25/25 [00:33<00:00,  1.32s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.23s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.27s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.26s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.32s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.27s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.26s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.35s/it]
100%|███████████████████████████████████

In [51]:
import math
def distance(n1,n2):
    U = (distance_mapping.loc[n1,:]+distance_mapping.loc[n2,:]).min()
    L = (distance_mapping.loc[n1,:]-distance_mapping.loc[n2,:]).abs().max()
    return U,L,(L+U)/2,math.sqrt(L*U)

In [19]:
df_test = pd.read_csv(r'dblp_test.csv',sep=',')

In [62]:
estimate = []
for i in range(1000):
    a = df_test.loc[i,:].tolist()
    estimate.append(distance(a[0],a[1]))

In [66]:
results = pd.DataFrame(estimate,index=None,columns=['U','L','MP','GM'])

In [35]:
from sklearn.metrics import mean_squared_error

In [74]:
for i in results.columns:
    print('MSE',' ',i, ':',mean_squared_error(df_test['Distance'].tolist(),results[i].tolist()))

MSE   U : 1.559
MSE   L : 14.833
MSE   MP : 2.467
MSE   GM : 4.61656641919919
