# Precompute

In [1]:
import math
import operator
import random
from operator import itemgetter

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from networkx.algorithms.community import louvain_communities, asyn_fluidc, girvan_newman
from tqdm import tqdm

In [2]:
# Read the tab-separated edge list and give its two columns the names used
# below. The rename expects the header columns '# FromNodeId' and 'ToNodeId';
# columns with other names are left untouched by rename().
df_dblp = pd.read_csv(r"com-dblp.ungraph.txt", sep='\t').rename(
    columns={'# FromNodeId': 'source', 'ToNodeId': 'target'}
)

# Build an undirected networkx graph with one edge per row of the edge list.
graph = nx.from_pandas_edgelist(
    df_dblp,
    source="source",
    target="target",
    create_using=nx.Graph,
)

In [2]:
# For a quick run on a small built-in graph, uncomment the next line:
# graph = nx.karate_club_graph()

In [5]:
# Split the graph into 50 communities with asynchronous fluid communities
# (fixed seed so the partition is repeatable).
landmark_communities = list(asyn_fluidc(graph, k = 50, seed=123))

# Number of vertices in each community.
community_sizes = np.array([len(members) for members in landmark_communities])

# Landmark budget per community: its share of 100 landmarks in proportion to
# its size, rounded up (so every community gets at least one and the total can
# exceed 100).
num_landmarks = [
    math.ceil(100 * size / sum(community_sizes)) for size in community_sizes
]

# Degree centrality of every vertex, stored from highest to lowest.
degr_centrality_ranking = dict(
    sorted(
        nx.degree_centrality(graph).items(),
        key=lambda item: item[1],
        reverse=True,
    )
)

# Empty frame indexed by every vertex of the graph (not filled in this cell).
distance_mapping2 = pd.DataFrame({"vertices": list(graph.nodes())}).set_index("vertices")

# Per-community results, filled by the loop below.
degr_centrality_of_communities = {}
all_landmarks = []
cen_landmark = []

for comm_idx, members in enumerate(tqdm(landmark_communities)):
    # Pair each member with its degree centrality, keeping the community's
    # iteration order so that ties are broken the same way by the stable sort.
    scored = [(node, degr_centrality_ranking.get(node)) for node in members]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Keep only as many top-ranked members as this community's budget allows.
    top_ranked = dict(scored[:num_landmarks[comm_idx]])

    all_landmarks.append(list(top_ranked))
    degr_centrality_of_communities[comm_idx] = top_ranked

 





100%|█████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 299.33it/s]


In [6]:
# Distance between communities, measured between the first landmark of each.
# Column labels are source community indices (as strings); row positions are
# target community indices.
df_cd = pd.DataFrame()
for src_idx, src_landmarks in enumerate(tqdm(all_landmarks)):
    src_node = src_landmarks[0]
    column = []
    for dst_landmarks in all_landmarks:
        column.append(
            nx.shortest_path_length(graph, source=src_node, target=dst_landmarks[0])
        )
    df_cd[str(src_idx)] = column

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 78.35it/s]


In [7]:
# Build one distance table per community. Each table is indexed by the
# community's member vertices (index name 'vertices') and has one column per
# landmark of that community (column label = str(landmark)); each cell holds
# the shortest-path length from that landmark to that member.
#
# The tables are collected in `community_distance_frames`, keyed by community
# index. They are also published as the module-level names df_0, df_1, ...,
# df_<n-1> (n = len(landmark_communities)) so that code referring to a table by
# that name keeps working.
community_distance_frames = {}
for i in tqdm(range(len(landmark_communities))):
    # Freeze the member order once so the index and every column line up.
    members = list(landmark_communities[i])
    frame = pd.DataFrame(
        {
            str(landmark): [
                nx.shortest_path_length(graph, source=landmark, target=m)
                for m in members
            ]
            for landmark in all_landmarks[i]
        },
        index=pd.Index(members, name="vertices"),
    )
    community_distance_frames[i] = frame
    # Plain namespace assignment instead of exec(): same variable names, but no
    # code is assembled from strings.
    globals()[f"df_{i}"] = frame

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [03:20<00:00,  4.02s/it]


# Generate test dataset

In [49]:
from random import sample
# Pool of every vertex in the graph; random test pairs are drawn from it.
l = list(graph)

In [39]:
# Build the test set: NUM_TEST_PAIRS rows of [source, target, distance], where
# source/target are two distinct vertices drawn at random from `l` and distance
# is their exact shortest-path length. Pairs with no connecting path are
# discarded and redrawn.
NUM_TEST_PAIRS = 1000
TEST_SEED = 123  # fixed so that re-running the notebook regenerates the same pairs

# Dedicated generator: the draw does not depend on, or disturb, the global
# random state.
pair_rng = random.Random(TEST_SEED)

tmp = []
while len(tmp) < NUM_TEST_PAIRS:
    a = pair_rng.sample(l, 2)
    try:
        # A single search both checks reachability and yields the distance.
        distance = nx.shortest_path_length(graph, source=a[0], target=a[1])
    except nx.NetworkXNoPath:
        continue
    tmp.append(a + [distance])

In [44]:
# Tabulate the sampled rows: one row per pair, with the default integer index.
df = pd.DataFrame(tmp, columns=['Source', 'Target', 'Distance'])

In [48]:
# Write the test set to disk without the row index column.
df.to_csv("dblp_test.csv", index=False)