In [1]:
import os
import math
import time
import random
import numpy as np
import pandas as pd
import networkx as nx
import scipy.stats as stats
from itertools import combinations
from Robustness import *

### Global and local graph measures

In [2]:
#------------------------------------------------------------------------
## Function for creating the global and local graph measures
#------------------------------------------------------------------------

def Measure(file):
    '''
    Computes the local (per-node) and global measures of one real network and saves them as text files.

    The edge list is read from '../Data/Real_Networks/{file}/{file}_Edge.txt' (tab separated, no header,
    columns 0 and 1 hold the two end points of an edge) and a networkx graph is built from it.

    Inputs: file: Name of the real network (also the name of its folder and the prefix of its files)
    Outputs: Returns nothing. Creates tab separated text files of Node, Degree, ClusteringCoefficient,
    BetweennessCentrality, EigenVectorCentrality, ClosenessCentrality and Global measures in the folder of the network
    '''
    path = '../Data/Real_Networks/'

    f = pd.read_csv(path + f'{file}/{file}_Edge.txt', sep = '\t',header = None)
    G = nx.from_pandas_edgelist(f,0,1)
    print(G)
    N = pd.DataFrame(G.nodes())
    N.to_csv(path + f'{file}/{file}_Node.txt', sep = '\t', header = None, index = None)

    ## Node Measures: every file has the node name in column 0 and the value in column 1
    Deg = nx.degree(G)
    dfD = pd.DataFrame(Deg)
    dfD.to_csv(path + f'{file}/{file}_Degree.txt', sep = '\t', index = None,header = None)

    CC = nx.clustering(G)
    dfC = pd.DataFrame(list(CC.items()))
    dfC.to_csv(path + f'{file}/{file}_ClusteringCoefficient.txt', sep = '\t', index = None,header = None)

    BC = nx.betweenness_centrality(G, seed = 1)
    dfB = pd.DataFrame(list(BC.items()))
    dfB.to_csv(path + f'{file}/{file}_BetweennessCentrality.txt', sep = '\t', index = None,header = None)

    EVC = nx.eigenvector_centrality(G, max_iter=5000)
    dfE = pd.DataFrame(list(EVC.items()))
    dfE.to_csv(path + f'{file}/{file}_EigenVectorCentrality.txt', sep = '\t', index = None,header = None)

    CloseC = nx.closeness_centrality(G)
    dfCloseC = pd.DataFrame(list(CloseC.items()))
    dfCloseC.to_csv(path + f'{file}/{file}_ClosenessCentrality.txt', sep = '\t', index = None,header = None)

    print(f'Node Measure done for {file}.')

    ## Global Measure
    # Everything is computed before the output file is opened, so a failure in one of the
    # (slow) computations cannot leave a half written, header-only file behind.
    NN = G.number_of_nodes()
    EE = G.number_of_edges()
    largest_cc = max(nx.connected_components(G), key=len)
    G_lcc = G.subgraph(largest_cc)
    # Only the size of the component is needed, so node names do not have to be integers here
    fr_lcc = len(largest_cc)/NN
    ave_deg = np.mean(dfD[1])
    # With fewer than two nodes there is no possible pair of nodes; report 0 instead of dividing by zero
    edge_density = 2*EE/(NN*(NN-1)) if NN > 1 else 0.0
    # Shortest path lengths are only defined inside a connected graph, hence the restriction to the lcc
    mean_spl = nx.average_shortest_path_length(G_lcc)

    # The 'with' block closes the file even if a write fails
    with open(path + f'{file}/{file}_Global.txt','w') as outfile:
        outfile.write('Number_of_nodes\tNumber_of_edges\tFraction_of_nodes_inlcc\tAverage_degree\tEdge_density\tMean_shortest_path_length')
        outfile.write('\n')
        outfile.write(f'{NN}\t{EE}\t{fr_lcc}\t{ave_deg}\t{edge_density}\t{mean_spl}')
    print(f'Global Measure done for {file}.')
    

### Functions for creating text files of the node measures restricted to the largest connected component (LCC)

In [2]:
#-------------------------------------------------------------------------------------------------------
## Functions for creating text files of the node measures restricted to the largest connected component (LCC)
#-------------------------------------------------------------------------------------------------------


def Node_minimum(nodefile,edgefile):
    '''
    Summarises, for every node, the curvature values of the edge rows that belong to it.

    Inputs: nodefile: iterable of node names (the caller passes one column of the node dataframe),
    edgefile: dataframe of edgefile; column 0 is compared with the node name and column 2 holds the curvature value,
    Outputs: dataframe with four columns; nodes: node name, minimum: Minimum curvature value,
    sum: Sum of the curvature values, average: Average of the curvature values.
    A node without any matching row gets NaN in the three value columns.
    '''
    LISTS = {'nodes':[], 'minimum':[], 'sum':[], 'average':[]}
    for node in list(nodefile):
        # NOTE(review): only column 0 of the edge table is compared with the node, so this presumably
        # expects every edge to be listed once per end point -- confirm against the curvature files.
        result_list = edgefile.loc[edgefile[0] == node, 2].tolist()
        LISTS['nodes'].append(node)
        if result_list:
            LISTS['minimum'].append(min(result_list))
            LISTS['sum'].append(sum(result_list))
            LISTS['average'].append(np.mean(result_list))
        else:
            # min() of an empty list raises ValueError; record the node as "no data" instead
            LISTS['minimum'].append(np.nan)
            LISTS['sum'].append(np.nan)
            LISTS['average'].append(np.nan)
    return pd.DataFrame(LISTS)

def LCC(edgelist):
    '''
    Input: pandas edgelist whose columns 0 and 1 hold the two end points of every edge
    Output: List of the nodes (converted to int) that belong to the largest connected component
    '''
    graph = nx.from_pandas_edgelist(edgelist, 0, 1)
    # Among all connected components keep the one with the most nodes
    biggest_component = max(nx.connected_components(graph), key=len)
    return [int(node) for node in biggest_component]

def nodes_measure_lcc(file,measure,lcc):
    '''
    Loads one node measure of a real network and keeps only the nodes of the largest connected component.

    Inputs: file: real network's name,
    measure: Name of the graph or curvature measure,
    lcc: List of nodes present in the Largest connected component,
    Output: Pandas dataframe with nodes name in first column and corresponding measure's value in the second column
    (for the measures computed through Node_minimum the dataframe has the columns nodes, minimum, sum, average)

    Relies on the notebook level list Graph_measure, which has to be defined before the function is called.
    '''
    # Same data folder as in Measure() and correlation(). It is kept local so that the result does not
    # depend on whatever a notebook cell last assigned to the global variable 'path'.
    path = '../Data/Real_Networks/'
    prefix = path + f'{file}/{file}_{measure}'

    # Classical graph measures: written by Measure() as '<file>_<measure>.txt'
    if measure in Graph_measure:
        df1 = pd.read_csv(prefix + '.txt', sep = '\t', header = None)
        return df1[df1[0].isin(lcc)]

    # Curvatures that already come as one value per node
    if measure == 'BakryEmery' or measure == 'Ollivier':
        suffix = '_node.txt' if measure == 'BakryEmery' else '_node_min.txt'
        df1 = pd.read_csv(prefix + suffix, sep = '\t', header = None)
        df1[0] = df1[0].astype(int)
        return df1[df1[0].isin(lcc)]

    # Remaining measures: the node values are derived from the edge values with Node_minimum
    # and saved as '<file>_<measure>_node_min.txt' before being restricted to the lcc
    df1 = pd.read_csv(prefix + '_Node.txt', sep = '\t', header = None)
    dfe = pd.read_csv(prefix + '_Edge.txt',sep = '\t', header = None)
    df1[0] = df1[0].astype(int)
    df2 = Node_minimum(df1[0],dfe)
    df2.to_csv(prefix + '_node_min.txt', sep = '\t', header = None, index = None)
    return df2[df2['nodes'].isin(lcc)]
        
        
#-------------------------------------------------------------------------------------------------------
## Calculates the Spearman and Pearson correlation between two measures
#-------------------------------------------------------------------------------------------------------

def correlation(file,measure1,measure2):
    '''
    Calculates the Spearman and Pearson correlation between two vertex measures of one real network.

    Both measures are read from '../Data/Real_Networks/{file}/{file}_{measure}_nodelcc.txt'
    (column 0: node name, column 1: value). The values of measure2 are put in the node order of measure1.

    Inputs: file: real network's name, measure1: first vertex measure, measure2: second vertex measure;
    Outputs: tuple (Spearman correlation, Pearson correlation). Also creates the text file
    'Correlation_Example/{file}_corr_{measure1}&{measure2}_nodelcc.txt' with the two aligned value columns.
    If the two files do not have the same number of rows a message is printed and (nan, nan) is returned.
    A node of measure1 that is missing in measure2 raises KeyError.
    '''
    path = '../Data/Real_Networks/'
    df1 = pd.read_csv(path + f'{file}/{file}_{measure1}_nodelcc.txt', sep = '\t', header = None)
    k1 = list(df1[0])
    v1 = list(df1[1])

    df2 = pd.read_csv(path + f'{file}/{file}_{measure2}_nodelcc.txt', sep = '\t', header = None)
    k2 = list(df2[0])
    v2 = list(df2[1])

    if len(k1) != len(k2):
        print(measure1,measure2,'\nLength of the two input lists are not same')
        # Same shape as the normal return value, so that a caller reading result[0] and result[1]
        # stores "no value" instead of the names of the measures.
        return float('nan'), float('nan')

    data_dict1 = dict(zip(k1, v1))
    data_dict2 = dict(zip(k2, v2))

    # Align the second measure on the node order of the first one
    set1 = list(data_dict1.values())
    set2 = [data_dict2[i] for i in data_dict1]
    df = pd.DataFrame({1: set1, 2: set2})

    # exist_ok avoids the separate existence check (and the error when the folder appears in between)
    os.makedirs(path + f'{file}/Correlation_Example', exist_ok = True)
    df.to_csv(path + f'{file}/Correlation_Example/{file}_corr_{measure1}&{measure2}_nodelcc.txt', sep = '\t', header = None, index = None)
    print(f'{measure1} & {measure2} Done!')

    # Only the coefficients are reported; the p-values are discarded
    spear, _ = stats.spearmanr(set1, set2)
    pear, _ = stats.pearsonr(set1, set2)

    return spear, pear


In [4]:
# Names of the vertex measures used below: classical graph measures and curvature measures
Graph_measure = [
    'Degree',
    'ClusteringCoefficient',
    'BetweennessCentrality',
    'EigenVectorCentrality',
    'ClosenessCentrality',
]
Curv_measure = [
    'BakryEmery',
    'Forman',
    'AugForman',
    'Ollivier',
]
all_measures = [*Curv_measure, *Graph_measure]

# Every entry of the data folder without a '.' in its name is listed as a network
path = '../Data/Real_Networks/'
Networks = [entry for entry in os.listdir(path) if '.' not in entry]
print(len(Networks))

# The list found above is replaced: the cells below only run on this one network
Networks = ['ArenasEmail']

20


In [5]:
#----------------------------------------------------------------------------
## Call the function "Measure" to generate the local and global graph measures
#----------------------------------------------------------------------------

for file in Networks:
    # Time the whole computation (all node measures and the global measures) per network
    t1 = time.time()
    infile = Measure(file)
    elapsed = time.time() - t1
    print(f'------------------ {file} Done!   Total time: ', elapsed, '--------------------------------------')

Graph with 1133 nodes and 5451 edges
Node Measure done for ArenasEmail.
Global Measure done for ArenasEmail.
------------------ ArenasEmail Done!   Total time:  23.33251714706421 --------------------------------------


In [6]:
#-----------------------------------------------------------
## Creates text files of the node measures restricted to the nodes in the LCC
#----------------------------------------------------------

for file in Networks:
    network_dir = path + f'{file}/'
    p1 = os.listdir(network_dir)
    # The largest connected component is taken from the edge list of the network
    df = pd.read_csv(network_dir + f'{file}_Edge.txt', sep = '\t',header = None)
    lcc = LCC(df)
    for measure in all_measures:
        # One '<file>_<measure>_nodelcc.txt' per measure, holding only the nodes of the lcc
        lccmeasure = nodes_measure_lcc(file,measure,lcc)
        lccmeasure.to_csv(network_dir + f'{file}_{measure}_nodelcc.txt', sep = '\t', header = None, index = None)
        print(f'Done for {measure}')
    print(f'----------------------------- Done for {file} ------------------------------------------------')

Done for BakryEmery
Done for Forman
Done for AugForman
Done for Ollivier
Done for Degree
Done for ClusteringCoefficient
Done for BetweennessCentrality
Done for EigenVectorCentrality
Done for ClosenessCentrality
----------------------------- Done for ArenasEmail ------------------------------------------------


### Calculates Correlation between two vertex measures

In [7]:
#-----------------------------------------------------------------------
## Creates text files of correlation between two vertex measures
#-----------------------------------------------------------------------

# Reference measure: every other measure is correlated with it
m1 = 'BakryEmery'
for file in Networks:
    corr = {'name':[],'Spearman':[],'Pearson':[]}
    # Walk through all_measures in its own order. Going through a set of strings instead gives an
    # order that can differ from one kernel start to the next, so the rows of the output file moved around.
    # (To correlate every pair of measures, loop over combinations(all_measures, 2) instead.)
    for m2 in [m for m in all_measures if m != m1]:
        cor = correlation(file,m1,m2)
        corr['name'].append(f'{m1} & {m2}')
        corr['Spearman'].append(cor[0])
        corr['Pearson'].append(cor[1])
    # The folder Correlation_Example is created by correlation()
    df = pd.DataFrame(corr)
    df.to_csv(path + f'{file}/Correlation_Example/{file}_correlation.txt',sep = '\t', index = None )
    print(f'----------------------------- Done for {file} ------------------------------------------------')

BakryEmery & AugForman Done!
BakryEmery & ClosenessCentrality Done!
BakryEmery & EigenVectorCentrality Done!
BakryEmery & Ollivier Done!
BakryEmery & Forman Done!
BakryEmery & ClusteringCoefficient Done!
BakryEmery & BetweennessCentrality Done!
BakryEmery & Degree Done!
----------------------------- Done for ArenasEmail ------------------------------------------------


### Robustness of the vertices

In [None]:
Graph_measure = [
    'Degree',
    'ClusteringCoefficient',
    'BetweennessCentrality',
    'EigenVectorCentrality',
    'ClosenessCentrality',
]
Curv_measure = [
    'BakryEmery',
    'Forman',
    'AugForman',
    'Ollivier',
]
ALL_Measures = [*Curv_measure, *Graph_measure, 'Random']

name = 'BakryEmery'
# NOTE: from here on 'path' is the folder of the ArenasEmail network, not the folder of all networks.
# Run the cell that defines 'path' again before re-running any of the cells above.
path = '../Data/Real_Networks/ArenasEmail/'
if not os.path.exists(path + 'Robustness_Example'):
    os.mkdir(path + 'Robustness_Example')

# Graph from the edge list (end points as int), plus every node named in the node file
nodefile = pd.read_csv(path + 'ArenasEmail_Node.txt', sep = '\t', header = None)
edgefile = pd.read_csv(path + 'ArenasEmail_Edge.txt', sep = '\t', header = None).astype(int)
G = nx.from_pandas_edgelist(edgefile, 0, 1)
G.add_nodes_from(nodefile[0])


# Value of the measure per node: column 0 of the file is the key, column 1 the value
df = pd.read_csv(path + 'ArenasEmail_BakryEmery_node.txt', sep = '\t', header = None)
measure_value = {df[0][row]: df[1][row] for row in range(len(df))}
sort_order = False

#------------------------------------------------------------------------------------------
# Call the function 'Robustness_node' to calculate the robustness
#
# If name in Graph_measure: sort_order = True
# If name in Curv_measure:  sort_order = False
#
# For random vertex removal, call the function 'Robustness_random'
#
# Both functions presumably come from 'from Robustness import *' -- they are not defined in this notebook.
#------------------------------------------------------------------------------------------

Robust = Robustness_node(measure_value, G, sort_order)

# The result is transposed so that its two entries become the two named columns of the output file
df2 = pd.DataFrame(Robust)
Robustfile = df2.transpose()
Robustfile.columns = ['Fraction_of_nodes', 'Efficiency']
Robustfile.to_csv(path + 'Robustness_Example/Robustness_BakryEmery.txt', sep = '\t',index = None)
print(' --------------------- Done for',name,'----------------------------------------------')