## graph measures

In [9]:
import networkit
import pandas as pd
import datetime
import pickle
import os.path
import itertools

def change_pickle_protocol(filepath,protocol=2):
    """Re-serialize the pickle stored at *filepath* using another protocol.

    The object is loaded from the file, pickled again with ``protocol`` and
    written back to the same path.

    Args:
        filepath: path of an existing pickle file; it is overwritten in place.
        protocol: pickle protocol number used for the rewritten file
            (defaults to 2).

    Raises:
        Whatever ``pickle.load`` / ``pickle.dumps`` raise (for example
        ``ValueError`` for an unsupported protocol number). In that case the
        original file is left untouched.
    """
    # NOTE: unpickling executes arbitrary code; only use on trusted files.
    with open(filepath,'rb') as f:
        obj = pickle.load(f)
    # Serialize before opening the file for writing: opening with 'wb'
    # truncates it, so a failure during pickling would otherwise destroy
    # the only copy of the data.
    payload = pickle.dumps(obj,protocol=protocol)
    with open(filepath,'wb') as f:
        f.write(payload)

def get_graph_measures(max_cc_gcommon):
    """Compute summary measures of a networkit graph and return them as a dict.

    A timestamp is printed before each group of measures and the computed
    values are printed as they become available.

    Args:
        max_cc_gcommon: the networkit graph to analyse (callers in this file
            pass an undirected graph built from a vertex/edge CSV pair).

    Returns:
        dict with the keys
        ``diameter``, ``eff_diameter``,
        ``communities_num``, ``communities_max_size``, ``communities_avg_size``,
        ``communities_rel_max_size``, ``communities_rel_avg_size``,
        ``communities_modularity``,
        ``max_degc``, ``avg_degc``, ``num_nodes_high_degc``,
        ``rel_num_nodes_high_degc``,
        ``max_btwc``, ``avg_btwc``, ``num_nodes_high_btwc``,
        ``rel_num_nodes_high_btwc``,
        ``max_eigc``, ``avg_eigc``, ``num_nodes_high_eigc``,
        ``rel_num_nodes_high_eigc``.
    """
    summary = {}

    # --- diameter -----------------------------------------------------------
    print(str(datetime.datetime.now()))
    diam = networkit.distance.Diameter(max_cc_gcommon, networkit.distance.DiameterAlgo.Exact).run().getDiameter()
    # getDiameter() returns an indexable result; only its first element is kept.
    print("diam", diam[0])
    summary["diameter"] = diam[0]

    # --- effective diameter -------------------------------------------------
    print(str(datetime.datetime.now()))
    eff_diam_max_cc = networkit.distance.EffectiveDiameter(max_cc_gcommon).run().getEffectiveDiameter()
    print("eff_diam", eff_diam_max_cc)
    summary["eff_diameter"] = eff_diam_max_cc

    # --- communities --------------------------------------------------------
    print(str(datetime.datetime.now()))
    communities = networkit.community.detectCommunities(max_cc_gcommon)
    modularity = networkit.community.Modularity(max_cc_gcommon).getQuality(communities, max_cc_gcommon)
    # Query the partition once instead of once per derived value.
    community_sizes = communities.subsetSizes()
    num_communities = communities.numberOfSubsets()
    num_elements = communities.numberOfElements()
    max_community_size = max(community_sizes)
    avg_community_size = 1.0*sum(community_sizes)/len(community_sizes)
    print("communities (num, max, avg)", num_communities, max_community_size, avg_community_size)
    print("modularity", modularity)
    summary["communities_num"] = num_communities
    summary["communities_max_size"] = max_community_size
    summary["communities_avg_size"] = avg_community_size
    summary["communities_rel_max_size"] = 1.0*max_community_size/num_elements
    summary["communities_rel_avg_size"] = avg_community_size/num_elements
    summary["communities_modularity"] = modularity

    # --- degree centrality (normalized) -------------------------------------
    print(str(datetime.datetime.now()))
    dc = sorted(networkit.centrality.DegreeCentrality(max_cc_gcommon, normalized=True).run().scores(), reverse=True)
    # A node counts as "high" when its score is at least 10% of the maximum.
    # Counting directly avoids the previous off-by-one, where the first node
    # *below* the threshold was included in the count.
    high_dc_threshold = dc[0]*0.1
    num_nodes_high_dc = sum(1 for score in dc if score >= high_dc_threshold)
    avg_dc = 1.0*sum(dc)/len(dc)
    rel_num_nodes_high_dc = 1.0*num_nodes_high_dc/len(dc)
    print("degree_centrality (max, avg, num_high, rel_num_high)", dc[0], avg_dc, num_nodes_high_dc, rel_num_nodes_high_dc)
    summary["max_degc"] = dc[0]
    summary["avg_degc"] = avg_dc
    summary["num_nodes_high_degc"] = num_nodes_high_dc
    summary["rel_num_nodes_high_degc"] = rel_num_nodes_high_dc

    # --- betweenness centrality (approximation) -----------------------------
    print(str(datetime.datetime.now()))
    btw = networkit.centrality.ApproxBetweenness(max_cc_gcommon).run()
    btwc = sorted(btw.scores(), reverse = True)
    summary["max_btwc"] = btwc[0]
    summary["avg_btwc"] = 1.0*sum(btwc)/len(btwc)
    # "high" here means any strictly positive score
    high_btwc = sum(1 for score in btwc if score>0)
    summary["num_nodes_high_btwc"] = high_btwc
    summary["rel_num_nodes_high_btwc"] = 1.0*high_btwc/len(btwc)
    print("betwenness centrality (max, avg, high, rel_high)", summary["max_btwc"], summary["avg_btwc"], summary["num_nodes_high_btwc"], summary["rel_num_nodes_high_btwc"])

    # --- eigenvector centrality ---------------------------------------------
    print(str(datetime.datetime.now()))
    eig = networkit.centrality.EigenvectorCentrality(max_cc_gcommon).run()
    eigc = sorted(eig.scores(), reverse = True)
    summary["max_eigc"] = eigc[0]
    summary["avg_eigc"] = 1.0*sum(eigc)/len(eigc)
    # "high" here means a score above the fixed cut-off 0.01
    high_eigc = sum(1 for score in eigc if score>0.01)
    summary["num_nodes_high_eigc"] = high_eigc
    summary["rel_num_nodes_high_eigc"] = 1.0*high_eigc/len(eigc)
    print("eigenvector centrality (max, avg, high, rel_high)", summary["max_eigc"], summary["avg_eigc"], summary["num_nodes_high_eigc"], summary["rel_num_nodes_high_eigc"])
    return summary
    

## example

In [None]:
# Load a graph (a graphlab.SGraph saved as CSV, per the original note) from
# its vertices.csv / edges.csv pair and compute its summary measures.
# Placeholder: set this to the directory holding the two CSV files
# (with "" the files are looked up as "/vertices.csv" and "/edges.csv").
path_to_largest_connected_component = ""
df_v = pd.read_csv(path_to_largest_connected_component+"/vertices.csv")
df_e = pd.read_csv(path_to_largest_connected_component+"/edges.csv")
# Map the original vertex ids to consecutive node indices 0..n-1.
transf_id = {vertex_id: idx for idx, vertex_id in enumerate(df_v["__id"])}
max_cc = networkit.graph.Graph(len(df_v["__id"]), directed=False)
for src_id, dst_id in zip(df_e["__src_id"], df_e["__dst_id"]):
    max_cc.addEdge(transf_id[src_id], transf_id[dst_id])
summary = get_graph_measures(max_cc)

# Store the measures as a one-row DataFrame whose row label is "summary".
# Built directly from the Series because DataFrame.append no longer exists
# in pandas >= 2.0.
ss = pd.Series(summary, name="summary")
df_main = ss.to_frame().T
summ_ntwk = "path_to_new_pickle_file.pckl"
df_main.to_pickle(summ_ntwk)
# Rewrite the pickle with protocol 2 (the default of change_pickle_protocol).
change_pickle_protocol(summ_ntwk)

In [7]:
def main():
    """Compute graph measures for every listed city and store them in a pickle.

    City ids are read from two ``.lst`` files (one id per line) plus the
    hard-coded id "54". For each city the vertex/edge CSV files under
    ``common_graphs_and_max_ccs`` are loaded into an undirected networkit
    graph, ``get_graph_measures`` is run on it, and the result is added as a
    row to the summary DataFrame, which is written to
    ``summary_networkit___.pckl`` after every city. Cities whose ``city_id``
    is already present in an existing summary pickle are skipped, so an
    interrupted run can be resumed.

    A failure while processing one city is reported on stdout and the loop
    moves on to the next city.
    """
    period = ""
    summ_ntwk = "summary_networkit___.pckl"

    upfolder_cmn_and_ccs = "common_graphs_and_max_ccs"
    df_main = pd.DataFrame()
    if os.path.exists(summ_ntwk):
        df_main = pd.read_pickle(summ_ntwk)

    with open("protest_small_cities_unique_names_ids.lst", "r") as f:
        cities_prot = [line.strip() for line in f]
    with open("nonprotest_small_cities_unique_names_weather_ids.lst", "r") as f:
        cities_non_prot_w = [line.strip() for line in f]
    cities_non_prot_nonw = ["54"]
    cities = list(itertools.chain(cities_prot, cities_non_prot_w, cities_non_prot_nonw))

    # Ids already present in the stored summary; built once so the per-city
    # check is a set lookup.
    processed_ids = set()
    if "city_id" in df_main:
        processed_ids = {int(value) for value in df_main["city_id"].dropna()}

    for city in cities:
        try:
            print(city)
            city_str = str(city)
            # Check the id of the *current* city (not the summary of a
            # previously processed one) against what is already stored.
            city_id = int(city)
            if city_id in processed_ids:
                continue
            print("analysis of the city %s" % city_str)
            print(str(datetime.datetime.now()))
            city_dir = upfolder_cmn_and_ccs+"/max_cc_comn_"+period+("city_%s" % city_str)
            df_v = pd.read_csv(city_dir+"/vertices.csv")
            df_e = pd.read_csv(city_dir+"/edges.csv")
            # Map the original vertex ids to consecutive node indices 0..n-1.
            transf_id = {vertex_id: idx for idx, vertex_id in enumerate(df_v["__id"])}
            max_cc_gcommon = networkit.graph.Graph(len(df_v["__id"]), directed=False)
            for src_id, dst_id in zip(df_e["__src_id"], df_e["__dst_id"]):
                max_cc_gcommon.addEdge(transf_id[src_id], transf_id[dst_id])
            print("graph was loaded, city %s" % city_str)
            summary = get_graph_measures(max_cc_gcommon)
            summary["city_id"] = city_id

            print(str(datetime.datetime.now()))
            ss = pd.Series(summary, name="summary")
            # DataFrame.append no longer exists in pandas >= 2.0; add the
            # Series as a one-row frame instead.
            row = ss.to_frame().T
            df_main = row if df_main.empty else pd.concat([df_main, row])
            processed_ids.add(city_id)

            # Persist after every city so that progress survives a crash.
            df_main.to_pickle(summ_ntwk)
            change_pickle_protocol(summ_ntwk)

        except Exception as err:
            # Best effort: report the problem and go on with the next city.
            print("city %s failed: %r" % (city, err))
            continue
            

In [None]:
# Run the per-city analysis defined in main().
main()