## graph measures

In [None]:
#scipy.sparse.csgraph
import graphlab
from graphlab import SGraph
graphlab.canvas.set_target('ipynb')
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pandas as pd
import os.path
import itertools

def get_graph_measures(mgraph, top_in_degree_num=50, top_pagerank_num=100,
                       visualisation=True):
    """Compute a set of structural measures for a GraphLab ``SGraph``.

    Parameters
    ----------
    mgraph : graphlab.SGraph
        Graph to analyse.
    top_in_degree_num : int, optional
        How many vertex ids (highest in-degree first) to return in
        ``suppl["top_in_degree"]``. Defaults to 50.
    top_pagerank_num : int, optional
        How many vertex ids (highest pagerank first) to return in
        ``suppl["top_pagerank"]``. Defaults to 100.
    visualisation : bool, optional
        When true (the default), show a bar chart of the number of
        connected components of each size via ``plt.show()``.

    Returns
    -------
    (summary, suppl) : tuple of dict
        ``summary`` maps measure names to scalar values: ``num_nodes``,
        ``in_degree_avg``, ``in_degree_max``, ``degree_avg_undirected``,
        ``degree_max_undirected``, ``upper_bound_num_colors_shares``,
        ``max_cc_size``, ``max_cc_relative_size``,
        ``num_triangles_in_max_cc``, ``relnum_nodes_form_triangles_in_cc``,
        ``num_cores``, ``rel_max_core_size`` and ``avg_clust_coef``.
        ``suppl`` holds the larger by-products: ``top_in_degree``,
        ``top_pagerank`` and ``max_cc`` (the subgraph returned by
        ``get_neighborhood`` around the largest component's vertices).
    """
    summary = {}
    suppl = {}
    summary["num_nodes"] = int(len(mgraph.vertices))

    def in_degree(graph, top_num):
        """Return the degree-counting model and the ids of the ``top_num``
        vertices when sorted by in-degree with ``ascending=False``."""
        dgcount = graphlab.degree_counting.create(graph)
        dgc_vert = dgcount['graph'].vertices.sort("in_degree", False)
        top_set = dgc_vert["__id"][0:top_num]
        return dgcount, top_set

    dgcount, top_set_dg = in_degree(mgraph, top_num=top_in_degree_num)

    def convert_graphlab_to_nx(graph, di):
        """Copy vertices and edges of ``graph`` into a NetworkX graph
        (``DiGraph`` when ``di`` is true, otherwise ``Graph``)."""
        nxgraph = nx.DiGraph() if di else nx.Graph()
        nxgraph.add_nodes_from(list(graph.vertices['__id']))
        nxgraph.add_edges_from([(e['__src_id'], e['__dst_id']) for e in graph.edges])
        return nxgraph

    nxmgraph = convert_graphlab_to_nx(mgraph, di=False)
    # dict() keeps this working both when degree() returns a plain dict and
    # when it returns a view that iterates over (node, degree) pairs.
    nxdeg = dict(nxmgraph.degree())

    in_degrees = dgcount["graph"].vertices["in_degree"]
    summary["in_degree_avg"] = 1.0 * sum(in_degrees) / len(in_degrees)
    summary["in_degree_max"] = max(in_degrees)
    summary["degree_avg_undirected"] = 1.0 * sum(nxdeg.values()) / len(nxdeg)
    summary["degree_max_undirected"] = max(nxdeg.values())
    suppl["top_in_degree"] = top_set_dg

    def pagerank(graph, top_num):
        """Return the pagerank model and the ids of the ``top_num`` vertices
        when sorted by pagerank with ``ascending=False``."""
        pr = graphlab.pagerank.create(graph)
        pr_sort = pr['pagerank'].sort("pagerank", False)
        top_set = pr_sort["__id"][0:top_num]
        return pr, top_set

    prank, top_pagerank = pagerank(mgraph, top_num=top_pagerank_num)
    suppl["top_pagerank"] = top_pagerank

    gcolor = graphlab.graph_coloring.create(mgraph)
    # NOTE(review): the "_shares" suffix is hard-coded although this function
    # is applied to every collection; kept because callers and saved summary
    # files depend on this exact key - confirm before renaming.
    summary["upper_bound_num_colors_shares"] = gcolor.num_colors

    def connected_components(graph, visualisation=False):
        """Return the connected-components model, a subgraph built around the
        largest component, and that component's size."""
        cc = graphlab.connected_components.create(graph)
        cc_sort = cc.component_size.sort("Count", False)
        if visualisation:
            # Histogram: how many components exist for each component size.
            cc_grouped = cc_sort.groupby("Count", {"total": graphlab.aggregate.COUNT})
            cc_grouped = cc_grouped.sort("Count")
            y_pos = range(cc_grouped.num_rows())
            plt.barh(y_pos, cc_grouped["total"], align='center', alpha=0.8)
            plt.yticks(y_pos, cc_grouped["Count"])
            plt.xlabel("number of components")
            plt.title("number of components of each size")
            plt.show()
        # cc_sort is ordered largest-first, so row 0 is the biggest component.
        max_component_size = cc_sort["Count"][0]
        # NOTE(review): this writes a column onto the graph passed in by the
        # caller and presumably relies on both vertex frames sharing the same
        # row order - confirm.
        graph.vertices['component_id'] = cc['graph'].vertices['component_id']
        targets = cc["graph"].get_vertices(
            fields={"component_id": cc_sort["component_id"][0]})["__id"]
        max_component = graph.get_neighborhood(ids=targets, radius=1, full_subgraph=True)
        return cc, max_component, max_component_size

    cc, max_cc, max_cc_size = connected_components(mgraph, visualisation=visualisation)
    suppl["max_cc"] = max_cc
    summary["max_cc_size"] = max_cc_size
    summary["max_cc_relative_size"] = 1.0 * max_cc_size / mgraph.summary()['num_vertices']

    tc = graphlab.triangle_counting.create(max_cc)
    tc_num = tc['num_triangles']
    tcount = tc['triangle_count']
    tcount_not_in_triangles = tcount.filter_by(0, "triangle_count")
    summary["num_triangles_in_max_cc"] = tc_num
    # NOTE(review): despite the "relnum" prefix this is an absolute count of
    # vertices with a non-zero triangle count, not a ratio - confirm intent.
    summary["relnum_nodes_form_triangles_in_cc"] = (
        tcount.num_rows() - tcount_not_in_triangles.num_rows())

    def k_core(graph):
        """Return the largest core id found and the number of vertices that
        carry that core id."""
        kc = graphlab.kcore.create(graph, 0, 1000)
        kcore_id = kc['core_id']
        max_k = max(kcore_id["core_id"])
        # Vectorised logical filter instead of indexing the frame row by row.
        in_max_core = kcore_id[kcore_id["core_id"] == max_k]
        return max_k, in_max_core.num_rows()

    max_k, max_core_size = k_core(mgraph)
    summary["num_cores"] = max_k
    summary["rel_max_core_size"] = 1.0 * max_core_size / max_cc_size

    nx_max_cc = convert_graphlab_to_nx(max_cc, di=False)
    avg_clust = nx.average_clustering(nx_max_cc)
    summary["avg_clust_coef"] = avg_clust

    return summary, suppl


## example

In [None]:
# Location of a saved SGraph; must be filled in before running this cell.
path = ""
mgraph = graphlab.load_sgraph(path)
summary, suppl = get_graph_measures(mgraph)
# Wrap each scalar in a one-element list so the dict describes a single-row
# table; the dict is updated in place, keeping its key order untouched.
for measure, value in list(summary.items()):
    summary[measure] = [value]
sf = graphlab.SFrame(summary)
# Bare expression: shows the frame as the notebook cell output.
sf

## combine with NetworKit results

In [None]:
import cPickle
import graphlab
from graphlab import SGraph
graphlab.canvas.set_target('ipynb')
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pandas as pd

# Merge the GraphLab summary with the NetworKit summary and export the result.
# Inputs: the SFrame saved under `summary_name` and the pickled pandas
# DataFrame in `summ_ntwk`. Outputs: an SFrame saved under `save_name` plus
# `save_name`.csv and `save_name`.xls.
summary_name = "summary_main"
summ_ntwk = "summary_networkit.pckl"
save_name = "small_cities_graph_measures"

sf_main = graphlab.load_sframe(summary_name)
#cities = list(line.strip() for line in (open("protest_small_cities_unique_names_ids.lst", "r")))
#sf_main["protest"] = [1 if (str(sf_main["city_id"][i]) in cities) else 0 for i in xrange(sf_main.num_rows())]
#sf_main.save(summary_name)
# NOTE(review): read_pickle executes pickle data - only load trusted files.
df_main = pd.read_pickle(summ_ntwk)

# Cast city_id to int before the join (presumably so it matches the type of
# the city_id column in sf_main - confirm).
df_main["city_id"] = df_main["city_id"].astype(int)
sf_main1 = graphlab.SFrame(data=df_main)
# No join key is given, so the join uses GraphLab's default key selection.
sf_main = sf_main.join(sf_main1, how = "inner")
# Sort by "protest" descending, then by city_id ascending.
sf_main = sf_main.sort([("protest", False),("city_id", True)])
sf_main.save(save_name)
df_main = sf_main.to_dataframe()
# Column order used for the exported CSV/XLS files.
# NOTE(review): the num_nodes_in_* names differ from the num_nodes_<collection>
# keys built in main(); presumably they come from the NetworKit summary - verify.
cols = ['protest','city_id','city_name','city_pop_stata','num_nodes_in_common','num_nodes_in_comments','num_nodes_in_likes','num_nodes_in_shares','diameter','eff_diameter','in_degree_avg_common','in_degree_avg_comments','in_degree_avg_likes','in_degree_avg_shares','in_degree_max_common','in_degree_max_comments','in_degree_max_likes','in_degree_max_shares','degree_avg_undirected_common','degree_avg_undirected_comments','degree_avg_undirected_likes','degree_avg_undirected_shares','degree_max_undirected_common','degree_max_undirected_comments','degree_max_undirected_likes','degree_max_undirected_shares','num_nodes_from_top100_indegree_in_each_collection','max_cc_size_common','max_cc_size_comments','max_cc_size_likes','max_cc_size_shares','max_cc_relative_size_common','max_cc_relative_size_comments','max_cc_relative_size_likes','max_cc_relative_size_shares','num_triangles_in_max_cc_common','num_triangles_in_max_cc_comments','num_triangles_in_max_cc_likes','num_triangles_in_max_cc_shares','relnum_nodes_form_triangles_in_cc_common','relnum_nodes_form_triangles_in_cc_comments','relnum_nodes_form_triangles_in_cc_likes','relnum_nodes_form_triangles_in_cc_shares','avg_clust_coef_common','avg_clust_coef_comments','avg_clust_coef_likes','avg_clust_coef_shares','num_cores_common','num_cores_comments','num_cores_likes','num_cores_shares','rel_max_core_size_common','rel_max_core_size_comments','rel_max_core_size_likes','rel_max_core_size_shares','communities_num','communities_avg_size','communities_max_size','communities_rel_avg_size','communities_rel_max_size','communities_modularity','upper_bound_num_colors_common','upper_bound_num_colors_comments','upper_bound_num_colors_likes','upper_bound_num_colors_shares','avg_btwc','avg_degc','avg_eigc','max_btwc','max_degc','max_eigc','num_nodes_high_btwc','num_nodes_high_degc','num_nodes_high_eigc','rel_num_nodes_high_btwc','rel_num_nodes_high_degc','rel_num_nodes_high_eigc','num_nodes_from_top100_pagerank_in_each_collection']
# Selecting by this list raises KeyError if any listed column is missing.
df_main = df_main[cols]
df_main.to_csv(save_name+'.csv',index=False)
df_main.to_excel(save_name+'.xls',index=False)

In [None]:
def _read_id_list(list_path):
    """Return the non-empty, stripped lines of ``list_path`` as strings."""
    with open(list_path, "r") as handle:
        stripped = (line.strip() for line in handle)
        return [line for line in stripped if line]


def main():
    """Compute graph measures for every listed city and store them.

    For each city id the shares/likes/comments graphs and edge frames are
    loaded from ``ready_sframes_sgraphs``, a combined "common" graph is built
    and saved, ``get_graph_measures`` is run on all four graphs, and one row
    is appended to the SFrame saved as ``summary_main`` (also exported as
    ``summary_main_csv``). The summary is saved after every city and cities
    whose id is already in it are skipped, so an interrupted run can be
    restarted. A city that fails is reported with a traceback and skipped.
    """
    import traceback  # local import: only used for error reporting below

    upfolder_cmn_and_ccs = "common_graphs_and_max_ccs"
    upfolder = "ready_sframes_sgraphs"
    period = ""
    summary_name = "summary_main"

    # Order matters: it fixes the order in which keys enter the summary dict.
    base_collections = ("shares", "likes", "comments")
    all_collections = base_collections + ("common",)
    # Columns of each edge frame that become the common from_id/to_id pair.
    renames = {
        "shares": {"owner_id": "from_id", "copy_history_owner_id": "to_id"},
        "likes": {"object_owner_id": "to_id"},
        "comments": {"object_owner_id": "to_id"},
    }
    # Short tags used in the file names of the saved largest components.
    max_cc_tags = {"shares": "sh", "likes": "lk", "comments": "cmt", "common": "comn"}

    sf_main = graphlab.SFrame()
    if os.path.exists(summary_name):
        sf_main = graphlab.load_sframe(summary_name)
    # Ids already present in the saved summary; extended as cities finish.
    done_ids = set()
    if "city_id" in sf_main.column_names():
        done_ids = set(sf_main["city_id"])

    cities_prot = _read_id_list("protest_small_cities_unique_names_ids.lst")
    cities_non_prot_w = _read_id_list("nonprotest_small_cities_unique_names_weather_ids.lst")
    # NOTE(review): single hard-coded extra id; its meaning is not visible here.
    cities_non_prot_nonw = ["54"]
    cities = list(itertools.chain(cities_prot, cities_non_prot_w, cities_non_prot_nonw))
    protest_ids = set(cities_prot)

    sm_cities = pd.read_stata("cities below 100 with population.dta")
    sm_cities["id"] = sm_cities["id"].astype(int)
    # Index by a copy of the id so the "id" column itself stays in the frame.
    sm_cities["id_index"] = sm_cities["id"]
    sm_cities = sm_cities.set_index('id_index')

    for city in cities:
        city_str = str(city)
        try:
            print(city_str)
            city_id = int(city)

            if city_id in done_ids:
                print("%s city has already been done" % city_str)
                continue
            print("analysis of the city %s" % city_str)

            graphs = {}
            frames = {}
            for name in base_collections:
                prefix = os.path.join(upfolder, name + "_" + period)
                graphs[name] = graphlab.load_sgraph(prefix + "sgraph_city_" + city_str)
                frame = graphlab.load_sframe(prefix + "sframe_city_" + city_str)
                # Re-bind the result so this works whether rename() modifies
                # the frame in place or returns a renamed frame.
                frames[name] = frame.rename(renames[name])

            fcommon = graphlab.SFrame()
            for name in base_collections:
                fcommon = fcommon.append(frames[name])
            # Collapse parallel edges across collections, summing "total".
            fcommon = fcommon.groupby(["to_id", "from_id"],
                                      {"total": graphlab.aggregate.SUM("total")})

            gcommon = SGraph()
            gcommon = gcommon.add_edges(fcommon, src_field="from_id", dst_field="to_id")
            city_suffix = period + ("city_%s" % city_str)
            fcommon.save(os.path.join(upfolder_cmn_and_ccs, "common_sframe_" + city_suffix))
            gcommon.save(os.path.join(upfolder_cmn_and_ccs, "common_sgraph_" + city_suffix))
            graphs["common"] = gcommon

            for name in all_collections:
                print(graphs[name])

            summary = {}
            suppl = {}
            summary["protest"] = city_str in protest_ids
            summary["city_id"] = city_id
            summary["city_name"] = sm_cities.at[city_id, "city_name_eng"]
            summary["city_pop_stata"] = sm_cities.at[city_id, "pop"]
            for name in all_collections:
                part_summary, part_suppl = get_graph_measures(graphs[name])
                for dkey in part_summary:
                    summary[dkey + "_" + name] = part_summary[dkey]
                for dkey in part_suppl:
                    suppl[dkey + "_" + name] = part_suppl[dkey]
                del part_summary
                del part_suppl

            # NOTE(review): the key says "top100" but get_graph_measures
            # returns the top 50 by in-degree; key kept for compatibility.
            print("intersection of top %d by in_degree vertices in different graphs" % 50)
            intersection_in_degree = set.intersection(
                *[set(suppl["top_in_degree_" + name]) for name in all_collections])
            summary["num_nodes_from_top100_indegree_in_each_collection"] = len(intersection_in_degree)

            print("intersection of top %d by pagerank vertices in different graphs" % 100)
            intersection_pagerank = set.intersection(
                *[set(suppl["top_pagerank_" + name]) for name in all_collections])
            summary["num_nodes_from_top100_pagerank_in_each_collection"] = len(intersection_pagerank)

            for name in base_collections + ("common",):
                target = os.path.join(upfolder_cmn_and_ccs,
                                      "max_cc_" + max_cc_tags[name] + "_" + city_suffix)
                suppl["max_cc_" + name].save(target, format="csv")

            # Wrap every scalar in a one-element list (in place, so the key
            # order is unchanged) to build a single-row SFrame.
            for key in list(summary):
                summary[key] = [summary[key]]
            sf = graphlab.SFrame(summary)
            sf_main = sf_main.append(sf)
            sf_main.save(summary_name)
            sf_main.save(summary_name + "_csv", format="csv")
            done_ids.add(city_id)
        except Exception:
            # Best-effort batch: report the failure instead of hiding it,
            # then carry on with the next city.
            print("city %s failed and was skipped" % city_str)
            traceback.print_exc()
            continue

In [None]:
# Run the per-city batch analysis.
main()