## graph measures

In [None]:
#scipy.sparse.csgraph
import graphlab
from graphlab import SGraph
graphlab.canvas.set_target('ipynb')
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pandas as pd
import os.path
import itertools

def get_graph_measures(mgraph):
    """Compute a set of structural measures for a GraphLab SGraph.

    Runs GraphLab's degree counting, PageRank, graph coloring, connected
    components, triangle counting and k-core toolkits, and uses undirected
    networkx copies of the graph for the undirected degree statistics and
    the average clustering coefficient.

    Side effects: assigns a ``component_id`` column on ``mgraph.vertices``
    and shows a matplotlib bar chart of the component-size distribution.

    Parameters
    ----------
    mgraph : graphlab.SGraph
        Graph to analyse.

    Returns
    -------
    summary : dict
        Scalar measures keyed by name: ``num_nodes``, ``in_degree_avg``,
        ``in_degree_max``, ``degree_avg_undirected``,
        ``degree_max_undirected``, ``upper_bound_num_colors``,
        ``max_cc_size``, ``max_cc_relative_size``,
        ``num_triangles_in_max_cc``, ``relnum_nodes_form_triangles_in_cc``,
        ``num_cores``, ``rel_max_core_size`` and ``avg_clust_coef``.
    suppl : dict
        Supplementary objects: ``top_in_degree`` (ids of the 50 vertices
        with the highest in-degree), ``top_pagerank`` (ids of the 100
        vertices with the highest PageRank) and ``max_cc`` (subgraph built
        around the largest connected component).
    """
    summary = {}
    suppl = {}
    summary["num_nodes"] = int(len(mgraph.vertices))

    def in_degree(graph, top_num):
        """Return the degree-counting model and the top `top_num` ids by in-degree."""
        dgcount = graphlab.degree_counting.create(graph)
        # Second positional argument False == descending order.
        dgc_vert = dgcount['graph'].vertices.sort("in_degree", False)
        top_set = dgc_vert["__id"][0:top_num]
        return dgcount, top_set

    dgcount, top_set_dg = in_degree(mgraph, top_num=50)

    def convert_graphlab_to_nx(graph, di):
        """Copy vertex ids and edges of an SGraph into a networkx (Di)Graph."""
        nxgraph = nx.DiGraph() if di else nx.Graph()
        nxgraph.add_nodes_from(list(graph.vertices['__id']))
        nxgraph.add_edges_from([(e['__src_id'], e['__dst_id']) for e in graph.edges])
        return nxgraph

    nxmgraph = convert_graphlab_to_nx(mgraph, di=False)
    # dict() keeps this working both when degree() returns a plain dict
    # (networkx 1.x) and when it returns a degree view of (node, degree)
    # pairs that has no .values() (networkx 2.x).
    nxdeg = dict(nxmgraph.degree())

    in_degrees = dgcount["graph"].vertices["in_degree"]
    summary["in_degree_avg"] = 1.0*sum(in_degrees)/len(dgcount["graph"].vertices)
    summary["in_degree_max"] = max(in_degrees)
    summary["degree_avg_undirected"] = 1.0*sum(nxdeg.values())/len(nxdeg)
    summary["degree_max_undirected"] = max(nxdeg.values())
    suppl["top_in_degree"] = top_set_dg

    def pagerank(graph, top_num):
        """Return the PageRank model and the top `top_num` ids by PageRank."""
        pr = graphlab.pagerank.create(graph)
        pr_sort = pr['pagerank'].sort("pagerank", False)
        top_set = pr_sort["__id"][0:top_num]
        return pr, top_set

    prank, top_pagerank = pagerank(mgraph, top_num=100)
    suppl["top_pagerank"] = top_pagerank

    gcolor = graphlab.graph_coloring.create(mgraph)
    summary["upper_bound_num_colors"] = gcolor.num_colors

    def connected_components(graph, visualisation=False):
        """Return the components model, the largest component and its size.

        When `visualisation` is true, also plots how many components exist
        for each component size.
        """
        cc = graphlab.connected_components.create(graph)
        cc_sort = cc.component_size.sort("Count", False)
        if visualisation:
            # Histogram: for every distinct component size, the number of
            # components having that size.
            cc_grouped = cc_sort.groupby("Count", {"total": graphlab.aggregate.COUNT})
            cc_grouped = cc_grouped.sort("Count")
            y_pos = range(cc_grouped.num_rows())
            plt.barh(y_pos, cc_grouped["total"], align='center', alpha=0.8)
            plt.yticks(y_pos, cc_grouped["Count"])
            plt.xlabel("number of components")
            plt.title("number of components of each size")
            plt.show()
        # cc_sort is in descending size order, so row 0 is the largest component.
        max_component_size = cc_sort["Count"][0]
        # NOTE(review): this mutates the caller's graph and assumes both
        # vertex frames list the vertices in the same order - confirm.
        graph.vertices['component_id'] = cc['graph'].vertices['component_id']
        targets = cc["graph"].get_vertices(fields={"component_id": cc_sort["component_id"][0]})["__id"]
        max_component = graph.get_neighborhood(ids=targets, radius=1, full_subgraph=True)
        return cc, max_component, max_component_size

    cc, max_cc, max_cc_size = connected_components(mgraph, visualisation=True)
    suppl["max_cc"] = max_cc
    summary["max_cc_size"] = max_cc_size
    summary["max_cc_relative_size"] = 1.0*max_cc_size/mgraph.summary()['num_vertices']

    tc = graphlab.triangle_counting.create(max_cc)
    tc_num = tc['num_triangles']
    tcount = tc['triangle_count']
    tcount_not_in_triangles = tcount.filter_by(0, "triangle_count")
    summary["num_triangles_in_max_cc"] = tc_num
    # NOTE(review): despite the "relnum" prefix this is an absolute count of
    # vertices that take part in at least one triangle. It is left unchanged
    # so that previously stored summaries stay comparable - confirm intent.
    summary["relnum_nodes_form_triangles_in_cc"] = tcount.num_rows() - tcount_not_in_triangles.num_rows()

    def k_core(graph):
        """Return the highest core id and the number of vertices carrying it."""
        # 0 and 1000 are presumably the kmin/kmax bounds of the toolkit -
        # TODO confirm against the kcore documentation.
        kc = graphlab.kcore.create(graph, 0, 1000)
        kcore_id = kc['core_id']
        max_k = max(kcore_id["core_id"])
        # Vectorised logical filter instead of a per-row Python loop with
        # indexed SArray access.
        in_max_core = kcore_id[kcore_id["core_id"] == max_k]
        return max_k, in_max_core.num_rows()

    max_k, max_core_size = k_core(mgraph)
    # NOTE(review): "num_cores" stores the highest core id found.
    summary["num_cores"] = max_k
    # NOTE(review): the k-core is computed on the whole graph but normalised
    # by the size of the largest connected component - confirm intent.
    summary["rel_max_core_size"] = 1.0*max_core_size/max_cc_size

    nx_max_cc = convert_graphlab_to_nx(max_cc, di=False)
    avg_clust = nx.average_clustering(nx_max_cc)
    summary["avg_clust_coef"] = avg_clust

    return summary, suppl


## example

In [None]:
# Analyse one saved graph, store its largest component and its summary row.
graphid = 0
path = "mygraph"
mgraph = graphlab.load_sgraph(path)
summary, suppl = get_graph_measures(mgraph)
summary["graph_id"] = int(graphid)
suppl["max_cc"].save(
    "max_cc_" + ("graph_%s" % str(graphid)),
    format="csv",
)
# Wrap every scalar in a one-element list so each measure becomes a
# single-row column of the SFrame built below.
for key, value in list(summary.items()):
    summary[key] = [value]
sf = graphlab.SFrame(summary)
sf.save("summary_main")
sf

## combine with NetworKit results

In [None]:
import cPickle
import graphlab
from graphlab import SGraph
graphlab.canvas.set_target('ipynb')
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pandas as pd

#summary_name = "summary_main"
#summ_ntwk = "summary_networkit.pckl"
#save_name = "graph_measures"

def combine_with_networkit(summary_name, summ_ntwk, save_name):
    """Join the GraphLab summary with a pickled summary and export the result.

    Parameters
    ----------
    summary_name : str
        Location of the saved SFrame holding the GraphLab measures.
    summ_ntwk : str
        Path of a pickled pandas DataFrame; it must have a ``graph_id``
        column, which is cast to int before joining.
    save_name : str
        Base name for the outputs: the joined SFrame is saved under this
        name, and ``.csv`` / ``.xls`` exports are written next to it.

    Returns
    -------
    None
        If the two tables share more than one column name, or the first one
        lacks ``graph_id``, a message is printed and nothing is written.
    """
    graphlab_summary = graphlab.load_sframe(summary_name)
    networkit_df = pd.read_pickle(summ_ntwk)
    networkit_df["graph_id"] = networkit_df["graph_id"].astype(int)
    networkit_summary = graphlab.SFrame(data=networkit_df)

    # Exactly one common column (graph_id) is expected; no explicit join key
    # is passed below, so the key is left to SFrame.join's default.
    shared_columns = set(graphlab_summary.column_names()) & set(networkit_summary.column_names())
    if len(shared_columns) > 1:
        print("Sframes to combine have more than 1 equal column")
        return
    if "graph_id" not in graphlab_summary.column_names():
        print("First sframe does not have the column graph_id")
        return

    combined = graphlab_summary.join(networkit_summary, how="inner").sort("graph_id")
    combined.save(save_name)

    combined_df = combined.to_dataframe()
    combined_df.to_csv(save_name + '.csv', index=False)
    combined_df.to_excel(save_name + '.xls', index=False)

In [None]:
# Merge the saved GraphLab summary with the pickled summary and export it.
combine_with_networkit(
    summary_name="summary_main",
    summ_ntwk="summary_networkit.pckl",
    save_name="graph_measures",
)

In [None]:
def main(summary_name_, graph_ids):
    """Analyse a batch of saved SGraphs and accumulate their measures.

    For every id in ``graph_ids`` the graph ``my_sgraphs/sgraph_<id>`` is
    loaded and analysed with ``get_graph_measures``; its largest connected
    component is saved under ``max_ccs/max_cc_graph_<id>`` in CSV format.
    The summary row is appended to the running summary SFrame, which is
    re-saved after every graph, both under ``summary_name_`` and in CSV
    format under ``summary_name_ + "_csv"``.

    Ids already present in a previously saved summary are skipped. A failure
    while processing one graph is reported and does not stop the batch.

    Parameters
    ----------
    summary_name_ : str
        Location of the summary SFrame; loaded first if it already exists.
    graph_ids : iterable
        Graph identifiers; each must be convertible with ``int()``.
    """
    upfolder_ccs = "max_ccs"
    upfolder = "my_sgraphs"

    summary_name = summary_name_

    sf_main = graphlab.SFrame()
    if os.path.exists(summary_name):
        sf_main = graphlab.load_sframe(summary_name)

    # Keep the analysed ids in a set so the "already done" check does not
    # rebuild a list from the SFrame column for every graph.
    if "graph_id" in sf_main.column_names():
        analyzed_ids = set(sf_main["graph_id"])
    else:
        analyzed_ids = set()

    for graphid in graph_ids:
        try:
            if int(graphid) in analyzed_ids:
                # A single formatted string prints the same text under
                # Python 2 and Python 3 (print(a, b) prints a tuple in 2).
                print("%s graph has already been analyzed" % (graphid,))
                continue
            gid_str = str(graphid)
            print("analysis of the graph %s" % gid_str)
            # os.path.join instead of a hard-coded "\\" so the paths are
            # also valid outside Windows.
            cur_sgraph = graphlab.load_sgraph(os.path.join(upfolder, "sgraph_" + gid_str))

            summary, suppl = get_graph_measures(cur_sgraph)
            summary["graph_id"] = int(graphid)
            suppl["max_cc"].save(
                os.path.join(upfolder_ccs, "max_cc_" + ("graph_%s" % gid_str)),
                format="csv")

            # One-element lists turn each measure into a single-row column.
            sf = graphlab.SFrame(dict((key, [value]) for key, value in summary.items()))
            sf_main = sf_main.append(sf)
            analyzed_ids.add(int(graphid))
            # Persist after every graph so an interrupted batch can resume.
            sf_main.save(summary_name)
            sf_main.save(summary_name+"_csv", format="csv")
        except Exception as err:
            # Best effort: report the failure and move on to the next graph
            # instead of silently swallowing it.
            print("analysis of the graph %s failed: %r" % (graphid, err))
            continue

In [None]:
# Analyse graphs 0-9 and accumulate their measures in "summary_main".
main(summary_name_="summary_main", graph_ids=range(10))