In [136]:
from lib import *
from pathcensus import PathCensus
from pathcensus.nullmodels import UBCM
from pathcensus.inference import Inference
from pathcensus.utils import set_seed
import joblib

import igraph as ig
import re

In [103]:

def statistics(graph: ig.Graph) -> pd.DataFrame:
    """Compute structural coefficients of ``graph`` as a one-row data frame.

    The row holds the global, node-averaged and edge-averaged similarity
    and complementarity values taken from a ``PathCensus`` of ``graph``.
    The full node-wise coefficient table is stored under ``"coefs"``.
    """
    census = PathCensus(graph)
    node_coefs = census.coefs("nodes")
    row = {
        "sim_g":   census.similarity("global"),
        "sim":     node_coefs["sim"].mean(),
        "sim_e":   census.similarity("edges").mean(),
        "comp_g":  census.complementarity("global"),
        "comp":    node_coefs["comp"].mean(),
        "comp_e":  census.complementarity("edges").mean(),
        # Wrapped in a list so the whole table lands in a single cell.
        "coefs":   [node_coefs],
    }
    return pd.DataFrame(row, index=[0])

def get_metadata(graph: ig.Graph):
    """Collect the graph-level attributes of ``graph`` into a dictionary.

    ``"dataset"`` is the first whitespace-separated token of the graph
    name; ``"idx"`` falls back to ``0`` when the graph has no such
    attribute.
    """
    full_name = graph["name"]
    metadata = {"dataset": full_name.split()[0], "name": full_name}
    for key in ("domain", "relation", "desc", "label"):
        metadata[key] = graph[key]
    if "idx" in graph.attributes():
        metadata["idx"] = graph["idx"]
    else:
        metadata["idx"] = 0
    return metadata


def get_largest_component(graph: ig.Graph, **kwds) -> ig.Graph:
    """Return the subgraph induced by the largest component of ``graph``.

    Any ``**kwds`` are forwarded to :py:meth:`igraph.Graph.components`.
    When several components share the maximal size, the first one
    yielded is used.
    """
    # ``max`` keeps the first maximal item, which matches a strict ``>``
    # scan; ``default=None`` covers the case of no components at all.
    largest = max(graph.components(**kwds), key=len, default=None)
    return graph.induced_subgraph(largest)

In [104]:
def get_digits_from_string(string: str) -> str:
    """Return the ASCII digits ``0``-``9`` of ``string``, in order, as one string."""
    return "".join(re.findall("[0-9]", string))

In [135]:
# Accumulator for the one-row result frames produced per network.
rawdata = []


In [105]:
# Passed as ``n`` to ``Inference.init_comparison`` for every network.
N_SAMPLES = 200

# The null-model comparison relies on random sampling. Seed the random
# generators once, up front, so that "Restart Kernel -> Run All" gives the
# same numbers on every run.
SEED = 303
set_seed(SEED)

In [146]:
def file_filter(file):
    """Return whether ``file.is_file()`` holds (drops directories matched by a glob)."""
    return file.is_file()


# Recursively collect the network files below the "offline" data directory.
friendship_networks_file_names = [
    path
    for path in (DATA_DIR_PATH / "offline").glob("**/friendship*")
    if file_filter(path)
]
health_advice_networks_file_names = [
    path
    for path in (DATA_DIR_PATH / "offline").glob("**/health-advice*")
    if file_filter(path)
]


In [149]:
# Sort the networks by the number embedded in their file name.
def _network_number(path) -> int:
    """Return the digits of ``path.name`` as an integer (``0`` if there are none)."""
    digits = get_digits_from_string(path.name)
    return int(digits) if digits else 0


# A numeric key orders "_2" < "_10" < "_100" correctly; a zero-padded string
# key of width 2 would place "_100" before "_20".
friendship_networks_file_names = sorted(
    friendship_networks_file_names,
    key=_network_number,
)

health_advice_networks_file_names = sorted(
    health_advice_networks_file_names,
    key=_network_number,
)

health_advice_networks_file_names[:3]

[PosixPath('/home/af/Projects/uzh-network-science-project/data/offline/medium/ugandan_village/health-advice_1.gml'),
 PosixPath('/home/af/Projects/uzh-network-science-project/data/offline/medium/ugandan_village/health-advice_2.gml'),
 PosixPath('/home/af/Projects/uzh-network-science-project/data/offline/medium/ugandan_village/health-advice_3.gml')]

In [151]:
# Inspect the stem (file name without suffix) of the first health-advice file.
health_advice_networks_file_names[0].stem

'health-advice_1'

In [160]:
# All network files to process: health-advice networks first, then friendship.
ugandan_village_networks = [
    *health_advice_networks_file_names,
    *friendship_networks_file_names,
]

In [169]:
# Start from an empty list so that re-running the loop below does not
# append every network a second time.
rawdata = []

In [173]:
# Fit a UBCM null model to every network and append a one-row data frame
# with its descriptive properties and comparison results to ``rawdata``.

# Settings shared by all networks (loop-invariant).
dataset = "ugandan_village"
domain = "social"
desc = "offline"
alpha = 0.01  # significance level for the node-wise p-values

for network_path in ugandan_village_networks:
    network_name = network_path.stem
    is_friendship = "friendship" in network_name
    relation = "friendship" if is_friendship else "health-advice"
    # Take the network number from the file name only: digits that occur
    # elsewhere in the path (e.g. in a directory name) must not end up in
    # the label.
    network_number = get_digits_from_string(network_name)
    label = (
        "Friendship ({})".format(network_number)
        if is_friendship
        else "Advice ({})".format(network_number)
    )

    graph = ig.Graph.Read_GML(gml_cleaner(network_path))
    n_nodes = graph.vcount()
    lc = graph.connected_components().giant()  # largest connected component
    frac_total = lc.vcount() / n_nodes
    degseq = np.array(graph.degree())

    # Descriptive properties of the network (one row).
    net = pd.DataFrame({
        "dataset": dataset,
        "network_name": network_name,
        "domain": domain,
        "relation": relation,
        "desc": desc,
        "label": label,
        "n_nodes": n_nodes,
        "frac_total": frac_total,
        "density":    graph.density(),
        "dbar":       degseq.mean(),
        "dcv":        degseq.std() / degseq.mean(),
        "dmax":       degseq.max(),
    }, index=[0])

    # Fit the UBCM null model.
    model = UBCM(graph)
    model.fit()
    model.validate()
    infer = Inference(graph, model, statistics)
    data, null = infer.init_comparison(n=N_SAMPLES)

    # Estimate fractions of significant nodes. ``pop`` removes the "coefs"
    # column, so ``data`` and ``null`` keep only the scalar statistics.
    odf = pd.concat(data.pop("coefs").tolist())
    ndf = pd.concat(null.pop("coefs").tolist())

    # NOTE(review): the return values are ignored, so ``add_stats_index``
    # presumably modifies the frames in place -- confirm.
    infer.add_stats_index(odf)
    infer.add_stats_index(ndf)

    # Add an outer index level named "_" with the constant value 0.
    odf = pd.concat([odf], keys=[0], names=["_"])
    ndf = pd.concat([ndf], keys=[0], names=["_"])

    pvals = infer.estimate_pvalues(odf, ndf, alpha=alpha, adjust=True)
    sig = (pvals <= alpha)[["sim", "comp"]]
    # ``assign`` returns a new frame instead of writing into a column subset.
    sig = sig.assign(both=sig.all(axis=1)).mean().to_frame().T

    # Compute calibrated coefficients: mean log-ratio of observed to null
    # statistics, after dropping rows with infinite or missing ratios.
    cdata = (
        np.log(data / null)
        .reset_index(drop=True)
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
        .mean()
        .to_frame()
        .T
    )

    # Each nested frame is wrapped in a list so it fills the single row.
    net["rawdata"]     = [data]
    net["calibrated"]  = [cdata]
    net["significant"] = [sig]
    rawdata.append(net)


In [174]:
# Inspect the result row of the first processed network.
rawdata[0]

Unnamed: 0,dataset,network_name,domain,relation,desc,label,n_nodes,frac_total,density,dbar,dcv,dmax,rawdata,calibrated,significant
0,ugandan_village,health-advice_1,social,health-advice,offline,Advice (1),190,0.984211,0.026344,4.978947,1.271589,61,sim_g sim sim_e comp_g ...,sim_g sim sim_e comp_g ...,sim comp both 0 0.015789 0...


In [175]:
# Stack the per-network rows into a single table with a fresh 0..n-1 index.
villages = pd.concat(rawdata, ignore_index=True)

# Save data: compressed joblib pickle next to the other data files.
joblib.dump(villages, DATA_DIR_PATH / "social.pkl.gz", compress=True)

['/home/af/Projects/uzh-network-science-project/data/social.pkl.gz']

In [109]:
# Walk through the workflow step by step on a single network (for testing).

In [110]:
# First entry of the friendship file list; used for the single-network walkthrough.
friendship_1_path = friendship_networks_file_names[0]

In [111]:
# Load the network from GML. The path is first passed through ``gml_cleaner``
# NOTE(review): ``gml_cleaner`` is not defined in this notebook (presumably
# provided by ``from lib import *``) -- confirm what it returns.
friendship_1 = ig.Graph.Read_GML(gml_cleaner(friendship_1_path))

In [112]:
# Largest component via igraph's own clustering API; display its vertex count.
largest_component = friendship_1.connected_components().giant()
largest_component.vcount()

202

In [113]:
# Get the size of the largest component with the hand-written helper, as in
# the paper. It gives the same result as igraph's ``giant()`` but takes a
# more roundabout route.
get_largest_component(friendship_1).vcount()

202

In [114]:
# Number of vertices in the test network.
n_nodes = friendship_1.vcount()
n_nodes

203

In [115]:
# Edge density of the test network.
density = friendship_1.density()
density

0.02926401014485685

In [116]:
# Degree of every vertex, as a NumPy array.
degseq = np.array(friendship_1.degree())
degseq

array([ 8, 11,  5, 15,  6, 16, 11,  4,  6, 15,  8, 11, 15,  3,  6,  7,  5,
        3,  8,  5,  7,  7, 12,  7, 38,  3,  8, 11,  3,  6,  3,  4,  5,  9,
        5, 12,  3,  6,  4,  9,  7,  4,  3,  6,  4,  5,  5,  4,  3,  2,  6,
        6, 19,  6,  5,  5,  7,  8, 10,  9,  3, 13,  3,  9,  4,  1,  4,  4,
        3,  3,  6,  3,  7, 17,  9,  8,  4,  3,  4,  5, 11,  8, 15,  8, 13,
       17,  5,  8,  1,  7,  7,  8,  4,  5,  3,  4,  4,  9, 14, 10,  4,  4,
        4,  1,  7,  5,  9,  2, 11, 14,  2,  4,  6,  4,  7,  3,  8,  8,  3,
        4,  9,  6,  2, 10,  3,  3,  8,  3,  6, 17,  1,  6,  6,  6, 12,  3,
        2,  1,  8,  3,  6,  4,  5,  4, 10,  3,  5,  5,  5,  1,  8,  4,  7,
        5,  3,  5,  3,  7,  1,  3,  2,  6,  3,  6,  2,  3,  4,  1,  2,  1,
        8,  3,  3,  2,  1,  2,  5,  7,  1,  5,  7,  4,  3,  5,  7,  4,  2,
        9,  2,  5, 16,  4,  1,  2,  2,  1,  1,  2,  1,  3,  3,  1,  1])

In [117]:
# Fraction of all vertices that belong to the largest connected component.
largest_component.vcount() / friendship_1.vcount()

0.9950738916256158

In [118]:
# Average degree.
dbar = degseq.mean()
dbar

5.911330049261084

In [119]:
# Degree coefficient of variation: standard deviation over mean.
dcv = degseq.std() / dbar
dcv

0.7396142012337694

In [120]:
# Maximum degree.
dmax = degseq.max()
dmax

38

In [121]:
# Check whether the loaded graph is directed.
friendship_1.is_directed()

False

In [122]:
# One-row summary of the test network; the graph object itself is kept in
# the "graph" column (wrapped in a list so it fills a single cell).
net = pd.DataFrame(
    {
        "graph": [friendship_1],
        "n_nodes": friendship_1.vcount(),
        "frac_total": largest_component.vcount() / friendship_1.vcount(),
        "density": friendship_1.density(),
        "dbar": degseq.mean(),
        "dcv": degseq.std() / degseq.mean(),
        "dmax": degseq.max(),
    },
    index=[0],
)

In [123]:
# Display the summary row built so far.
net

Unnamed: 0,graph,n_nodes,frac_total,density,dbar,dcv,dmax
0,"IGRAPH U--- 203 600 --\n+ attr: id (v), id (e)...",203,0.995074,0.029264,5.91133,0.739614,38


In [124]:
# Fit the UBCM null model to the test network, then validate the fit.
# ``validate()`` is the last expression of the cell, so its return value
# is what the notebook displays.
model = UBCM(friendship_1)
model.fit()
model.validate()


<pathcensus.nullmodels.ubcm.UBCM at 0x7fc3577f5d30>

In [125]:
# Set up the inference object with the ``statistics`` function and run the
# comparison; it returns a pair unpacked as ``data`` and ``null``.
# NOTE(review): presumably ``data`` holds the observed statistics and ``null``
# one row per null-model sample (``n=N_SAMPLES``) -- confirm in pathcensus docs.
infer = Inference(friendship_1, model, statistics)
data, null = infer.init_comparison(n=N_SAMPLES)


In [126]:
# Estimate fractions of significant nodes.
# ``pop`` removes the "coefs" column from ``data`` / ``null``; the frames it
# contained are stacked into one table each. Because of the ``pop`` this
# cell cannot be re-run without first re-creating ``data`` and ``null``.
odf = pd.concat(data.pop("coefs").tolist())
ndf = pd.concat(null.pop("coefs").tolist())


In [127]:
# Add the statistics index to both tables.
# NOTE(review): the first call's return value is discarded, so the method
# presumably modifies its argument in place -- confirm. The second call is
# the last expression of the cell, so its return value is displayed.
infer.add_stats_index(odf)
infer.add_stats_index(ndf)

Unnamed: 0_level_0,Unnamed: 1_level_0,sim,tclust,tclosure,comp,qclust,qclosure
i,di,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,6.0,0.066667,0.071429,0.062500,0.020683,0.022523,0.019120
1,7.0,0.117647,0.138889,0.102041,0.029777,0.031008,0.028640
2,4.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,14.0,0.031496,0.035714,0.028169,0.031128,0.032323,0.030019
4,6.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...
198,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
199,2.0,,,,,,
200,3.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
201,1.0,0.000000,,0.000000,0.000000,,0.000000


In [128]:
# Prepend an outer index level named "_" with the constant key 0 to each table.
odf = pd.concat([odf], keys=[0], names=["_"])
ndf = pd.concat([ndf], keys=[0], names=["_"])

In [129]:
# Display the observed node-wise coefficients with their new index.
odf

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sim,tclust,tclosure,comp,qclust,qclosure
_,i,di,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,6.0,0.246914,0.333333,0.196078,0.043143,0.057143,0.034653
0,1,7.0,0.156863,0.190476,0.133333,0.039360,0.045455,0.034707
0,2,4.0,0.195122,0.333333,0.137931,0.000000,0.000000,0.000000
0,3,14.0,0.136054,0.109890,0.178571,0.035614,0.026462,0.054441
0,4,6.0,0.141176,0.200000,0.109091,0.018018,0.022305,0.015113
0,...,...,...,...,...,...,...,...
0,198,1.0,0.000000,,0.000000,0.000000,,0.000000
0,199,2.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0,200,3.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0,201,1.0,0.000000,,0.000000,0.000000,,0.000000


In [130]:
# Significance level, passed to the p-value estimation and used as the
# threshold below.
alpha = 0.01
pvals = infer.estimate_pvalues(odf, ndf, alpha=alpha, adjust=True)
# Boolean flags (p-value <= alpha), restricted to the "sim" and "comp" columns.
sig   = (pvals <= alpha)[["sim", "comp"]]

In [131]:
# Fractions of rows flagged as significant for "sim", for "comp", and for
# both at once, as a one-row frame.
# The flags are rebuilt from ``pvals`` here, and ``assign`` returns a new
# frame, so the cell yields the same ``sig`` however often it is executed.
sig_flags = (pvals <= alpha)[["sim", "comp"]]
sig = sig_flags.assign(both=sig_flags.all(axis=1)).mean().to_frame().T

In [132]:
# Compute calibrated coefficients: the mean log-ratio of ``data`` to ``null``,
# after turning infinite ratios into NaN and dropping rows that contain NaN.
cdata = (
    np.log(data / null)
    .reset_index(drop=True)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .mean()
    .to_frame()
    .T
)

# Each nested frame is wrapped in a list so it fills the single row of ``net``.
net["rawdata"] = [data]
net["calibrated"] = [cdata]
net["significant"] = [sig]


In [133]:
# Display the summary row including the nested result frames.
net

Unnamed: 0,graph,n_nodes,frac_total,density,dbar,dcv,dmax,rawdata,calibrated,significant
0,"IGRAPH U--- 203 600 --\n+ attr: id (v), id (e)...",203,0.995074,0.029264,5.91133,0.739614,38,sim_g sim sim_e comp_g ...,sim_g sim sim_e comp_g ...,sim comp both 0 0.009852 0...


In [138]:
# Collect only the single test network. Rebinding (instead of appending)
# keeps this cell idempotent and keeps the test row, which has different
# columns, out of the list filled by the loop over all networks.
rawdata = [net]

In [140]:
# Stack the collected rows into a single table with a fresh 0..n-1 index.
villages = pd.concat(rawdata, ignore_index=True)

# Save data: compressed joblib pickle next to the other data files.
joblib.dump(villages, DATA_DIR_PATH / "villages.pkl.gz", compress=True)

['/home/af/Projects/uzh-network-science-project/data/villages.pkl.gz']