In [96]:
# reload imported files automatically without restarting the kernel
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [97]:
from lib import *
from pathcensus import PathCensus
from pathcensus.nullmodels import UBCM
from pathcensus.inference import Inference
from pathcensus.utils import set_seed
import joblib

In [98]:
def statistics(graph: ig.Graph) -> pd.DataFrame:
    """Compute similarity / complementarity summary statistics for ``graph``.

    A ``PathCensus`` is built from the graph and a single-row data frame
    (index ``0``) is returned with the columns:

    * ``sim_g`` / ``comp_g`` -- value of ``similarity("global")`` /
      ``complementarity("global")``,
    * ``sim`` / ``comp`` -- mean of the ``"sim"`` / ``"comp"`` columns of
      ``coefs("nodes")``,
    * ``sim_e`` / ``comp_e`` -- mean of ``similarity("edges")`` /
      ``complementarity("edges")``,
    * ``coefs`` -- the complete ``coefs("nodes")`` table, stored as one cell.
    """
    census = PathCensus(graph)
    node_coefs = census.coefs("nodes")

    row = {
        "sim_g":  census.similarity("global"),
        "sim":    node_coefs["sim"].mean(),
        "sim_e":  census.similarity("edges").mean(),
        "comp_g": census.complementarity("global"),
        "comp":   node_coefs["comp"].mean(),
        "comp_e": census.complementarity("edges").mean(),
        # wrapped in a list so the whole table occupies a single cell
        "coefs":  [node_coefs],
    }
    return pd.DataFrame(row, index=[0])


In [99]:
# Accumulator for the per-network results: the processing loop further down
# appends one single-row DataFrame per network, and the rows are concatenated
# into one table at the end of the notebook.
rawdata = []

In [100]:
# Passed as `n` to `Inference.init_comparison` for every network in the
# processing loop below.
N_SAMPLES = 200

In [101]:
def file_filter(file):
    """Return True for paths whose ``is_file()`` is true (skips directories)."""
    return file.is_file()


# Recursively collect the network files below data/offline, one list per relation.
friendship_networks_file_names = [
    path
    for path in (DATA_DIR_PATH / "offline").glob("**/friendship*")
    if file_filter(path)
]
health_advice_networks_file_names = [
    path
    for path in (DATA_DIR_PATH / "offline").glob("**/health-advice*")
    if file_filter(path)
]


In [102]:
def _network_sort_key(path):
    """Sort key: ``get_digits_from_string`` of the file name, zero-padded to width 2.

    The padding makes the string comparison order "2" before "10".
    """
    return get_digits_from_string(path.name).zfill(2)


# sort both lists of network files with the same key
friendship_networks_file_names = sorted(
    friendship_networks_file_names, key=_network_sort_key
)
health_advice_networks_file_names = sorted(
    health_advice_networks_file_names, key=_network_sort_key
)

# quick visual check of the resulting order
health_advice_networks_file_names[:3]

[PosixPath('/home/af/Projects/uzh-network-science-project/data/offline/medium/ugandan_village/health-advice_1.gml'),
 PosixPath('/home/af/Projects/uzh-network-science-project/data/offline/medium/ugandan_village/health-advice_2.gml'),
 PosixPath('/home/af/Projects/uzh-network-science-project/data/offline/medium/ugandan_village/health-advice_3.gml')]

In [103]:
# All networks to process: every friendship network first, then every
# health-advice network.
ugandan_village_networks = [
    *friendship_networks_file_names,
    *health_advice_networks_file_names,
]

In [104]:
# Fix the random seed through pathcensus' `set_seed` helper before the
# null-model comparison below.
# NOTE(review): presumably this makes the sampling repeatable -- confirm which
# random generators `set_seed` actually covers.
set_seed(1019)


In [105]:
def preprocess_graph(g):
    """Return the largest connected component of ``g`` as a simple graph.

    The input is first converted to a plain ``nx.Graph`` (which drops parallel
    edges when ``g`` is a multigraph), self-loops are removed, and an
    independent copy of the subgraph induced by the largest connected
    component is returned. The graph passed in is not modified.
    """
    simple = nx.Graph(g)

    # materialise the self-loops first: edges must not be removed while iterating
    loops = list(nx.selfloop_edges(simple))
    simple.remove_edges_from(loops)

    giant_nodes = max(nx.connected_components(simple), key=len)
    return simple.subgraph(giant_nodes).copy()


In [106]:
# Reset the accumulator here so that re-running this cell on its own does not
# append every network a second time.
rawdata = []

# Metadata shared by all networks of this dataset.
dataset = "ugandan_village"
domain = "social"
desc = "offline"

# Significance level for the node-wise tests (identical for every network).
alpha = 0.01

for network_path in ugandan_village_networks:
    # --- load and clean the graph -------------------------------------------
    g = nx.read_gml(gml_cleaner(network_path), label="id")
    n_total = g.number_of_nodes()  # node count before preprocessing
    g = preprocess_graph(g)  # simple graph, no self-loops, largest component only
    n_giant = g.number_of_nodes()  # node count of the largest connected component
    degseq = sorted((d for _, d in g.degree()), reverse=True)  # degree sequence

    # --- descriptive metadata -----------------------------------------------
    network_name = network_path.stem
    # Take the number from the file name only (the same source the sort key
    # uses), not from the full path, so that digits in a parent directory
    # cannot leak into `idx` / `label`.
    network_number = get_digits_from_string(network_path.name)
    is_friendship = "friendship" in network_name
    relation = "friendship" if is_friendship else "health advice"
    label = ("Friendship ({})" if is_friendship else "Advice ({})").format(network_number)

    net = pd.DataFrame({
        "idx": int(network_number),
        "dataset": dataset,
        "name": network_name,
        "domain": domain,
        "relation": relation,
        "desc": desc,
        "label": label,
        "graph": [g],  # the preprocessed graph itself, stored in a single cell
        "n_nodes": n_giant,
        "frac_total": n_giant / n_total,  # share of nodes kept in the largest component
        "density":    nx.density(g),
        "dbar":       np.mean(degseq),  # mean degree
        "dcv":        np.std(degseq) / np.mean(degseq),  # coefficient of variation of degree
        "dmax":       np.max(degseq)  # maximum degree
    }, index=[0])

    # --- null model comparison ----------------------------------------------
    # fit UBCM null model
    model = UBCM(g)
    model.fit()
    model.validate()
    # compare the null model to the actual graph using the `statistics` function
    infer = Inference(g, model, statistics)
    data, null = infer.init_comparison(n=N_SAMPLES)

    # --- fraction of significant nodes --------------------------------------
    # Pull the "coefs" tables out of the observed / null summaries (this also
    # removes the column from `data` and `null`) and stack them.
    odf = pd.concat(data.pop("coefs").tolist())
    ndf = pd.concat(null.pop("coefs").tolist())

    # NOTE(review): the return value is unused, so add_stats_index presumably
    # modifies the frames in place -- confirm against pathcensus.
    infer.add_stats_index(odf)
    infer.add_stats_index(ndf)

    # add an outer index level named "_" with the single key 0
    odf = pd.concat([odf], keys=[0], names=["_"])
    ndf = pd.concat([ndf], keys=[0], names=["_"])

    pvals = infer.estimate_pvalues(odf, ndf, alpha=alpha, adjust=True)
    sig = (pvals <= alpha)[["sim", "comp"]]
    # `assign` returns a new frame, avoiding a column assignment on a subset
    # of another frame (the pattern behind SettingWithCopyWarning)
    sig = sig.assign(both=sig.all(axis=1))
    sig = sig.mean().to_frame().T  # one row: fraction of True per column

    # --- calibrated coefficients --------------------------------------------
    # log-ratio of observed to null statistics; rows containing non-finite
    # values are dropped before averaging
    cdata = np.log(data / null).reset_index(drop=True) \
        .replace([np.inf, -np.inf], np.nan) \
        .dropna() \
        .mean() \
        .to_frame().T

    net["rawdata"]     = [data]
    net["calibrated"]  = [cdata]
    net["significant"] = [sig]
    rawdata.append(net)

In [107]:
# Stack the per-network rows into one table with a fresh 0..n-1 index.
villages = pd.concat(rawdata, ignore_index=True)

# Persist the table as a compressed joblib pickle; the list of written file
# names returned by `joblib.dump` is shown as the cell output.
joblib.dump(villages, DATA_DIR_PATH / "social.pkl.gz", compress=True)

['/home/af/Projects/uzh-network-science-project/data/social.pkl.gz']