## We run the methods of the paper on additional social networks
### In this notebook, we run it on small offline social networks

In [6]:
# reload imported files automatically without restarting the kernel
%load_ext autoreload
%autoreload 2

In [7]:
from lib import *
from pathcensus import PathCensus
from pathcensus.nullmodels import UBCM
from pathcensus.inference import Inference
from pathcensus.utils import set_seed
import networkx as nx


In [8]:
def preprocess_graph(g):
    g = nx.Graph(g)  # remove multiedges if graph is multigraph
    g.remove_edges_from(list(nx.selfloop_edges(g)))  # remove self-loops
    largest_cc = max(
        nx.connected_components(g), key=len
    )  # get largest connected component
    return g.subgraph(largest_cc).copy()

In [9]:
def calculate_structural_measures(network_name, g, n_samples_null_model):
    """
    This function calculates the structural measures of a network and 
    its null model average values by fitting a UBCM null model to the network.
    """
    n_total = g.number_of_nodes()  # get total number of nodes
    # remove self-loops and multiedges and get largest connected component
    #g = preprocess_graph(g)  ; g
    
    n_giant = g.number_of_nodes()  # get number of nodes in largest connected component
    degseq = sorted([d for n, d in g.degree()], reverse=True)  # get degree sequence
    dataset = ""
    network_name = network_name.split(".")[0]
    label = "large online"
    
    properties = {
            "idx": 1,
            "dataset": dataset,
            "name": network_name,
            #"graph": [g],  # get graph
            "n_nodes": n_giant,  # get number of nodes in largest connected component
            "frac_total": n_giant
            / n_total,  # get fraction of nodes in largest connected component
            "density": nx.density(g),  # get density
            "dbar": np.mean(degseq),  # get mean degree
            "dcv": np.std(degseq)
            / np.mean(degseq),  # get coefficient of variation of degree
            "dmax": np.max(degseq),  # get maximum degree
        }
    
    model = UBCM(g)  # initialize model
    model.fit()  # fit model
    model.validate()  # validate model
    # compare null model to actual graph using statistics function
    infer = Inference(g, model, statistics)
    data, null = infer.init_comparison(n=n_samples_null_model)
    original_network_values = dict(data)
    original_network_values = {'data_' + k: v for k, v in original_network_values.items()}
    null_model_mean_values = dict(null.mean(numeric_only=True)) 
    null_model_mean_values = {f'null_{n_samples_null_model}_{k}': v for k, v in null_model_mean_values.items()}
    return properties | null_model_mean_values |original_network_values | null_model_mean_values

In [10]:
OUTPUT_CSV_FILE_PATH = DATA_DIR_PATH / "structural_measures_small_offline.csv"

In [11]:
file_filter = lambda file: file.is_file()
small_offline_network_files = list(filter(file_filter, (DATA_DIR_PATH / "offline" / "small").glob("**/*")))

results = []
for i, file in enumerate(small_offline_network_files): # loop over all online large social network files
    file_size = file.stat().st_size
    print(file_size)
    if file_size > 200000000: 
        continue
    f = gml_cleaner(file) # clean gml file
    g = nx.read_gml(f, label="id") # load into networkx
    print("Running calculations for network: ", file.name) 
    # calculate measures from original network and null model
    result = calculate_structural_measures(file.name.split(".")[0] , g, n_samples_null_model=100) 
    df = pd.DataFrame(result).reset_index()
    df[df.columns[:-1]].to_csv(OUTPUT_CSV_FILE_PATH, mode='a', header=not OUTPUT_CSV_FILE_PATH.is_file()) # append result row to csv file
    results.append(result) # collect results


7804
Running calculations for network:  77.gml
7896
Running calculations for network:  78.gml
8032
Running calculations for network:  november17.gml
2570133
Running calculations for network:  sp_hospital.gml
1996401
Running calculations for network:  sp_kenyan_households.gml
16319
Running calculations for network:  terrorists_911.gml


In [12]:
df = pd.concat([pd.DataFrame(result) for result in results]).reset_index() # combine all data to dataframe for analysis
df = df[df.columns[:-1]]

In [13]:
df

Unnamed: 0,_,idx,dataset,name,n_nodes,frac_total,density,dbar,dcv,dmax,...,null_100_sim_e,null_100_comp_g,null_100_comp,null_100_comp_e,data_sim_g,data_sim,data_sim_e,data_comp_g,data_comp,data_comp_e
0,0,1,,77,34,1.0,0.137255,4.529412,0.828221,16,...,0.24461,0.056841,0.048548,0.057792,0.258317,0.237123,0.276156,0.062609,0.048105,0.057827
1,0,1,,78,34,1.0,0.139037,4.588235,0.832643,17,...,0.243715,0.056078,0.047906,0.056902,0.255682,0.239593,0.275417,0.060734,0.047097,0.056215
2,0,1,,november17,22,1.0,0.285714,6.0,0.596708,14,...,0.441896,0.063556,0.060099,0.065486,0.528662,0.429924,0.521119,0.019119,0.013873,0.018306
3,0,1,,sp_hospital,75,1.0,11.684324,864.64,1.106573,4286,...,0.551749,0.036168,0.039116,0.038977,0.708105,0.58771,0.620922,0.018439,0.027397,0.028573
4,0,1,,sp_kenyan_households,47,1.0,30.19704,1389.06383,0.884969,4193,...,0.547151,0.057008,0.061691,0.06078,0.719568,0.686484,0.696276,0.009628,0.010332,0.010902
5,0,1,,terrorists_911,62,1.0,0.080381,4.903226,0.815551,22,...,0.16061,0.077721,0.057134,0.073036,0.360882,0.292575,0.365698,0.014225,0.010214,0.013378


In [36]:
from typing import Any, Optional, Dict, Iterable
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
from matplotlib import patheffects
import matplotlib.pyplot as plt
import joblib
from src.utils import get_root_path


sns.set_theme(
    style="ticks",
    font_scale=1.5,
    rc={ 
        "figure.figsize": (15, 8), 
        "font.size": 14,
    }
)

COLORS = (
    "#EB7159", 
    "#66C8E6", 
    "#AB65B5", 
    "#829F49",
    "#6876CE",
    "#BE8A3B",
    "#CF5786",
    "#AD483A"
) 

In [37]:
def plot_spectrum(
    data: pd.DataFrame,
    ax: mpl.axes.Axes,
    *,
    colors: Iterable[str] = COLORS[:2],
    coefs: Iterable[str] = ("sim", "comp"),
    ci: float = .95,
    plot_kws: Optional[Dict] = None,
    scatter_kws: Optional[Dict] = None,
    ci_kws: Optional[Dict] = None,
    logx: bool = True,
    logy: bool = False
) -> None:
    """Plot structural coefficients spectrum for a given network.
    
    Parameters
    ----------
    data
        Data frame with observed and null model data
        grouped in node degree bins.
    ax
        Axes to use for plotting.
    """
    assert len(coefs) == len(colors),\
    "'coefs' and 'colors' must have the same length"

    alpha = 1 - ci
    cols  = [ "dbin", *coefs ]

    odf = data.loc[data["which"] == "observed", cols]
    ndf = data.loc[data["which"] == "randomized", cols] 
    
    mean = ndf.groupby("dbin").mean().reset_index()
    low  = ndf.groupby("dbin").quantile(alpha/2).reset_index()
    high = ndf.groupby("dbin").quantile(1 - alpha/2).reset_index()

    plot_kws    = { **LINE_PARAMS, **(plot_kws or {}) }
    scatter_kws = { "edgecolors": "white", "zorder": 100, **(scatter_kws or {}) }
    ci_kws      = ci_kws or {}

    # Plot observed trend
    for coef, color in zip(coefs, colors):
        # plot line
        args = (odf["dbin"], odf[coef])
        kws  = { "color": color, "ls": "-", **plot_kws }
        ax.plot(*args, **kws)
        # plot markers
        kws = { "color": color, "marker": "o", "s": 150, **scatter_kws }
        ax.scatter(*args, **kws)
    
    # Plot null model trend
    for coef, color in zip(coefs, colors):
        # plot line
        args = (mean["dbin"], mean[coef])
        kws  = { "color": color, "ls": "--", **plot_kws }
        ax.plot(*args, **kws)
        # plot CI
        args = (mean["dbin"], low[coef], high[coef])
        kws = { "color": color, "alpha": .2, **ci_kws }
        ax.fill_between(*args, **kws)

        label = data["label"].iloc[0]
        ax.set_title(label, **FONTS)

    # Customize aethetics
    ax.tick_params(axis="both", which="major", labelsize=12)
    ax.tick_params(axis="both", which="minor", labelsize=12)
    formatter = mpl.ticker.FormatStrFormatter("%.2f")
    ax.yaxis.set_major_formatter(formatter)
    # Transform axes
    if logx:
        ax.set_xscale("log", base=2)
    if logy:
        ax.set_yscale("log", base=2)


def get_legend_spec(
    lw: int = 6,
    *,
    colors: Iterable[str] = COLORS[:2],
    labels: Iterable[str] = (r"$s_i$", r"$c_i$")
) -> Iterable:
    """Get legend specification for the plot.
    
    Parameters
    ----------
    lw
        Line width.
    """
    assert len(colors) == len(labels),\
        "'colors' and 'labels' must have the same length"
    lines = [
        mpl.lines.Line2D([0], [0], color=color, lw=lw)
        for color in colors 
    ]
    return lines, list(labels)

def get_legend_box_params(**kwds: Any) -> Dict:
    """Get legend box param dictionary."""
    return {
        "fontsize": 18,
        "edgecolor": "black",
        "facecolor": "#F4F4F4",
        **kwds
    }

In [42]:
import pandas as pd
pd.concat([df['data_sim'],df['data_sim_e'],df['null_100_sim_e'],df['null_100_comp_g'],df['null_100_comp'],df['null_100_comp_e'],df['data_sim_g'],df['data_sim'],df['data_sim_e'],df['data_comp_g'],df['data_comp']	,df['data_comp_e']])

0    0.237123
1    0.239593
2    0.429924
3    0.587710
4    0.686484
       ...   
1    0.056215
2    0.018306
3    0.028573
4    0.010902
5    0.013378
Length: 72, dtype: float64