## We run the methods of the paper on additional social networks
### In this notebook, we run it on large online social networks

In [22]:
# reload imported files automatically without restarting the kernel
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
from lib import *
from pathcensus import PathCensus
from pathcensus.nullmodels import UBCM
from pathcensus.inference import Inference
from pathcensus.utils import set_seed
from mandarina.benchmark import timer
import networkx as nx


In [24]:
def fit_ubcm(g):
    model = UBCM(g)  # initialize model
    model.fit()  # fit model
    model.validate()  # validate model
    return model

In [25]:
@timer
def calculate_structural_measures(network_name, g, n_samples_null_model):
    """
    This function calculates the structural measures of a network and 
    its null model average values by fitting a UBCM null model to the network.
    """
    n_total = g.number_of_nodes()  # get total number of nodes
    # remove self-loops and multiedges and get largest connected component
    g = preprocess_graph(g)  ; g
    n_giant = g.number_of_nodes()  # get number of nodes in largest connected component
    degseq = sorted([d for n, d in g.degree()], reverse=True)  # get degree sequence
    dataset = ""
    network_name = network_name.split(".")[0]
    label = "large online"
    
    properties = {
            "idx": 1,
            "dataset": dataset,
            "name": network_name,
            #"graph": [g],  # get graph
            "n_nodes": n_giant,  # get number of nodes in largest connected component
            "frac_total": n_giant
            / n_total,  # get fraction of nodes in largest connected component
            "density": nx.density(g),  # get density
            "dbar": np.mean(degseq),  # get mean degree
            "dcv": np.std(degseq)
            / np.mean(degseq),  # get coefficient of variation of degree
            "dmax": np.max(degseq),  # get maximum degree
        }
    
    model = UBCM(g)  # initialize model
    model.fit()  # fit model
    model.validate()  # validate model
    # compare null model to actual graph using statistics function
    infer = Inference(g, model, statistics)
    data, null = infer.init_comparison(n=n_samples_null_model)
    original_network_values = dict(data)
    original_network_values = {'data_' + k: v for k, v in original_network_values.items()}
    null_model_mean_values = dict(null.mean(numeric_only=True)) 
    null_model_mean_values = {f'null_{n_samples_null_model}_{k}': v for k, v in null_model_mean_values.items()}
    return properties | null_model_mean_values |original_network_values | null_model_mean_values

In [26]:
OUTPUT_CSV_FILE_PATH = DATA_DIR_PATH / "structural_measures_large_online.csv"

In [27]:
def dataset_size_filter(dataset_path, size):
    return dataset_path.stat().st_size < size

In [28]:
file_filter = lambda file: file.is_file()
large_online_network_files = list(filter(file_filter, (DATA_DIR_PATH / "online" / "large").glob("**/*")))

filtered_dataset_paths = list(filter(lambda x: dataset_size_filter(x, 20000000), large_online_network_files))

print(len(filtered_dataset_paths))

results = []
for i, file in enumerate(filtered_dataset_paths): # loop over all online large social network files
    f = gml_cleaner(file) # clean gml file
    g = nx.read_gml(f, label="id") # load into networkx
    print("Running calculations for network: ", file.name) 
    # calculate measures from original network and null model
    result = calculate_structural_measures(file.name.split(".")[0] , g, n_samples_null_model=10) 
    df = pd.DataFrame(result).reset_index()
    df[df.columns[:-1]].to_csv(OUTPUT_CSV_FILE_PATH, mode='a', header=not OUTPUT_CSV_FILE_PATH.is_file()) # append result row to csv file
    results.append(result) # collect results


8
Running calculations for network:  NYC_restaurant_tips.gml
Function calculate_structural_measures with args (('NYC_restaurant_tips', <networkx.classes.multigraph.MultiGraph object at 0x7fe398e5fdc0>), {'n_samples_null_model': 10}) took: 2.0235 seconds.
Running calculations for network:  NYC_restaurant_checkin.gml
Function calculate_structural_measures with args (('NYC_restaurant_checkin', <networkx.classes.multigraph.MultiGraph object at 0x7fe3991bc670>), {'n_samples_null_model': 10}) took: 2.4052 seconds.
Running calculations for network:  gplus_112573107772208475213.gml
Function calculate_structural_measures with args (('gplus_112573107772208475213', <networkx.classes.multigraph.MultiGraph object at 0x7fe3994ca8f0>), {'n_samples_null_model': 10}) took: 35.8538 seconds.
Running calculations for network:  gplus_115455024457484679647.gml
Function calculate_structural_measures with args (('gplus_115455024457484679647', <networkx.classes.multigraph.MultiGraph object at 0x7fe398f23880>),

In [29]:
df = pd.concat([pd.DataFrame(result) for result in results]).reset_index() # combine all data to dataframe for analysis
df = df[df.columns[:-1]]

In [30]:
df

Unnamed: 0,_,idx,dataset,name,n_nodes,frac_total,density,dbar,dcv,dmax,...,null_10_sim_e,null_10_comp_g,null_10_comp,null_10_comp_e,data_sim_g,data_sim,data_sim_e,data_comp_g,data_comp,data_comp_e
0,0,1,,NYC_restaurant_tips,5372,0.838066,0.000614,3.295607,1.569503,196,...,0.003296,0.004668,0.001366,0.002549,0.0,0.0,0.0,0.014416,0.003092,0.006515
1,0,1,,NYC_restaurant_checkin,4906,0.993922,0.001118,5.485936,1.044863,88,...,0.00348,0.004561,0.002033,0.003211,0.0,0.0,0.0,0.01163,0.005808,0.008769
2,0,1,,gplus_112573107772208475213,3794,0.998684,0.008368,31.738535,1.780662,548,...,0.085803,0.051862,0.0208,0.044003,0.174692,0.0835,0.169532,0.060478,0.02333,0.050768
3,0,1,,gplus_115455024457484679647,3464,0.99255,0.018095,62.663395,1.815547,919,...,0.182821,0.049132,0.022735,0.047572,0.284835,0.112724,0.259228,0.039766,0.023916,0.045025
4,0,1,,facebook_combined,4039,1.0,0.01082,43.691013,1.199654,1045,...,0.044221,0.038205,0.020754,0.032344,0.519174,0.31262,0.513878,0.019899,0.014564,0.022281
5,0,1,,M2,3862,1.0,0.011713,45.222165,0.654056,328,...,0.020382,0.021857,0.016437,0.018802,0.225311,0.219436,0.232565,0.020923,0.017592,0.019454
6,0,1,,L2,5524,1.0,0.006176,34.112238,0.932536,417,...,0.016706,0.018934,0.011425,0.015094,0.222391,0.207345,0.236465,0.023781,0.017187,0.021375
7,0,1,,L1,5793,1.0,0.001833,10.617297,1.73198,320,...,0.020389,0.023445,0.007287,0.017605,0.264582,0.085124,0.244411,0.038836,0.017305,0.034153
