## We run the methods of the paper on additional social networks
### In this notebook, we run them on large offline social networks

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so that
# imported modules are reloaded automatically before each cell runs and
# edits to local files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from lib import *
from pathcensus import PathCensus
from pathcensus.nullmodels import UBCM
from pathcensus.inference import Inference
from pathcensus.utils import set_seed
from mandarina.benchmark import timer
import networkx as nx


In [3]:
@timer
def calculate_structural_measures(network_name, g, n_samples_null_model):
    """
    Compute descriptive properties of a network together with its structural
    statistics and the mean of the same statistics under a fitted UBCM null model.

    Parameters
    ----------
    network_name : str
        Name of the network (typically a file name); everything from the
        first "." onwards is discarded.
    g : networkx graph
        The raw graph. It is passed through ``preprocess_graph`` before any
        measure is computed (per the original comment, that helper removes
        self-loops and multi-edges and keeps the largest connected component).
    n_samples_null_model : int
        Number of null-model samples requested from
        ``Inference.init_comparison``; also embedded in the ``null_<n>_``
        key prefix of the returned null-model values.

    Returns
    -------
    dict
        Descriptive properties (``idx``, ``dataset``, ``name``, ``n_nodes``,
        ``frac_total``, ``density``, ``dbar``, ``dcv``, ``dmax``), followed by
        the null-model means under ``null_<n>_<statistic>`` keys, followed by
        the observed statistics under ``data_<statistic>`` keys.
    """
    n_total = g.number_of_nodes()  # node count before preprocessing
    g = preprocess_graph(g)
    n_giant = g.number_of_nodes()  # node count after preprocessing

    # Degree sequence in descending order.
    degseq = sorted((degree for _, degree in g.degree()), reverse=True)
    mean_degree = np.mean(degseq)

    properties = {
        "idx": 1,
        "dataset": "",
        "name": network_name.split(".")[0],
        # "graph": [g],  # get graph
        "n_nodes": n_giant,
        "frac_total": n_giant / n_total,  # share of nodes kept by preprocessing
        "density": nx.density(g),
        "dbar": mean_degree,  # mean degree
        "dcv": np.std(degseq) / mean_degree,  # coefficient of variation of degree
        "dmax": np.max(degseq),  # maximum degree
    }

    # Fit and validate the UBCM null model for the preprocessed graph.
    model = UBCM(g)
    model.fit()
    model.validate()

    # Compare the observed graph with null-model samples via `statistics`.
    infer = Inference(g, model, statistics)
    data, null = infer.init_comparison(n=n_samples_null_model)

    original_network_values = {f"data_{k}": v for k, v in dict(data).items()}
    null_model_mean_values = {
        f"null_{n_samples_null_model}_{k}": v
        for k, v in dict(null.mean(numeric_only=True)).items()
    }

    # Key order (properties, null-model means, observed values) is kept on
    # purpose: the cells consuming this dict drop the last column by position.
    return properties | null_model_mean_values | original_network_values

In [4]:
# Destination CSV for the per-network results; the loop cell below appends
# one chunk per processed network and writes the header only when the file
# does not exist yet.
# NOTE(review): DATA_DIR_PATH is not defined in this notebook; presumably it
# comes from `from lib import *` and is a pathlib.Path. Confirm.
OUTPUT_CSV_FILE_PATH = DATA_DIR_PATH / "structural_measures_large_offline.csv"

In [5]:
# Networks whose file exceeds this size (in bytes) are skipped.
MAX_FILE_SIZE_BYTES = 200_000_000


def file_filter(file):
    """Return True when `file` is a regular file (directories are skipped)."""
    return file.is_file()


# Recursively collect every regular file below <DATA_DIR_PATH>/offline/large.
large_offline_network_files = list(
    filter(file_filter, (DATA_DIR_PATH / "offline" / "large").glob("**/*"))
)

results = []
for file in large_offline_network_files:  # loop over all offline large social network files
    file_size = file.stat().st_size
    print(file_size)
    if file_size > MAX_FILE_SIZE_BYTES:
        continue
    try:
        f = gml_cleaner(file)  # clean gml file
        g = nx.read_gml(f, label="id")  # load into networkx
        print("Running calculations for network: ", file.name)
        # calculate measures from original network and null model
        result = calculate_structural_measures(
            file.name.split(".")[0], g, n_samples_null_model=100
        )
        df = pd.DataFrame(result).reset_index()
        # Append the result to the CSV file (last column dropped); the header
        # is written only when the file does not exist yet.
        df[df.columns[:-1]].to_csv(
            OUTPUT_CSV_FILE_PATH,
            mode='a',
            header=not OUTPUT_CSV_FILE_PATH.is_file(),
        )
        results.append(result)  # collect results
    except Exception as exc:
        # Best-effort batch run: a failing network must not abort the others,
        # but the failure is reported instead of being silently discarded.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        print(f"Skipping network {file.name}: {type(exc).__name__}: {exc}")
        continue


2673414
Running calculations for network:  contact.gml
Function calculate_structural_measures with args (('contact', <networkx.classes.multigraph.MultiGraph object at 0x0000020CB6F63460>), {'n_samples_null_model': 100}) took: 21.5201 seconds.
215472294
480826
Running calculations for network:  fb_friends.gml
Function calculate_structural_measures with args (('fb_friends', <networkx.classes.multigraph.MultiGraph object at 0x0000020CB6F61CF0>), {'n_samples_null_model': 100}) took: 10.5513 seconds.
229225
Running calculations for network:  crime.gml
Function calculate_structural_measures with args (('crime', <networkx.classes.multigraph.MultiGraph object at 0x0000020CB9A16B30>), {'n_samples_null_model': 100}) took: 3.0528 seconds.
105477054
Running calculations for network:  InVS15.gml
Function calculate_structural_measures with args (('InVS15', <networkx.classes.multigraph.MultiGraph object at 0x0000020CB98184C0>), {'n_samples_null_model': 100}) took: 68.7959 seconds.
543498074
116183934

In [6]:
# Build one frame per collected result, stack them, and turn the index into
# a regular column; the last column of the stacked frame is then discarded.
result_frames = [pd.DataFrame(result) for result in results]
combined = pd.concat(result_frames).reset_index()
df = combined.drop(columns=combined.columns[-1])

In [8]:
# Write the combined results table to 'largeoffline.csv'. The path is
# relative, so the file is created in the current working directory.
df.to_csv('largeoffline.csv')