## We run the methods of the paper on additional social networks
### In this notebook, we run it on smallonline social networks

In [1]:
# reload imported files automatically without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
from lib import *
from pathcensus import PathCensus
from pathcensus.nullmodels import UBCM
from pathcensus.inference import Inference
from pathcensus.utils import set_seed
from mandarina.benchmark import timer
import networkx as nx


In [3]:
@timer
def calculate_structural_measures(network_name, g, n_samples_null_model):
    """
    This function calculates the structural measures of a network and 
    its null model average values by fitting a UBCM null model to the network.
    """
    n_total = g.number_of_nodes()  # get total number of nodes
    # remove self-loops and multiedges and get largest connected component
    g = preprocess_graph(g)  ; g
    n_giant = g.number_of_nodes()  # get number of nodes in largest connected component
    degseq = sorted([d for n, d in g.degree()], reverse=True)  # get degree sequence
    dataset = ""
    network_name = network_name.split(".")[0]
    label = "small online"
    
    properties = {
            "idx": 1,
            "dataset": dataset,
            "name": network_name,
            #"graph": [g],  # get graph
            "n_nodes": n_giant,  # get number of nodes in largest connected component
            "frac_total": n_giant
            / n_total,  # get fraction of nodes in largest connected component
            "density": nx.density(g),  # get density
            "dbar": np.mean(degseq),  # get mean degree
            "dcv": np.std(degseq)
            / np.mean(degseq),  # get coefficient of variation of degree
            "dmax": np.max(degseq),  # get maximum degree
        }
    
    model = UBCM(g)  # initialize model
    model.fit()  # fit model
    model.validate()  # validate model
    # compare null model to actual graph using statistics function
    infer = Inference(g, model, statistics)
    data, null = infer.init_comparison(n=n_samples_null_model)
    original_network_values = dict(data)
    original_network_values = {'data_' + k: v for k, v in original_network_values.items()}
    null_model_mean_values = dict(null.mean(numeric_only=True)) 
    null_model_mean_values = {f'null_{n_samples_null_model}_{k}': v for k, v in null_model_mean_values.items()}
    return properties | null_model_mean_values |original_network_values | null_model_mean_values

In [4]:
OUTPUT_CSV_FILE_PATH = DATA_DIR_PATH / "structural_measures_small_online.csv"

In [5]:
file_filter = lambda file: file.is_file()
small_online_network_files = list(filter(file_filter, (DATA_DIR_PATH / "online" / "small").glob("**/*")))

results = []
for i, file in enumerate(small_online_network_files): # loop over all online small social network files
    file_size = file.stat().st_size
    print(file_size)
    if file_size > 200000000: 
        continue
    f = gml_cleaner(file) # clean gml file
    g = nx.read_gml(f, label="id") # load into networkx
    print("Running calculations for network: ", file.name) 
    # calculate measures from original network and null model
    result = calculate_structural_measures(file.name.split(".")[0] , g, n_samples_null_model=100) 
    df = pd.DataFrame(result).reset_index()
    df[df.columns[:-1]].to_csv(OUTPUT_CSV_FILE_PATH, mode='a', header=not OUTPUT_CSV_FILE_PATH.is_file()) # append result row to csv file
    results.append(result) # collect results


57449
Running calculations for network:  S2.gml
Function calculate_structural_measures with args (('S2', <networkx.classes.multigraph.MultiGraph object at 0x176796a10>), {'n_samples_null_model': 100}) took: 10.1784 seconds.
171298
Running calculations for network:  S1.gml
Function calculate_structural_measures with args (('S1', <networkx.classes.multigraph.MultiGraph object at 0x1778c1300>), {'n_samples_null_model': 100}) took: 3.0230 seconds.
263906
Running calculations for network:  facebook_friends.gml
Function calculate_structural_measures with args (('facebook_friends', <networkx.classes.multigraph.MultiGraph object at 0x10d758c40>), {'n_samples_null_model': 100}) took: 2.4783 seconds.
280129
Running calculations for network:  gplus_114054672576929802335.gml
Function calculate_structural_measures with args (('gplus_114054672576929802335', <networkx.classes.multigraph.MultiGraph object at 0x178679600>), {'n_samples_null_model': 100}) took: 5.5956 seconds.
10016
Running calculations

Function calculate_structural_measures with args (('gplus_104076158580173410325', <networkx.classes.multigraph.MultiGraph object at 0x177f857e0>), {'n_samples_null_model': 100}) took: 0.8563 seconds.
586754
Running calculations for network:  gplus_114104634069486127920.gml
Function calculate_structural_measures with args (('gplus_114104634069486127920', <networkx.classes.multigraph.MultiGraph object at 0x177f872b0>), {'n_samples_null_model': 100}) took: 22.9128 seconds.
427745
Running calculations for network:  gplus_103503116383846951534.gml
Function calculate_structural_measures with args (('gplus_103503116383846951534', <networkx.classes.multigraph.MultiGraph object at 0x177a09c90>), {'n_samples_null_model': 100}) took: 10.0112 seconds.
243113
Running calculations for network:  gplus_100637660947564674695.gml
Function calculate_structural_measures with args (('gplus_100637660947564674695', <networkx.classes.multigraph.MultiGraph object at 0x178022b00>), {'n_samples_null_model': 100}

In [6]:
df = pd.concat([pd.DataFrame(result) for result in results]).reset_index() # combine all data to dataframe for analysis
df = df[df.columns[:-1]]

In [7]:
df

Unnamed: 0,_,idx,dataset,name,n_nodes,frac_total,density,dbar,dcv,dmax,...,null_100_sim_e,null_100_comp_g,null_100_comp,null_100_comp_e,data_sim_g,data_sim,data_sim_e,data_comp_g,data_comp,data_comp_e
0,0,1,,S2,165,1.0,0.053659,8.8,0.946063,63,...,0.138034,0.072985,0.051306,0.068495,0.32976,0.258733,0.340403,0.03032,0.019471,0.027094
1,0,1,,S1,320,1.0,0.046415,14.80625,0.963127,113,...,0.120596,0.06797,0.047345,0.062566,0.287705,0.255111,0.297862,0.032432,0.022242,0.030423
2,0,1,,facebook_friends,329,0.90884,0.036215,11.878419,0.921316,63,...,0.097202,0.071421,0.045012,0.063086,0.512061,0.37995,0.511406,0.01533,0.01499,0.016834
3,0,1,,gplus_114054672576929802335,175,1.0,0.27179,47.291429,0.802294,155,...,0.501366,0.014072,0.013256,0.0155,0.535608,0.427603,0.524281,0.015207,0.014899,0.016733
4,0,1,,gplus_116899029375914044550,22,1.0,0.467532,9.818182,0.578315,20,...,0.613392,0.00121,0.001252,0.001291,0.61974,0.520816,0.592742,0.01023,0.010867,0.011018
5,0,1,,facebook_3980,44,0.846154,0.145877,6.272727,0.662875,18,...,0.248437,0.099523,0.081906,0.09674,0.444043,0.33617,0.42582,0.040281,0.041605,0.044395
6,0,1,,gplus_103251633033550231172,371,0.994638,0.08366,30.954178,0.954413,180,...,0.234678,0.081663,0.059728,0.079467,0.3231,0.200657,0.289175,0.064176,0.048475,0.064516
7,0,1,,facebook_348,224,1.0,0.127803,28.5,0.784823,99,...,0.279001,0.092527,0.073805,0.090695,0.490279,0.400938,0.472253,0.042528,0.029915,0.040134
8,0,1,,gplus_117412175333096244275,222,0.880952,0.035547,7.855856,1.499441,93,...,0.144642,0.033096,0.023551,0.033829,0.133863,0.123422,0.175297,0.101179,0.080255,0.092483
9,0,1,,gplus_110241952466097562819,83,1.0,0.197179,16.168675,0.957174,68,...,0.436263,0.012588,0.011254,0.01355,0.446764,0.320262,0.448415,0.026166,0.026775,0.028723


In [8]:
df.to_csv('result_small_online.csv')