## We run the methods of the paper on additional social networks
### In this notebook, we run them on large online social networks

In [1]:
# Enable IPython's autoreload extension so that edits to imported files are
# picked up automatically, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from lib import *
from pathcensus import PathCensus
from pathcensus.nullmodels import UBCM
from pathcensus.inference import Inference
from pathcensus.utils import set_seed
from mandarina.benchmark import timer
import networkx as nx


In [3]:
@timer
def calculate_structural_measures(network_name, g, n_samples_null_model):
    """
    Calculate descriptive and structural measures of a network together with
    the mean values of the same statistics over samples drawn from a UBCM
    null model fitted to the network.

    Parameters
    ----------
    network_name : str
        Name of the network. Everything from the first "." onwards is
        discarded (e.g. a file extension).
    g : networkx graph
        The raw network. It is passed through ``preprocess_graph`` before any
        measure is computed.
    n_samples_null_model : int
        Number of null-model samples requested from
        ``Inference.init_comparison``; it is also embedded in the names of the
        null-model entries of the result.

    Returns
    -------
    dict
        The descriptive properties, followed by the ``null_<n>_<statistic>``
        entries (mean over the null-model samples), followed by the
        ``data_<statistic>`` entries (values for the observed network).
    """
    n_total = g.number_of_nodes()  # number of nodes before preprocessing
    # remove self-loops and multiedges and get largest connected component
    g = preprocess_graph(g)
    n_giant = g.number_of_nodes()  # number of nodes after preprocessing
    degseq = sorted((d for _, d in g.degree()), reverse=True)  # degree sequence
    mean_degree = np.mean(degseq)  # reused for both "dbar" and "dcv"

    properties = {
        "idx": 1,
        "dataset": "",
        "name": network_name.split(".")[0],
        "n_nodes": n_giant,
        "frac_total": n_giant / n_total,  # fraction of nodes kept by preprocessing
        "density": nx.density(g),
        "dbar": mean_degree,  # mean degree
        "dcv": np.std(degseq) / mean_degree,  # coefficient of variation of degree
        "dmax": np.max(degseq),  # maximum degree
    }

    model = UBCM(g)  # initialize model
    model.fit()  # fit model
    model.validate()  # validate model
    # compare null model to actual graph using statistics function
    infer = Inference(g, model, statistics)
    data, null = infer.init_comparison(n=n_samples_null_model)

    # prefix the keys so observed and null-model values cannot collide
    original_network_values = {f"data_{k}": v for k, v in dict(data).items()}
    null_model_mean_values = {
        f"null_{n_samples_null_model}_{k}": v
        for k, v in dict(null.mean(numeric_only=True)).items()
    }

    # Key order (properties, null, data) is kept as before; the null-model
    # values only need to be merged once.
    return properties | null_model_mean_values | original_network_values


In [4]:
# Location of a CSV file for the structural measures, inside RESULT_DIR_PATH.
# NOTE(review): no executed code in the visible cells uses this constant, and
# the final export writes to a differently named file -- confirm whether it is
# still needed.
OUTPUT_CSV_FILE_PATH = RESULT_DIR_PATH / "structural_measures_large_online_no_null.csv"


In [5]:
def dataset_size_filter(dataset_path, size):
    """Return True when the file at ``dataset_path`` is strictly smaller than ``size`` bytes."""
    size_on_disk = dataset_path.stat().st_size
    return size_on_disk < size


In [9]:
# Only network files smaller than this many bytes are processed.
MAX_DATASET_SIZE_BYTES = 80_000_000


def file_filter(file):
    """Return True if ``file`` is a regular file (skips directories matched by the glob)."""
    return file.is_file()


# Every regular file below <DATA_DIR_PATH>/online/large, at any depth.
# NOTE: glob does not guarantee any particular ordering of its results; wrap
# the glob in sorted() if a deterministic row order is required.
large_online_network_files = [
    path
    for path in (DATA_DIR_PATH / "online" / "large").glob("**/*")
    if file_filter(path)
]

filtered_dataset_paths = [
    path
    for path in large_online_network_files
    if dataset_size_filter(path, MAX_DATASET_SIZE_BYTES)
]

print(len(filtered_dataset_paths))

results = []
for file in filtered_dataset_paths:  # loop over all online large social network files
    f = gml_cleaner(file)  # clean gml file
    g = nx.read_gml(f, label="id")  # load into networkx
    print("Running calculations for network: ", file.name)
    # calculate measures from original network and null model
    result = calculate_structural_measures(
        file.name.split(".")[0], g, n_samples_null_model=1
    )
    results.append(result)  # collect results


38
Running calculations for network:  NYC_restaurant_tips.gml
Function calculate_structural_measures with args (('NYC_restaurant_tips', <networkx.classes.multigraph.MultiGraph object at 0x7f0d32bb7a00>), {'n_samples_null_model': 1}) took: 3.4436 seconds.
Running calculations for network:  NYC_restaurant_checkin.gml
Function calculate_structural_measures with args (('NYC_restaurant_checkin', <networkx.classes.multigraph.MultiGraph object at 0x7f0d32d108e0>), {'n_samples_null_model': 1}) took: 0.5212 seconds.


In [None]:
# Build one frame per network and stack them into a single table for analysis.
per_network_frames = list(map(pd.DataFrame, results))
df = pd.concat(per_network_frames).reset_index()
# Keep every column except the last one.
# NOTE(review): this discards whatever column happens to come last -- confirm
# that dropping it is still intended.
kept_columns = df.columns[:-1]
df = df[kept_columns]


In [None]:
# Display the combined results table.
df


Unnamed: 0,_,idx,dataset,name,n_nodes,frac_total,density,dbar,dcv,dmax,...,null_10_sim_e,null_10_comp_g,null_10_comp,null_10_comp_e,data_sim_g,data_sim,data_sim_e,data_comp_g,data_comp,data_comp_e
0,0,1,,NYC_restaurant_tips,5372,0.838066,0.000614,3.295607,1.569503,196,...,0.003298,0.004624,0.001359,0.002543,0.0,0.0,0.0,0.014416,0.003092,0.006515
1,0,1,,NYC_restaurant_checkin,4906,0.993922,0.001118,5.485936,1.044863,88,...,0.003444,0.004544,0.002013,0.003177,0.0,0.0,0.0,0.01163,0.005808,0.008769
2,0,1,,douban,154908,1.0,2.7e-05,4.223952,2.781348,287,...,0.001298,0.002043,0.000267,0.00127,0.010373,0.001901,0.00775,0.006644,0.000755,0.003354
3,0,1,,old,105091,0.919238,6.5e-05,6.811639,2.068222,692,...,0.000882,0.001637,0.000334,0.000828,0.115086,0.07343,0.123549,0.030024,0.007786,0.018127
4,0,1,,gplus_106382433884876652170,4903,1.0,0.06019,295.052417,1.053766,2476,...,0.202768,0.08068,0.054151,0.077262,0.239926,0.145436,0.219069,0.110713,0.072958,0.106142
5,0,1,,gplus_106228758905254036967,4839,0.99938,0.022479,108.752222,1.153716,1233,...,0.090982,0.065138,0.036881,0.057237,0.301261,0.213295,0.310858,0.060981,0.030823,0.049776
6,0,1,,gplus_116931379084245069738,4829,0.999586,0.038502,185.889832,1.072778,1737,...,0.137314,0.079559,0.048874,0.072868,0.217978,0.141611,0.202476,0.094009,0.055424,0.08447
7,0,1,,gplus_112573107772208475213,3794,0.998684,0.008368,31.738535,1.780662,548,...,0.085765,0.052001,0.020899,0.044149,0.174692,0.0835,0.169532,0.060478,0.02333,0.050768
8,0,1,,gplus_115455024457484679647,3464,0.99255,0.018095,62.663395,1.815547,919,...,0.182763,0.049082,0.022741,0.047532,0.284835,0.112724,0.259228,0.039766,0.023916,0.045025
9,0,1,,gplus_107040353898400532534,4369,1.0,0.071511,312.361639,0.948654,2185,...,0.209179,0.091272,0.063005,0.087199,0.25756,0.169155,0.235457,0.121351,0.081376,0.116609


In [None]:
# Write the combined table to disk; the row index is not written to the file.
results_csv_path = RESULT_DIR_PATH / "structural_measures_large_online.csv"
df.to_csv(results_csv_path, index=False)
