In [None]:
from collections import defaultdict
from pathlib import Path
import networkx as nx
from tqdm.notebook import tqdm, trange
import pickle

# Marker shapes and hex colours available to the plotting cells.
markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X']
colors = [
    '#377eb8',
    '#e41a1c',
    '#4daf4a',
    '#984ea3',
    '#ff7f00',
    '#ffff33',
    '#a65628',
    '#f781bf',
    '#999999',
]

# Full dataset identifiers. They key every per-graph table below and are used
# as file-name stems by the data helpers.
graph_names = [
    # social
    'OF',
    'openflights',
    # hypergraphs
    'coauth-DBLP-proj-graph',
    'coauth-MAG-Geology-proj-graph',
    'threads-ask-ubuntu-proj-graph',
    'threads-math-sx-proj-graph',
    'threads-stack-overflow-proj-graph',
    # temporal
    'sx-askubuntu',
    'sx-mathoverflow',
    'sx-stackoverflow',
    'sx-superuser',
]

# Abbreviated labels used in figure titles; listed in the same order as
# graph_names so the two lists can be zipped together.
graph_names_short = [
    # social
    'OF',
    'FL',
    # hypergraphs
    'co-DB',
    'co-GE',
    'th-UB',
    'th-MA',
    'th-SO',
    # temporal
    'sx-UB',
    'sx-MA',
    'sx-SO',
    'sx-SU',
]

# Full dataset name -> abbreviated label.
name2nameShort = dict(zip(graph_names, graph_names_short))

# One integer code per graph.
# NOTE(review): the meaning of the codes 1/2/3 is not visible in this section
# (presumably selects a fitting variant) -- confirm against the code that reads it.
g2fitting = {
    # social
    'OF': 1,
    'openflights': 2,
    # hypergraphs
    'coauth-DBLP-proj-graph': 3,
    'coauth-MAG-Geology-proj-graph': 3,
    'threads-ask-ubuntu-proj-graph': 1,
    'threads-math-sx-proj-graph': 1,
    'threads-stack-overflow-proj-graph': 1,
    # temporal
    'sx-askubuntu': 2,
    'sx-mathoverflow': 2,
    'sx-stackoverflow': 2,
    'sx-superuser': 2,
}

# A pair of sizes per graph; the second entry is the sort key for
# graphs_sorted_m.
# NOTE(review): presumably (number of nodes, number of edges) -- confirm.
g2nm = {
    'OF': (897, 71380),
    'openflights': (2905, 15645),

    'coauth-DBLP-proj-graph': (1654109, 7713116),
    'coauth-MAG-Geology-proj-graph': (898648, 4891112),
    'threads-ask-ubuntu-proj-graph': (82075, 182648),
    'threads-math-sx-proj-graph': (152702, 1088735),
    'threads-stack-overflow-proj-graph': (2301070, 20989078),

    'sx-askubuntu': (152599, 453221),
    'sx-mathoverflow': (24668, 187939),
    'sx-stackoverflow': (2572345, 28177464),
    'sx-superuser': (189191, 712870),
}

# One integer per graph. Each value equals the first entry of the first pair
# stored for the same graph in gt_c_star_more.
gt_c_star_wt1 = {
    'OF': 241,
    'openflights': 64,

    'coauth-DBLP-proj-graph': 83,
    'coauth-MAG-Geology-proj-graph': 74,
    'threads-ask-ubuntu-proj-graph': 73,
    'threads-math-sx-proj-graph': 372,
    'threads-stack-overflow-proj-graph': 685,

    'sx-askubuntu': 152,
    'sx-mathoverflow': 185,
    'sx-stackoverflow': 886,
    'sx-superuser': 202,
}

# Five pairs per graph, looked up as [i_layer - 1] for layers 1..5. The
# strong-edge-fraction cell reads the second entry of a pair as the number of
# leading points to include in its linear fit.
# NOTE(review): (None, None) presumably marks layers for which no value was
# determined -- confirm.
gt_c_star_more = {
    'OF': [(241, 271), (190, 192), (157, 156), (134, 137), (119, 101)],
    'openflights': [(64, 66), (31, 31), (17, 17), (None, None), (None, None)],
    'coauth-DBLP-proj-graph': [(83, 88), (36, 29), (22, 24), (20, 21), (16, 16)],
    'coauth-MAG-Geology-proj-graph': [(74, 92), (52, 49), (34, 40), (28, 30), (24, 21)],
    'threads-ask-ubuntu-proj-graph': [(73, 87), (30, 31), (19, 20), (18, 11), (15, 11)],
    'threads-math-sx-proj-graph': [(372, 401), (145, 153), (114, 114), (84, 67), (63, 59)],
    'threads-stack-overflow-proj-graph': [(685, 750), (208, 205), (134, 129), (97, 82), (74, 72)],
    'sx-askubuntu': [(152, 149), (63, 69), (48, 42), (36, 27), (31, 22)],
    'sx-mathoverflow': [(185, 181), (113, 102), (75, 63), (60, 49), (51, 41)],
    'sx-stackoverflow': [(886, 749), (407, 324), (221, 203), (169, 130), (120, 103)],
    'sx-superuser': [(202, 206), (96, 93), (63, 54), (48, 37), (36, 27)],
}

# Directory holding the pickled intermediate data; created if missing.
p_data = Path('data')
p_data.mkdir(exist_ok=True)

# Directory receiving the generated figures; created if missing.
p_results = Path('results')
p_results.mkdir(exist_ok=True)

# Type aliases: an edge as a pair of ints, a weighted edge as a triple of ints.
Edge = tuple[int, int]
EdgeAndWeight = tuple[int, int, int]
# Dataset names ordered by ascending second entry of their g2nm pair.
graphs_sorted_m = sorted(graph_names, key=lambda name: g2nm[name][1])


def iter_edges(input_graph, with_weight=False, desc='edges'):
    """Return a tqdm progress iterator over the edges of ``input_graph``.

    :param input_graph: graph exposing ``edges`` and ``number_of_edges()``.
    :param with_weight: when true, iterate ``edges.data('weight', default=1)``
        so each item also carries the edge weight (1 when the attribute is
        absent); otherwise iterate ``edges`` directly.
    :param desc: label shown on the progress bar.
    :return: the tqdm wrapper; the bar is not left on screen once finished.
    """
    if with_weight:
        edge_iterable = input_graph.edges.data('weight', default=1)
    else:
        edge_iterable = input_graph.edges
    return tqdm(edge_iterable, desc=desc, total=input_graph.number_of_edges(), leave=False)


def iter_nodes(input_graph, desc='nodes'):
    """Return a tqdm progress iterator over the nodes of ``input_graph``.

    :param input_graph: graph exposing ``nodes`` and ``number_of_nodes()``.
    :param desc: label shown on the progress bar.
    :return: the tqdm wrapper; the bar is not left on screen once finished.
    """
    node_count = input_graph.number_of_nodes()
    return tqdm(input_graph.nodes, desc=desc, total=node_count, leave=False)


def data_exist(ds, data_name_, layer_index=None):
    """Tell whether the data file for a dataset / data-kind pair is on disk.

    :param ds: dataset name, used as the file-name stem.
    :param data_name_: data kind; names both the sub-directory of ``p_data``
        and the file extension.
    :param layer_index: optional layer number; when given, ``_layer<index>``
        is appended to the file name.
    :return: ``True`` if the path exists and is a regular file.
    """
    file_name = f'{ds}.{data_name_}'
    if layer_index is not None:
        file_name += f'_layer{layer_index}'
    return (p_data / data_name_ / file_name).is_file()


def data_file_path(ds, data_name_, layer_index=None, write=False):
    """Build the path of a data file together with the binary open mode.

    :param ds: dataset name, used as the file-name stem.
    :param data_name_: data kind; names both the sub-directory of ``p_data``
        and the file extension.
    :param layer_index: optional layer number; when given, ``_layer<index>``
        is appended to the file name.
    :param write: when true, the sub-directory is created if missing and the
        returned mode is ``'wb'``; otherwise the mode is ``'rb'``.
    :return: ``(path, mode)`` suitable for unpacking into ``open``.
    """
    folder = p_data / data_name_
    if write:
        # Only writers create the directory; readers never touch the disk here.
        folder.mkdir(exist_ok=True)
    mode = 'wb' if write else 'rb'
    layer_suffix = '' if layer_index is None else f'_layer{layer_index}'
    return folder / f'{ds}.{data_name_}{layer_suffix}', mode


def save_data(data, ds, data_name_, layer_index=None):
    """Pickle ``data`` to the file designated by ``data_file_path``.

    :param data: any picklable object.
    :param ds: dataset name.
    :param data_name_: data kind (sub-directory and extension).
    :param layer_index: optional layer number forwarded to ``data_file_path``.
    """
    target, mode = data_file_path(ds, data_name_, layer_index=layer_index, write=True)
    with open(target, mode) as sink:
        pickle.dump(data, sink)


def load_data(ds, data_name_, layer_index=None):
    """Unpickle and return the object stored for a dataset / data-kind pair.

    :param ds: dataset name.
    :param data_name_: data kind (sub-directory and extension).
    :param layer_index: optional layer number forwarded to ``data_file_path``.
    :return: the unpickled object.
    :raises FileNotFoundError: if the file does not exist (raised by ``open``).
    """
    source, mode = data_file_path(ds, data_name_, layer_index=layer_index, write=False)
    # pickle executes code embedded in the file: only load files you produced.
    with open(source, mode) as stream:
        payload = pickle.load(stream)
    return payload


def min_max_tuple(xx, yy):
    """Return the two arguments as a ``(smaller, larger)`` pair.

    :param xx: first value.
    :param yy: second value; must be comparable with ``xx``.
    :return: ``(min(xx, yy), max(xx, yy))``.
    """
    smaller = min(xx, yy)
    larger = max(xx, yy)
    return smaller, larger


def reorder_nodes(input_graph):
    """Relabel the nodes of ``input_graph`` with integers.

    :param input_graph: a networkx graph.
    :return: the graph produced by ``nx.convert_node_labels_to_integers``
        called with its default arguments.
    """
    relabelled = nx.convert_node_labels_to_integers(input_graph)
    return relabelled


def take_gcc(input_graph):
    """Return the subgraph induced by the largest connected component.

    :param input_graph: a networkx graph accepted by
        ``nx.connected_components``.
    :return: ``input_graph.subgraph(...)`` over the component with the most
        nodes (the first such component on ties).
    """
    components = nx.connected_components(input_graph)
    largest_component = max(components, key=len)
    return input_graph.subgraph(largest_component)

In [None]:
from scipy.stats import pearsonr

# Why the number of common neighbors?
#
# For every graph, each cached per-edge metric list is correlated (Pearson's r)
# with the binary indicator "edge weight > 1". Metrics that cannot be loaded
# or correlated are reported and skipped so the remaining ones still run.
# NOTE(review): the two-letter codes presumably name pairwise similarity
# indices (CN = common neighbors per the heading above) -- confirm.
metrics = [
    'CN',
    'SA',
    'JC',
    'HP',
    'HD',
    'SI',
    'LI',
    'AA',
    'RA',
    'PA',
    'FM',
    'DL',
    'EC',
    'LP',
]
for graph_name in graphs_sorted_m:
    print(graph_name)
    graph_name_short = name2nameShort[graph_name]
    weights = load_data(graph_name, 'weights')
    # 1 when the edge weight exceeds 1, else 0.
    y = repeat_list = [int(w > 1) for w in weights]
    m2pearsonResults = dict()
    for metric in metrics:
        try:
            x = metric_list = load_data(graph_name, f'{metric}_list')
            r, p = pearsonr(x, y)
        except Exception as err:
            # Best effort: a missing or unusable metric file must not stop the
            # sweep, but the reason is shown instead of being silently dropped.
            # Catching Exception (not a bare except) keeps kernel interrupts working.
            print(graph_name_short, metric, f'skipped ({type(err).__name__}: {err})')
            continue
        m2pearsonResults[metric] = (r, p)
        print(graph_name_short, metric, f'Pearson\'s r = {r}')

In [None]:
import pickle
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

# observation 1: adjacency and strongness
#
# For every graph and for layers 1-5, edges are grouped by their number of
# common neighbours (CN). Per CN value the cell plots the share of node pairs
# that are adjacent and the share of edges that are "strong" (weight greater
# than the layer index), and puts the Pearson correlation between the two
# series in the panel title. The per-CN weight lists are pickled through
# save_data as a side effect.

p_results_obs = p_results / 'obs1-adjacency_and_strongness'
p_results_obs.mkdir(exist_ok=True)

plt.rcParams.update({'font.size': 30})

xLabel = '# common neighbors'
yLabel = '% of adj. pairs\nor strong edges'

g2ax = dict()
for graph_name in graphs_sorted_m:
    graph_name_short = name2nameShort[graph_name]
    fig, axes = plt.subplots(1, 5, figsize=(28, 4), constrained_layout=True)
    for i_layer in range(1, 6):
        panel = axes[i_layer - 1]
        if i_layer > 1:
            cn2p = load_data(graph_name, 'numberOfCN2numberOfPairs_layers', layer_index=i_layer)
            G_i = load_data(graph_name, 'layers', layer_index=i_layer)
            # Neighbour sets are built once so each edge costs one set intersection.
            v2Nv_i = {v: set(G_i[v]) for v in iter_nodes(G_i)}
            cn_weight_pairs = (
                (len(v2Nv_i[u] & v2Nv_i[v]), w)
                for u, v, w in iter_edges(G_i, with_weight=True)
            )
            cache_name, cache_layer = 'numberOfCN2weightList_layers', i_layer
        else:  # i = 1; the original graph
            cn2p = load_data(graph_name, 'numberOfCN2numberOfPairs')
            number_of_CNs_list = load_data(graph_name, 'number_of_CNs_list')
            weights = load_data(graph_name, 'weights')
            cn_weight_pairs = zip(number_of_CNs_list, weights)
            cache_name, cache_layer = 'numberOfCN2weightList', None

        cn2W = defaultdict(list)  # CN -> weights of the edges with that CN
        cn2m = defaultdict(int)   # CN -> number of edges
        cn2M = defaultdict(int)   # CN -> number of edges with weight > layer index
        for cn, w in cn_weight_pairs:
            cn2W[cn].append(w)
            cn2m[cn] += 1
            if w > i_layer:
                cn2M[cn] += 1
        save_data(dict(cn2W), graph_name, cache_name, layer_index=cache_layer)

        numOfCN_pair_list = sorted(cn2p)
        # Keep only CN values that have at least one edge, so m_array is never 0.
        numOfCN_edge_list = [c for c in numOfCN_pair_list if cn2m[c] >= 1]
        p_array = np.array([cn2p[cn] for cn in numOfCN_edge_list])
        m_array = np.array([cn2m[cn] for cn in numOfCN_edge_list])
        M_array = np.array([cn2M[cn] for cn in numOfCN_edge_list])

        p_con_array = m_array / p_array  # edges / pairs
        p_str_array = M_array / m_array  # strong edges / edges

        X = numOfCN_edge_list
        Y_con = p_con_array
        Y_str = p_str_array
        # Index instead of unpacking into `_`, which IPython uses for the last output.
        r = pearsonr(Y_con, Y_str)[0]

        panel.scatter(X, Y_con * 100, label='adjacent', alpha=0.5)
        panel.scatter(X, Y_str * 100, label='strong', alpha=0.5)
        lgd_wt = panel.legend(loc='lower right',
                              markerscale=2,
                              borderpad=0.05,
                              labelspacing=0.2,
                              handlelength=0.5,
                              handletextpad=0.2,
                              borderaxespad=0.2)
        # `legendHandles` was renamed to `legend_handles` in matplotlib 3.7;
        # support both spellings.
        legend_handles = getattr(lgd_wt, 'legend_handles', None)
        if legend_handles is None:
            legend_handles = lgd_wt.legendHandles
        for lh in legend_handles:
            lh.set_alpha(1)  # legend markers fully opaque although the points are not

        panel.set_xlabel(xLabel)
        panel.set_ylabel(yLabel)
        panel.set_yticks([0, 50, 100])
        max_x_tick = ((max(numOfCN_pair_list) * 0.95) // 10) * 10
        panel.set_xticks([0, max_x_tick // 2, max_x_tick])
        panel.set_title(f'{graph_name_short}\nlayer-{i_layer}, r = {r:.3f}', fontsize=30)
    fig.savefig(p_results_obs / f'{graph_name}.png', bbox_inches='tight')
    fig.savefig(p_results_obs / f'{graph_name}.pdf', bbox_inches='tight')
    # Close instead of clf(): a new figure is created per graph, and a cleared
    # figure would otherwise stay open and keep its memory.
    plt.close(fig)

In [None]:
import pickle
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.linear_model import LinearRegression

# observation 2: the fractions of strong edges
#
# For every graph and for layers 1-5, the cached per-CN weight lists are turned
# into the fraction of strong edges (weight greater than the layer index) per
# number of common neighbours, and a straight line is fitted to the leading
# points of that curve.
p_results_obs = p_results / 'obs2-FoSEs'
p_results_obs.mkdir(exist_ok=True)

plt.rcParams.update({'font.size': 28})
xLabel = '# common neighbors'
yLabel = '% of strong edges'

for graph_name in graphs_sorted_m:
    print(graph_name)
    graph_name_short = name2nameShort[graph_name]
    fig, ax = plt.subplots(nrows=1, ncols=5, figsize=(28, 4), constrained_layout=True)
    for i_layer in trange(1, 6):
        panel = ax[i_layer - 1]
        if i_layer > 1:
            cn2W = load_data(graph_name, 'numberOfCN2weightList_layers', layer_index=i_layer)
        else:
            cn2W = load_data(graph_name, 'numberOfCN2weightList')
        numOfCN_array = np.array(sorted(cn2W))
        numOfEdges_array = np.array([len(cn2W[c]) for c in numOfCN_array])
        m_total = sum(numOfEdges_array)
        numOfSEs_array = np.array([sum(w > i_layer for w in cn2W[c]) for c in numOfCN_array])
        fracOfSEs_array = numOfSEs_array / numOfEdges_array

        # Number of leading points used for the fit: second entry of the pair
        # stored for this layer. A None entry makes the slices below cover
        # every point.
        i_range = [gt_c_star_more[graph_name][i_layer - 1][1]]
        for i in tqdm(i_range):
            X = numOfCN_array[:i].reshape(-1, 1)
            y = fracOfSEs_array[:i]
            reg = LinearRegression().fit(X, y)
            R2 = reg.score(X, y)
            slope = reg.coef_[0]
            intercept = reg.intercept_
            y_pred = reg.predict(X)

            panel.scatter(numOfCN_array, fracOfSEs_array * 100, alpha=0.5, label='GT')
            panel.plot(X.flatten(), y_pred * 100, label='linear fitting', color='r', linewidth=3)
            lgd_wt = panel.legend(loc='lower right',
                                  markerscale=2,
                                  borderpad=0.05,
                                  labelspacing=0.2,
                                  handlelength=0.5,
                                  handletextpad=0.2,
                                  borderaxespad=0.2)
            # `legendHandles` was renamed to `legend_handles` in matplotlib 3.7;
            # support both spellings.
            legend_handles = getattr(lgd_wt, 'legend_handles', None)
            if legend_handles is None:
                legend_handles = lgd_wt.legendHandles
            for lh in legend_handles:
                lh.set_alpha(1)  # legend markers fully opaque although the points are not
            panel.set_xlabel(xLabel)
            panel.set_ylabel(yLabel)

            panel.set_title(f'{graph_name_short}, layer-{i_layer}\n' + r'$R^2$ ' + f'= {R2:.3f}', fontsize=28)

            max_x_tick = ((max(numOfCN_array) * 0.95) // 10) * 10
            panel.set_xticks([0, max_x_tick // 2, max_x_tick])
            # Lowest y tick: the multiple of ten just above the smallest percentage.
            min_y_tick = ((math.ceil(min(fracOfSEs_array) * 100) // 10) + 1) * 10
            panel.set_yticks([min_y_tick, (min_y_tick + 100) // 2, 100])
    fig.savefig(p_results_obs / f'{graph_name}.png', bbox_inches='tight')
    fig.savefig(p_results_obs / f'{graph_name}.pdf', bbox_inches='tight')
    # Close instead of clf(): a new figure is created per graph, and a cleared
    # figure would otherwise stay open and keep its memory.
    plt.close(fig)

In [None]:
import pickle
from pathlib import Path
import networkx
import numpy as np
from collections import defaultdict, Counter
from tqdm import tqdm
import matplotlib.pyplot as plt
import networkx as nx
from scipy.stats import linregress
from matplotlib.ticker import StrMethodFormatter

# observation 3: a power law across layers
#
# For every graph, each layer contributes one point: x is the fraction of
# strong edges among all edges of the layer, y is the same fraction restricted
# to edges without common neighbours. A straight line is fitted in log-log
# space and drawn over the points.
# NOTE(review): this cell reads 'numberOfCN2weightList_layers' for layers up to
# 10 (11 for 'OF'), while the adjacency cell in this notebook only writes
# layers 2-5 -- confirm where the remaining layers are produced.
p_results_obs = p_results / 'obs3-power-law'
p_results_obs.mkdir(exist_ok=True)

# Hand-picked (x ticks, y ticks) for the log-scaled axes, keyed by short graph
# name. Graphs missing from the table keep matplotlib's default ticks.
short2ticks = {
    'OF': ([0.75, 0.822, 0.9], [0.5, 0.67, 0.898]),
    'FL': ([0.3, 0.342, 0.39], [0.14, 0.182, 0.237]),
    'co-DB': ([0.32, 0.48, 0.72], [0.16, 0.32, 0.72]),
    'co-GE': ([0.24, 0.36, 0.54, 0.81], [0.12, 0.3, 0.75]),
    'th-UB': ([0.036, 0.144, 0.576], [0.002, 0.032, 0.512]),
    'th-MA': ([0.16, 0.32, 0.64], [0.01, 0.04, 0.16, 0.64]),
    'th-SO': ([0.05, 0.22, 0.968], [0.01, 0.095, 0.903]),
    'sx-UB': ([0.3, 0.48, 0.768], [0.2, 0.36, 0.648]),
    'sx-MA': ([0.4, 0.56, 0.784], [0.2, 0.36, 0.648]),
    'sx-SO': ([0.32, 0.48, 0.72], [0.32, 0.48, 0.72]),
    'sx-SU': ([0.32, 0.48, 0.72], [0.24, 0.36, 0.54, 0.81]),
}

plt.rcParams.update({'font.size': 20})
fig = plt.figure()  # one figure, cleared and reused for every graph

for graph_name in graphs_sorted_m:
    print(graph_name)
    graph_name_short = name2nameShort[graph_name]
    X = []
    Y = []
    layer_indices = []
    edges_and_weights = load_data(graph_name, 'edges_and_weights')
    w2cnt = Counter(w for _, _, w in edges_and_weights)  # weight -> number of edges
    if graph_name == 'openflights':
        i_layer_range = trange(1, 5)
    elif graph_name == 'OF':
        i_layer_range = trange(2, 12)
    else:
        i_layer_range = trange(1, 11)
    for i_layer in i_layer_range:
        if i_layer > 1:
            cn2W = load_data(graph_name, 'numberOfCN2weightList_layers', layer_index=i_layer)
        else:
            cn2W = load_data(graph_name, 'numberOfCN2weightList')
        weights_CN0 = cn2W[0]  # weights of the edges with no common neighbour
        m0 = len(weights_CN0)
        M0 = sum(w > i_layer for w in weights_CN0)
        # Edges of weight >= i_layer, and those among them of weight > i_layer.
        m_wt = sum(cnt_w for w, cnt_w in w2cnt.items() if w >= i_layer)
        M_wt = m_wt - w2cnt[i_layer]
        proStrong_total = M_wt / m_wt
        proStrong_CN0 = M0 / m0
        X.append(proStrong_total)
        Y.append(proStrong_CN0)
        layer_indices.append(i_layer)
    # One marker per layer, labelled with the layer index it came from.
    for ii, (x, y, layer_label) in enumerate(zip(X, Y, layer_indices)):
        plt.scatter(x, y, marker=markers[ii % 7], label=f'{layer_label}', s=200)
    X, Y = np.array(X), np.array(Y)
    # log-log linear regression
    linreg = linregress(np.log(X), np.log(Y))
    R2 = linreg.rvalue ** 2
    plt.plot(X, np.exp(linreg.intercept + linreg.slope * np.log(X)), 'r')
    plt.title(f'{graph_name_short}\n'
              f'log(Y) = {linreg.intercept:.3f} + {linreg.slope:.3f} * log(X)\n$R^2$ = {R2:.2f}')
    plt.xlabel('frac. of strong edges among all edges')
    plt.ylabel('frac. of strong edges\namong edges w/o\ncommon neighbors')
    plt.xscale('log')
    plt.yscale('log')
    ticks = short2ticks.get(graph_name_short)
    if ticks is not None:
        x_list, y_list = ticks
        # Tick labels drop the leading zero, e.g. 0.75 -> '.75'.
        plt.xticks(x_list, [str(tick)[1:] for tick in x_list])
        plt.yticks(y_list, [str(tick)[1:] for tick in y_list])
    plt.minorticks_off()
    plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), labelspacing=1.0)
    plt.savefig(p_results_obs / f'{graph_name}.png', bbox_inches='tight')
    plt.savefig(p_results_obs / f'{graph_name}.pdf', bbox_inches='tight')
    plt.clf()

# Release the reused figure so an empty canvas is not left open.
plt.close(fig)
