# In [17]:
from collections import defaultdict
from pathlib import Path
import networkx as nx
from tqdm.notebook import tqdm, trange
import pickle

# Marker symbols and a nine-entry hex colour palette.
# NOTE(review): presumably matplotlib marker codes / plot colours — confirm
# against the plotting code, which is not part of the visible cells.
markers = list('ov^<>8sp*hHDdPX')
colors = ['#' + hex_code for hex_code in (
    '377eb8', 'e41a1c', '4daf4a',
    '984ea3', 'ff7f00', 'ffff33',
    'a65628', 'f781bf', '999999',
)]

# Full dataset names and their short labels, declared side by side as
# (full name, short label) pairs and then split into the two parallel lists.
graph_names, graph_names_short = map(list, zip(
    # social
    ('OF', 'OF'),
    ('openflights', 'FL'),
    # hypergraphs
    ('coauth-DBLP-proj-graph', 'co-DB'),
    ('coauth-MAG-Geology-proj-graph', 'co-GE'),
    ('threads-ask-ubuntu-proj-graph', 'th-UB'),
    ('threads-math-sx-proj-graph', 'th-MA'),
    ('threads-stack-overflow-proj-graph', 'th-SO'),
    # temporal
    ('sx-askubuntu', 'sx-UB'),
    ('sx-mathoverflow', 'sx-MA'),
    ('sx-stackoverflow', 'sx-SO'),
    ('sx-superuser', 'sx-SU'),
))

# Lookup from full dataset name to its short label.
name2nameShort = {full: short for full, short in zip(graph_names, graph_names_short)}

# Dataset name -> key into ``i2ak`` (which (a, k) parameter pair the dataset uses).
g2fitting = {
    name: fitting
    for fitting, names in (
        # social
        (1, ('OF',)),
        (2, ('openflights',)),
        # hypergraphs
        (3, ('coauth-DBLP-proj-graph',
             'coauth-MAG-Geology-proj-graph')),
        (1, ('threads-ask-ubuntu-proj-graph',
             'threads-math-sx-proj-graph',
             'threads-stack-overflow-proj-graph')),
        # temporal
        (2, ('sx-askubuntu',
             'sx-mathoverflow',
             'sx-stackoverflow',
             'sx-superuser')),
    )
    for name in names
}

# Fitting index -> (a, k) parameter pair.
i2ak = dict(zip(
    (1, 2, 3),
    ((0.7, 1.3), (0.9, 1.1), (0.99, 1.05)),
))

# Dataset name -> pair of graph sizes.
# NOTE(review): presumably (number of nodes, number of edges); the second
# entry is what the cells below use as sort key and as the layer-1 edge count.
g2nm = dict([
    ('OF', (897, 71380)),
    ('openflights', (2905, 15645)),

    ('coauth-DBLP-proj-graph', (1654109, 7713116)),
    ('coauth-MAG-Geology-proj-graph', (898648, 4891112)),
    ('threads-ask-ubuntu-proj-graph', (82075, 182648)),
    ('threads-math-sx-proj-graph', (152702, 1088735)),
    ('threads-stack-overflow-proj-graph', (2301070, 20989078)),

    ('sx-askubuntu', (152599, 453221)),
    ('sx-mathoverflow', (24668, 187939)),
    ('sx-stackoverflow', (2572345, 28177464)),
    ('sx-superuser', (189191, 712870)),
])

# Dataset name -> five pairs of integers, one pair per list position;
# (None, None) marks positions for which no value is recorded.
# NOTE(review): the meaning of the two numbers in each pair is not visible in
# this file — confirm before relying on the second one.
gt_c_star_more = {
    'OF': [(241, 271), (190, 192), (157, 156), (134, 137), (119, 101)],
    'openflights': [(64, 66), (31, 31), (17, 17), (None, None), (None, None)],
    'coauth-DBLP-proj-graph': [(83, 88), (36, 29), (22, 24), (20, 21), (16, 16)],
    'coauth-MAG-Geology-proj-graph': [(74, 92), (52, 49), (34, 40), (28, 30), (24, 21)],
    'threads-ask-ubuntu-proj-graph': [(73, 87), (30, 31), (19, 20), (18, 11), (15, 11)],
    'threads-math-sx-proj-graph': [(372, 401), (145, 153), (114, 114), (84, 67), (63, 59)],
    'threads-stack-overflow-proj-graph': [(685, 750), (208, 205), (134, 129), (97, 82), (74, 72)],
    'sx-askubuntu': [(152, 149), (63, 69), (48, 42), (36, 27), (31, 22)],
    'sx-mathoverflow': [(185, 181), (113, 102), (75, 63), (60, 49), (51, 41)],
    'sx-stackoverflow': [(886, 749), (407, 324), (221, 203), (169, 130), (120, 103)],
    'sx-superuser': [(202, 206), (96, 93), (63, 54), (48, 37), (36, 27)],
}

# Dataset name -> c* value used for the first sampling step; it is the first
# number of the first pair recorded for that dataset in ``gt_c_star_more``.
gt_c_star_wt1 = {name: pairs[0][0] for name, pairs in gt_c_star_more.items()}

# Directories for cached inputs and for results; both are created on import
# if they do not exist yet.
p_data = Path('data')
p_data.mkdir(exist_ok=True)

p_results = Path('results')
p_results.mkdir(exist_ok=True)

# Type aliases: an edge is a pair of ints, a weighted edge adds a third int.
Edge = tuple[int, int]
EdgeAndWeight = tuple[int, int, int]

# Dataset names ordered by the second entry of their ``g2nm`` pair, ascending.
graphs_sorted_m = sorted(graph_names, key=lambda name: g2nm[name][1])


def iter_edges(input_graph, with_weight=False, desc='edges'):
    """Return a tqdm progress iterator over the edges of ``input_graph``.

    With ``with_weight`` set, iterates ``edges.data('weight', default=1)``
    instead of the plain edge view. The bar is labelled ``desc`` and is not
    left on screen once finished.
    """
    if with_weight:
        edge_source = input_graph.edges.data('weight', default=1)
    else:
        edge_source = input_graph.edges
    edge_total = input_graph.number_of_edges()
    return tqdm(edge_source, total=edge_total, leave=False, desc=desc)


def iter_nodes(input_graph, desc='nodes'):
    """Return a tqdm progress iterator over the nodes of ``input_graph``.

    The bar is labelled ``desc`` and is not left on screen once finished.
    """
    node_source = input_graph.nodes
    node_total = input_graph.number_of_nodes()
    return tqdm(node_source, total=node_total, leave=False, desc=desc)


def data_exist(ds, data_name_, layer_index=None):
    """Tell whether the cached file for dataset ``ds`` and kind ``data_name_`` exists.

    The file is looked up at ``p_data/<data_name_>/<ds>.<data_name_>``, with
    ``_layer<layer_index>`` appended when ``layer_index`` is given.
    """
    file_name = f'{ds}.{data_name_}'
    if layer_index is not None:
        file_name += f'_layer{layer_index}'
    return (p_data / data_name_ / file_name).is_file()


def data_file_path(ds, data_name_, layer_index=None, write=False):
    """Return ``(path, mode)`` for the cached file of ``ds`` / ``data_name_``.

    ``mode`` is ``'wb'`` when ``write`` is true (the sub-directory
    ``p_data/<data_name_>`` is created first if needed) and ``'rb'``
    otherwise. ``layer_index`` appends ``_layer<layer_index>`` to the name.
    """
    folder = p_data / data_name_
    if write:
        folder.mkdir(exist_ok=True)
    layer_suffix = '' if layer_index is None else f'_layer{layer_index}'
    return folder / f'{ds}.{data_name_}{layer_suffix}', ('wb' if write else 'rb')


def save_data(data, ds, data_name_, layer_index=None):
    """Pickle ``data`` into the cache file given by ``data_file_path`` in write mode."""
    target, mode = data_file_path(ds, data_name_, layer_index=layer_index, write=True)
    with open(target, mode) as f_:
        pickle.dump(data, f_)


def load_data(ds, data_name_, layer_index=None):
    """Unpickle and return the cache file given by ``data_file_path`` in read mode."""
    source, mode = data_file_path(ds, data_name_, layer_index=layer_index, write=False)
    with open(source, mode) as f_:
        loaded = pickle.load(f_)
    return loaded


def min_max_tuple(xx, yy):
    """Return ``(smaller, larger)`` of the two arguments.

    Used to give an undirected edge one canonical key.
    """
    smaller = yy if yy < xx else xx
    larger = yy if yy > xx else xx
    return smaller, larger


def reorder_nodes(input_graph):
    """Return ``input_graph`` passed through ``nx.convert_node_labels_to_integers``."""
    relabelled = nx.convert_node_labels_to_integers(input_graph)
    return relabelled


def take_gcc(input_graph):
    """Return the subgraph of ``input_graph`` induced by its largest connected component."""
    largest_component = max(nx.connected_components(input_graph), key=len)
    return input_graph.subgraph(largest_component)


# In [None]:
import numpy as np
from scipy.optimize import fsolve
import math
from itertools import combinations, product

# PEAR
# For every (graph, seed) pair: start from the full graph, give every edge the
# predicted weight 1, then for layers 2..5 sample a subset of "strong" edges
# from the current graph. The sampled edges form the graph used for the next
# layer. Each sampled layer graph and the final edge -> predicted-weight map
# are pickled under experiments/PEAR.
p_experiments = Path('experiments')
p_experiments.mkdir(exist_ok=True)

p_experiments_PEAR = p_experiments / 'PEAR'
p_experiments_PEAR.mkdir(exist_ok=True)

random_seeds = [1, 2, 3]

for graph_name, random_seed in product(graphs_sorted_m, random_seeds):
    # Re-seed per run so every (graph, seed) result is reproducible on its own.
    np.random.seed(random_seed)
    print(graph_name)
    # (a, k) parameters of the fitting group this dataset belongs to.
    a, k = i2ak[g2fitting[graph_name]]
    graph_name_short = name2nameShort[graph_name]
    G = load_data(graph_name, 'graph')
    # Every edge of the input graph starts with predicted weight 1.
    e2predWeight = dict()
    for u, v in iter_edges(G):
        e2predWeight[min_max_tuple(u, v)] = 1

    for i_layer in range(2, 6):
        # Neighbour sets of the current graph (the previous layer's sample).
        v2Nv = {v: set(G[v]) for v in G}
        # cn2p[c]: number of node pairs with c common neighbours (only filled for layers > 2).
        cn2p = defaultdict(int)
        # cn2m[c]: number of edges whose endpoints have c common neighbours.
        cn2m = defaultdict(int)
        # e2cn[edge]: number of common neighbours of the edge's endpoints.
        e2cn = dict()
        n, m = G.number_of_nodes(), G.number_of_edges()
        print(graph_name_short, f'layer-{i_layer}', G)
        if i_layer == 2:
            # First step: c* is taken from the hard-coded table, so only the
            # edges (not all node pairs) need to be scanned.
            tilde_c_star = gt_c_star_wt1[graph_name]
            for u, v in iter_edges(G):
                Nu, Nv = v2Nv[u], v2Nv[v]
                cn_uv = len(Nu & Nv)
                cn2m[cn_uv] += 1
                e2cn[min_max_tuple(u, v)] = cn_uv
        else:
            # Later steps: scan all n-choose-2 node pairs to estimate c*.
            for u, v in tqdm(combinations(v2Nv, 2), total=math.comb(n, 2)):
                Nu, Nv = v2Nv[u], v2Nv[v]
                cn_uv = len(Nu & Nv)
                cn2p[cn_uv] += 1
                if v in Nu:
                    cn2m[cn_uv] += 1
                    e2cn[min_max_tuple(u, v)] = cn_uv
            try:
                # Smallest c such that every pair with c common neighbours is an edge.
                tilde_c_star = min(c for c in cn2p if 0 < cn2p[c] == cn2m[c])
            except ValueError:
                # min() of an empty sequence: no such c exists, so stop
                # generating deeper layers for this (graph, seed).
                break
        print(f'tilde_c_star = {tilde_c_star}')


        # compute number of strong edges
        def solve_se(se_input: float) -> float:
            # Residual LHS - RHS of the equation whose root is the number of strong edges.
            # NOTE(review): RHS_den is 0 when no edge has c < tilde_c_star, and
            # m is 0 when the previous layer is empty — both would divide by zero.
            LHS = a * (se_input / m) ** k
            RHS_num = se_input - sum(cn2m[c] * min(1., c / tilde_c_star) for c in cn2m)
            RHS_den = sum(cn2m[c] * (tilde_c_star - c) / tilde_c_star for c in cn2m if c <= tilde_c_star)
            RHS = RHS_num / RHS_den
            return LHS - RHS


        # Root search starts from 10% of the current edge count.
        se_root = float(fsolve(solve_se, np.array(0.1 * m))[0])
        print(f'number of SEs = {se_root}, numerical difference = {solve_se(se_root):.4f}')

        # compute strong fractions
        # cn2sf[c]: probability of keeping an edge with c common neighbours;
        # p0 applies to c = 0 and the value grows linearly in c, capped at 1.
        cn2sf = defaultdict(float)
        p0 = cn2sf[0] = a * (se_root / m) ** k
        for c in cn2m:
            if c:
                cn2sf[c] = min(1., p0 + (1 - p0) * c / tilde_c_star)

        # sampling
        # One uniform draw per edge, in e2cn insertion order; the order of the
        # draws determines the result for a given seed.
        strong_edges = []
        for e, cn_e in e2cn.items():
            p_e = cn2sf[cn_e]
            if np.random.random() <= p_e:
                strong_edges.append(e)
                e2predWeight[e] = i_layer
        # The sampled edges form the graph processed by the next layer.
        G = nx.from_edgelist(strong_edges)
        # save
        with open(p_experiments_PEAR / f'{graph_name}-seed{random_seed}.G_{i_layer}', 'wb') as f:
            pickle.dump(G, f)
    # Saved even when the layer loop stopped early via ``break``.
    with open(p_experiments_PEAR / f'{graph_name}-seed{random_seed}.e2predWeight', 'wb') as f:
        pickle.dump(e2predWeight, f)

# In [None]:
from sklearn.linear_model import LinearRegression

# PRD (purely random)
# Baseline: fit a power law (linear regression in log-log space) to the
# ground-truth edge counts of layers 2..5, then for each layer draw that many
# edges uniformly at random, without replacement, from the previous layer.
p_experiments = Path('experiments')
p_experiments.mkdir(exist_ok=True)

p_experiments_PRD = p_experiments / 'PRD'
p_experiments_PRD.mkdir(exist_ok=True)

random_seed = 42
np.random.seed(random_seed)

for graph_name in graphs_sorted_m:
    print(graph_name)
    # Ground-truth number of edges in each of the layers 2..5.
    Ei_list = []
    for i_layer in range(2, 6):
        G = load_data(graph_name, 'layers', layer_index=i_layer)
        Ei_list.append(G.number_of_edges())
    i_list = list(range(2, 6))
    X = np.array(i_list)
    Y = np.array(Ei_list)
    reg = LinearRegression(fit_intercept=True)
    reg.fit(np.log(X).reshape(-1, 1), np.log(Y))
    slope = reg.coef_[0]
    intercept = reg.intercept_
    # print(f'fitting slope = {slope:.3f}, intercept = {intercept:.3f}')
    Y_fitted = np.exp(reg.predict(np.log(X).reshape(-1, 1)))
    Y_fitted = Y_fitted.reshape(-1)
    i2Ei_fitted = {i: Ei_fitted for i, Ei_fitted in zip(i_list, Y_fitted)}
    i2Ei_fitted[1] = g2nm[graph_name][1]
    # Same cache file as before, read through the shared helper.
    edges: list[Edge] = load_data(graph_name, 'edges')
    # Every edge starts with predicted weight 1, keyed by its canonical (min, max) form.
    e2predWeight = dict()
    for u, v in tqdm(edges):
        e2predWeight[min_max_tuple(u, v)] = 1

    for i_layer in i_list:
        # sampling
        # Cap at the number of remaining edges: sampling without replacement
        # raises if more items are requested than are available.
        number_SE = min(round(i2Ei_fitted[i_layer]), len(edges))
        strong_edges_indices = np.random.choice(len(edges), number_SE, replace=False)
        strong_edges = []
        for ie in strong_edges_indices:
            e = edges[ie]
            strong_edges.append(e)
            # Use the same canonical key as the initialisation above, so an
            # edge stored as (v, u) updates its entry instead of adding a new one.
            e2predWeight[min_max_tuple(*e)] = i_layer
        G = nx.from_edgelist(strong_edges)
        # The next layer is sampled from this layer's edges only.
        edges = strong_edges[:]
        # save
        with open(p_experiments_PRD / f'{graph_name}.G_{i_layer}', 'wb') as f:
            pickle.dump(G, f)

    with open(p_experiments_PRD / f'{graph_name}.e2predWeight', 'wb') as f:
        pickle.dump(e2predWeight, f)

# In [None]:
from itertools import chain

# SCN (sorting-CN)
# Baseline: fit the per-layer edge counts as in PRD, sort the edges by their
# number of common neighbours (descending) and hand out the weights 5, 4, 3, 2
# to the top edges so that layer i contains the fitted number of edges. All
# remaining edges keep weight 1.
p_experiments = Path('experiments')
p_experiments.mkdir(exist_ok=True)

p_experiments_SCN = p_experiments / 'SCN'
p_experiments_SCN.mkdir(exist_ok=True)

random_seed = 42
np.random.seed(random_seed)

for graph_name in graphs_sorted_m:
    print(graph_name)
    # Ground-truth number of edges in each of the layers 2..5.
    Ei_list = []
    for i_layer in range(2, 6):
        G = load_data(graph_name, 'layers', layer_index=i_layer)
        Ei_list.append(G.number_of_edges())
    i_list = list(range(2, 6))
    X = np.array(i_list)
    Y = np.array(Ei_list)
    reg = LinearRegression(fit_intercept=True)
    reg.fit(np.log(X).reshape(-1, 1), np.log(Y))
    slope = reg.coef_[0]
    intercept = reg.intercept_
    # print(f'fitting slope = {slope:.3f}, intercept = {intercept:.3f}')
    Y_fitted = np.exp(reg.predict(np.log(X).reshape(-1, 1)))
    Y_fitted = Y_fitted.reshape(-1)
    i2Ei_fitted = {i: round(Ei_fitted) for i, Ei_fitted in zip(i_list, Y_fitted)}
    i2Ei_fitted[1] = g2nm[graph_name][1]
    edges = load_data(graph_name, 'edges')
    number_of_CNs_list = load_data(graph_name, 'number_of_CNs_list')
    edges_sorted_CN = [x for _, x in sorted(zip(number_of_CNs_list, edges), reverse=True)]
    # Weights for the top-ranked edges: (E5) fives, (E4 - E5) fours, ... down to weight 2.
    weights_sorted = list(chain.from_iterable([[i] * (i2Ei_fitted[i] - i2Ei_fitted.get(i + 1, 0))
                                               for i in reversed(i_list)]))
    # Start every edge at weight 1: ``weights_sorted`` only covers the top
    # edges, and the layer filter below looks up every edge of the graph.
    e2predWeight = {e: 1 for e in edges}
    e2predWeight.update(zip(edges_sorted_CN, weights_sorted))
    for i_layer in i_list:
        # sampling
        strong_edges = [e for e in edges if e2predWeight[e] >= i_layer]
        G = nx.from_edgelist(strong_edges)
        edges = strong_edges[:]
        # save
        with open(p_experiments_SCN / f'{graph_name}.G_{i_layer}', 'wb') as f:
            pickle.dump(G, f)

    with open(p_experiments_SCN / f'{graph_name}.e2predWeight', 'wb') as f:
        pickle.dump(e2predWeight, f)

# In [None]:
from karateclub import RandNE

# SEB (sort-embedding)
# Baseline: like SCN, but the edges are ranked by the dot product of the
# RandNE embeddings of their two endpoints instead of by common neighbours.
p_experiments = Path('experiments')
p_experiments.mkdir(exist_ok=True)

p_experiments_SEB = p_experiments / 'SEB'
p_experiments_SEB.mkdir(exist_ok=True)

random_seed = 42
np.random.seed(random_seed)

for graph_name in graphs_sorted_m:
    print(graph_name)
    # Ground-truth number of edges in each of the layers 2..5.
    Ei_list = []
    for i_layer in range(2, 6):
        G = load_data(graph_name, 'layers', layer_index=i_layer)
        Ei_list.append(G.number_of_edges())
    i_list = list(range(2, 6))
    X = np.array(i_list)
    Y = np.array(Ei_list)
    reg = LinearRegression(fit_intercept=True)
    reg.fit(np.log(X).reshape(-1, 1), np.log(Y))
    slope = reg.coef_[0]
    intercept = reg.intercept_
    # print(f'fitting slope = {slope:.3f}, intercept = {intercept:.3f}')
    Y_fitted = np.exp(reg.predict(np.log(X).reshape(-1, 1)))
    Y_fitted = Y_fitted.reshape(-1)
    i2Ei_fitted = {i: round(Ei_fitted) for i, Ei_fitted in zip(i_list, Y_fitted)}
    i2Ei_fitted[1] = g2nm[graph_name][1]
    edges = load_data(graph_name, 'edges')
    G = load_data(graph_name, 'graph')
    try:
        embed_RandNE = load_data(graph_name, 'embed_RandNE')
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # No usable cached embedding: compute it on the full graph and cache it.
        model = RandNE(dimensions=32, seed=random_seed)
        model.fit(G)
        embed_RandNE = model.get_embedding()
        save_data(embed_RandNE, graph_name, 'embed_RandNE')
    # NOTE(review): indexing the embedding by node id assumes the nodes are
    # labelled 0..n-1 — confirm against how the 'graph' file is produced.
    similarity_list = [np.dot(embed_RandNE[u], embed_RandNE[v]) for u, v in tqdm(edges, desc='similarity')]
    edges_sorted_sim = [x for _, x in sorted(zip(similarity_list, edges), reverse=True)]
    # Weights for the top-ranked edges: (E5) fives, (E4 - E5) fours, ... down to weight 2.
    weights_sorted = list(chain.from_iterable([[i] * (i2Ei_fitted[i] - i2Ei_fitted.get(i + 1, 0))
                                               for i in reversed(i_list)]))
    # Start every edge at weight 1: ``weights_sorted`` only covers the top
    # edges, and the layer filter below looks up every edge of the graph.
    e2predWeight = {e: 1 for e in edges}
    e2predWeight.update(zip(edges_sorted_sim, weights_sorted))
    for i_layer in range(2, 6):
        # sampling
        strong_edges = [e for e in edges if e2predWeight[e] >= i_layer]
        G = nx.from_edgelist(strong_edges)
        edges = strong_edges[:]
        # save
        with open(p_experiments_SEB / f'{graph_name}.G_{i_layer}', 'wb') as f:
            pickle.dump(G, f)
    with open(p_experiments_SEB / f'{graph_name}.e2predWeight', 'wb') as f:
        pickle.dump(e2predWeight, f)

# In [None]:
# PEB (probability-embedding)
# Baseline: fit the per-layer edge counts as in PRD, then for each layer draw
# the fitted number of edges from the previous layer's edges, without
# replacement, with probability proportional to exp(<emb_u / 1000, emb_v / 1000>).

p_experiments = Path('experiments')
p_experiments.mkdir(exist_ok=True)

p_experiments_PEB = p_experiments / 'PEB'
p_experiments_PEB.mkdir(exist_ok=True)

random_seed = 42
np.random.seed(random_seed)

for graph_name in graphs_sorted_m:
    print(graph_name)
    # Ground-truth number of edges in each of the layers 2..5.
    Ei_list = []
    for i_layer in range(2, 6):
        G = load_data(graph_name, 'layers', layer_index=i_layer)
        Ei_list.append(G.number_of_edges())
    i_list = list(range(2, 6))
    X = np.array(i_list)
    Y = np.array(Ei_list)
    reg = LinearRegression(fit_intercept=True)
    reg.fit(np.log(X).reshape(-1, 1), np.log(Y))
    slope = reg.coef_[0]
    intercept = reg.intercept_
    # print(f'fitting slope = {slope:.3f}, intercept = {intercept:.3f}')
    Y_fitted = np.exp(reg.predict(np.log(X).reshape(-1, 1)))
    Y_fitted = Y_fitted.reshape(-1)
    i2Ei_fitted = {i: round(Ei_fitted) for i, Ei_fitted in zip(i_list, Y_fitted)}
    i2Ei_fitted[1] = g2nm[graph_name][1]
    edges = load_data(graph_name, 'edges')
    try:
        embed_RandNE = load_data(graph_name, 'embed_RandNE')
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # No usable cached embedding: compute it on the FULL graph. At this
        # point ``G`` still holds the layer-5 graph from the loop above, so
        # the full graph has to be loaded explicitly.
        G = load_data(graph_name, 'graph')
        model = RandNE(dimensions=32, seed=random_seed)
        model.fit(G)
        embed_RandNE = model.get_embedding()
        save_data(embed_RandNE, graph_name, 'embed_RandNE')
    # Unnormalised sampling weight of every edge.
    # NOTE(review): indexing the embedding by node id assumes the nodes are
    # labelled 0..n-1 — confirm against how the 'graph' file is produced.
    similarity_list = [math.exp(np.dot(embed_RandNE[u] / 1000, embed_RandNE[v] / 1000))
                       for u, v in tqdm(edges, desc='similarity')]
    similarity = np.asarray(similarity_list, dtype=float)
    e2predWeight = {e: 1 for e in edges}
    # Positions (into ``edges``) of the edges that survived the previous layer.
    indices = np.arange(len(edges))
    for i_layer in range(2, 6):
        # sampling
        number_SE = min(i2Ei_fitted[i_layer], len(indices))
        if number_SE > 0:
            # Renormalise the weights over the surviving edges only.
            layer_probs = similarity[indices]
            layer_probs = layer_probs / layer_probs.sum()
            print(layer_probs.sum(), layer_probs.min(), layer_probs.max())
            # Without replacement, so the layer has exactly ``number_SE`` distinct edges.
            chosen = np.random.choice(len(indices), number_SE, replace=False, p=layer_probs)
            # ``chosen`` indexes the surviving subset; map back to positions in ``edges``.
            indices = indices[chosen]
        else:
            indices = indices[:0]
        strong_edges = [edges[i] for i in indices]
        for e in strong_edges:
            e2predWeight[e] = i_layer
        G = nx.from_edgelist(strong_edges)
        # save
        with open(p_experiments_PEB / f'{graph_name}.G_{i_layer}', 'wb') as f:
            pickle.dump(G, f)
    with open(p_experiments_PEB / f'{graph_name}.e2predWeight', 'wb') as f:
        pickle.dump(e2predWeight, f)

# In [None]:
# RFF (random forest-feature)
# Baseline: train a random forest on per-edge structural features to predict
# the edge weight (capped at 5) and build layer i from the edges whose
# predicted weight is at least i.
from sklearn.ensemble import RandomForestClassifier

p_experiments = Path('experiments')
p_experiments.mkdir(exist_ok=True)

p_experiments_RFF = p_experiments / 'RFF'
p_experiments_RFF.mkdir(exist_ok=True)

random_seed = 42
np.random.seed(random_seed)

# Names of the cached per-edge feature lists ('<name>_list'), one feature column each.
metric_names = [
    'CN',
    'SA',
    'JC',
    'HP',
    'HD',
    'SI',
    'LI',
    'AA',
    'RA',
    'PA',
    'FM',
    'DL',
    'EC',
    'LP',
]

for graph_name in graphs_sorted_m:
    print(graph_name)
    edges = load_data(graph_name, 'edges')
    weights = load_data(graph_name, 'weights')
    try:
        part2indices = load_data(graph_name, 'part2indices')
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # No usable cached split: build a split stratified by (capped) weight.
        edges_and_weights = load_data(graph_name, 'edges_and_weights')
        weight2indices = defaultdict(list)
        for i, (_, _, w) in enumerate(edges_and_weights):
            if w >= 5:
                w = 5
            weight2indices[w].append(i)
        part2indices = defaultdict(list)
        for w, indices_w in weight2indices.items():
            # 10% train, 10% validation, remainder test, within each weight class.
            val_size = train_size = len(indices_w) // 10
            indices_perm = list(np.random.permutation(indices_w))
            part2indices['train'] += indices_perm[:train_size]
            part2indices['val'] += indices_perm[train_size:train_size + val_size]
            part2indices['test'] += indices_perm[train_size + val_size:]
        part2indices = dict(part2indices)
        save_data(part2indices, graph_name, 'part2indices')
    weights = [int(min(5, w)) for w in weights]
    # One row per metric; transposed below to one row per edge.
    feature_matrix = []
    for metric_name in metric_names:
        feature_matrix.append(load_data(graph_name, f'{metric_name}_list'))
    feature_matrix = np.transpose(feature_matrix)
    indices_train = part2indices['train']
    X = feature_matrix_train = feature_matrix[indices_train]
    y = weights_train = [weights[i] for i in indices_train]
    clf = RandomForestClassifier(n_estimators=32, max_depth=2, random_state=42)
    clf.fit(X, y)
    # Predict for every edge, including the training edges.
    weights_pred = clf.predict(feature_matrix)
    e2predWeight = dict(zip(edges, weights_pred))
    for i_layer in range(2, 6):
        # sampling
        strong_edges = [e for e in edges if e2predWeight[e] >= i_layer]
        G = nx.from_edgelist(strong_edges)
        edges = strong_edges[:]
        # save
        with open(p_experiments_RFF / f'{graph_name}.G_{i_layer}', 'wb') as f:
            pickle.dump(G, f)
    with open(p_experiments_RFF / f'{graph_name}.e2predWeight', 'wb') as f:
        pickle.dump(e2predWeight, f)

# In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
import copy
import numpy as np
import random

# NEB (neural network-embedding)
# Seed NumPy and make sure the output directory experiments/NEB exists.
random_seed = 42
np.random.seed(random_seed)

p_experiments = Path('experiments')
p_experiments.mkdir(exist_ok=True)
p_experiments_NEB = p_experiments.joinpath('NEB')
p_experiments_NEB.mkdir(exist_ok=True)


class edge_classifier(nn.Module):
    """Bilinear classifier that scores an edge from its two endpoint embeddings.

    Args:
        dim_input: size of each endpoint embedding vector.
        num_classes: number of output logits per edge. Defaults to 5, the
            number of weight classes used when the class was written.
    """

    def __init__(self, dim_input, num_classes=5):
        super().__init__()
        self.bilinear = nn.Bilinear(dim_input, dim_input, num_classes)

    def forward(self, u, v):
        """Return the class logits for the embedding batches ``u`` and ``v``."""
        return self.bilinear(u, v)


def get_embedding(base_path, _path):
    """Load the cached 'embed_RandNE' data for dataset ``_path``.

    ``base_path`` is accepted for signature compatibility and is not used.
    """
    embedding = load_data(_path, 'embed_RandNE')
    return embedding


def read_file(base_path, _path):
    """Load 'edges_and_weights' for dataset ``_path`` and split it into two lists.

    Returns ``(e_list, w_list)``: the ``(src, dst)`` pairs and, in the same
    order, zero-based class labels, i.e. the weight capped at 5, minus 1.
    ``base_path`` is accepted for signature compatibility and is not used.
    """
    e_list, w_list = [], []
    for src, dst, w in tqdm(load_data(_path, 'edges_and_weights')):
        e_list.append((src, dst))
        # weight 1 2 3 4 5+ --> class 0 1 2 3 4
        capped = 5 if w >= 5 else w
        w_list.append(int(capped) - 1)
    return e_list, w_list


def get_split(base_path, _path):
    """Load 'part2indices' for dataset ``_path`` and return its three parts.

    Prints the three part sizes and returns the ``'train'``, ``'val'`` and
    ``'test'`` entries, in that order. ``base_path`` is accepted for
    signature compatibility and is not used.
    """
    parts = load_data(_path, 'part2indices')
    train_ids, val_ids, test_ids = (parts[key] for key in ('train', 'val', 'test'))
    print(len(train_ids), len(val_ids), len(test_ids))
    return train_ids, val_ids, test_ids


def get_loader(_list, _b_size, shuffle):
    """Split ``_list`` into consecutive batches of at most ``_b_size`` items.

    Args:
        _list: sequence of items (edge indices in this file).
        _b_size: maximum batch size.
        shuffle: if true, the items are shuffled with ``random.shuffle``
            before batching. A copy is shuffled; the caller's list is left
            untouched.

    Returns:
        A list of batches. Only the last batch may be shorter than
        ``_b_size``; an empty input gives an empty list.
    """
    if shuffle:
        _list = list(_list)
        random.shuffle(_list)
    # Stepping by the batch size yields the short tail batch only when there
    # is a remainder. The previous tail slice ``_list[-(len % size):]`` became
    # ``_list[-0:]`` (the whole list) whenever the length was an exact multiple.
    return [_list[_start:_start + _b_size] for _start in range(0, len(_list), _b_size)]


def _embed_endpoints(edge_list, emb, e_ids, device):
    """Return (src ids, dst ids, src embeddings, dst embeddings) for the edges at ``e_ids``."""
    batch_edges = edge_list[e_ids]
    src_ids = batch_edges[:, 0].tolist()
    dst_ids = batch_edges[:, 1].tolist()
    src_emb = torch.tensor(emb[src_ids]).float().to(device)
    dst_emb = torch.tensor(emb[dst_ids]).float().to(device)
    return src_ids, dst_ids, src_emb, dst_emb


def train(BASE_PATH, dataset_name, batch_size, num_epochs=200, _lr=0.0001, default_device="cuda:0"):
    """Train the bilinear edge classifier on one dataset and predict all edge weights.

    The model is trained with Adam and cross-entropy on the 'train' split,
    validated every 5 epochs on the 'val' split, and the model with the
    lowest validation loss is kept. Training and validation loss curves are
    plotted with matplotlib.

    Args:
        BASE_PATH: forwarded to the data-loading helpers.
        dataset_name: dataset to load (embedding, weighted edges, split).
        batch_size: number of edge indices per batch.
        num_epochs: number of training epochs.
        _lr: Adam learning rate.
        default_device: torch device string for the model and tensors.

    Returns:
        ``(best_model, tv_src_list, tv_dst_list, tv_pred_list,
        test_src_list, test_dst_list, test_pred_list)``. The ``tv_*`` lists
        cover the train and validation edges, the ``test_*`` lists the test
        edges. Predictions are weights, i.e. the argmax class index plus 1.
    """
    emb = get_embedding(BASE_PATH, dataset_name)
    edge_list, w_list = read_file(BASE_PATH, dataset_name)
    train_split, val_split, test_split = get_split(BASE_PATH, dataset_name)

    train_loader = get_loader(train_split, batch_size, shuffle=True)
    val_loader = get_loader(val_split, batch_size, shuffle=False)
    test_loader = get_loader(test_split, batch_size, shuffle=False)

    print("Len train/val/test loader {} {} {}".format(len(train_loader), len(val_loader), len(test_loader)))

    edge_list = torch.tensor(edge_list)
    w_list = torch.tensor(w_list).to(default_device)
    # NOTE(review): 32 must equal the embedding dimension (the RandNE cells use dimensions=32).
    ec = edge_classifier(32).to(default_device)
    opt = torch.optim.Adam(ec.parameters(), lr=_lr)

    valid_iter = 5

    train_loss_list = []
    valid_loss_list = []
    valid_epochs = []  # epoch number of every entry in valid_loss_list
    best_valid_loss = float('inf')
    best_epoch = -1
    # Always bound, even if no validation step ever improves on the initial
    # value (e.g. num_epochs == 0 or a NaN loss).
    best_model = copy.deepcopy(ec)

    for _epoch in tqdm(range(num_epochs)):
        # Reshuffle the training batches every epoch.
        train_loader = get_loader(train_split, batch_size, shuffle=True)
        ec.train()
        train_loss = 0.
        for e_ids in train_loader:
            # e_ids: list of edge indices forming one batch
            opt.zero_grad()
            _, _, _src, _dst = _embed_endpoints(edge_list, emb, e_ids, default_device)
            preds = ec(_src, _dst)
            loss = F.cross_entropy(preds, w_list[e_ids])
            loss.backward()
            opt.step()
            train_loss += loss.item()
        train_loss_list.append(train_loss / len(train_loader))

        if _epoch % valid_iter == 0:
            ec.eval()
            valid_loss = 0.
            # No gradients are needed for evaluation.
            with torch.no_grad():
                for e_ids in val_loader:
                    _, _, _src, _dst = _embed_endpoints(edge_list, emb, e_ids, default_device)
                    preds = ec(_src, _dst)
                    valid_loss += F.cross_entropy(preds, w_list[e_ids]).item()
            valid_loss /= len(val_loader)

            if valid_loss < best_valid_loss:
                best_epoch = _epoch
                best_valid_loss = valid_loss
                best_model = copy.deepcopy(ec)

            valid_loss_list.append(valid_loss)
            valid_epochs.append(_epoch)

    best_model.eval()

    test_loss = 0.
    test_src_list = []
    test_dst_list = []
    test_pred_list = []
    with torch.no_grad():
        for e_ids in test_loader:
            _src_ids, _dst_ids, _src, _dst = _embed_endpoints(edge_list, emb, e_ids, default_device)
            test_src_list += _src_ids
            test_dst_list += _dst_ids
            preds = best_model(_src, _dst)
            # class index 0..4 --> weight 1..5
            test_pred_list += (torch.argmax(preds, dim=-1) + 1).tolist()
            test_loss += F.cross_entropy(preds, w_list[e_ids]).item()
    test_loss /= len(test_loader)

    plt.plot(range(num_epochs), train_loss_list)
    plt.title("Train loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.show()

    # Plot against the epochs at which validation actually ran, so x and y
    # always have the same length, also when num_epochs is not a multiple of
    # valid_iter.
    plt.plot(valid_epochs, valid_loss_list)
    plt.title("Validation loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.show()
    print("Best epoch {} Val loss {} Test loss {}".format(best_epoch, best_valid_loss, test_loss))

    # predict for train/valid set
    tv_src_list = []
    tv_dst_list = []
    tv_pred_list = []
    with torch.no_grad():
        for e_ids in (train_loader + val_loader):
            _src_ids, _dst_ids, _src, _dst = _embed_endpoints(edge_list, emb, e_ids, default_device)
            tv_src_list += _src_ids
            tv_dst_list += _dst_ids
            preds = best_model(_src, _dst)
            tv_pred_list += (torch.argmax(preds, dim=-1) + 1).tolist()

    return best_model, tv_src_list, tv_dst_list, tv_pred_list, test_src_list, test_dst_list, test_pred_list


def save_result(dataset_name, tv_src_list, tv_dst_list, tv_pred_list, test_src_list, test_dst_list, test_pred_list):
    """Write the predicted weights to ``<NEB dir>/raw_results/<dataset_name>.txt``.

    One ``src,dst,pred`` line is written per edge: first the train/validation
    edges, then the test edges. The ``raw_results`` directory is created if
    it does not exist, and an existing file is overwritten.
    """
    p_raw_results = p_experiments_NEB / 'raw_results'
    # The directory is not created anywhere else; without it open() fails.
    p_raw_results.mkdir(parents=True, exist_ok=True)
    with open(p_raw_results / f'{dataset_name}.txt', 'w') as f:
        for src, dst, pred in zip(tv_src_list, tv_dst_list, tv_pred_list):
            f.write(f'{src},{dst},{pred}\n')
        for src, dst, pred in zip(test_src_list, test_dst_list, test_pred_list):
            f.write(f'{src},{dst},{pred}\n')


BASE_PATH = "./"
# save_result() writes into this directory; make sure it exists.
(p_experiments_NEB / 'raw_results').mkdir(parents=True, exist_ok=True)
for graph_name in graphs_sorted_m:
    print(graph_name)
    # The unpacking target is parenthesised so it is one assignment. Split
    # over two lines without parentheses, the first line is a bare expression
    # and the second tries to unpack seven values into three names.
    (best_model, tv_src_list, tv_dst_list, tv_pred_list,
     test_src_list, test_dst_list, test_pred_list) = train(BASE_PATH,
                                                           graph_name,
                                                           batch_size=128,
                                                           num_epochs=200,
                                                           _lr=0.0005,
                                                           default_device="cuda:1")
    save_result(graph_name, tv_src_list, tv_dst_list, tv_pred_list, test_src_list, test_dst_list, test_pred_list)
    # Read the predictions back from the file that was just written.
    with open(p_experiments_NEB / 'raw_results' / f'{graph_name}.txt') as f:
        dd = f.readlines()
    e2predWeight = dict()
    for d in dd:
        u, v, w = map(int, d.split(','))
        e2predWeight[min_max_tuple(u, v)] = w

    # Load this graph's own edge list; otherwise ``edges`` would still hold
    # whatever an earlier cell or iteration left behind.
    edges = load_data(graph_name, 'edges')
    for i_layer in range(2, 6):
        # sampling
        strong_edges = [(u, v) for u, v in edges if e2predWeight[min_max_tuple(u, v)] >= i_layer]
        G = nx.from_edgelist(strong_edges)
        edges = strong_edges[:]
        # save
        with open(p_experiments_NEB / f'{graph_name}.G_{i_layer}', 'wb') as f:
            pickle.dump(G, f)
    with open(p_experiments_NEB / f'{graph_name}.e2predWeight', 'wb') as f:
        pickle.dump(e2predWeight, f)

# In [None]:
import os
import pickle
import sys
import matplotlib.pyplot as plt
from scipy.stats import stats, pearsonr, ks_2samp
from netrd.distance import NetSimile

# check the results
# For every ground-truth layer graph, compare each method's generated layer
# graph against it: KS statistic of the degree distributions (KSND), KS
# statistic of the per-edge common-neighbour counts (KSCN), absolute
# difference of average clustering (DACC) and the NetSimile distance.
# Results are printed and appended to experiments/results/results.txt.

p_experiments = Path('experiments')
p_experiments.mkdir(exist_ok=True)

p_experiments_results = p_experiments / 'results'
p_experiments_results.mkdir(exist_ok=True)

random_seed = 42
np.random.seed(random_seed)

# Must match the output directory names under experiments/ used by the
# generation cells (the sorting-CN baseline writes to 'SCN').
methods = [
    'PEAR',
    'PRD',
    'SCN',
    'SEB',
    'PEB',
    'RFF',
    'NEB',
]

# NOTE(review): the result lines carry no method name; they can only be told
# apart by their order, which follows ``methods``.
with open(p_experiments_results / 'results.txt', 'a+') as f_out:
    for graph_name in graphs_sorted_m:
        # Only the co-authorship datasets are evaluated here.
        if 'coauth' not in graph_name:
            continue
        print(graph_name)
        for i_layer in range(2, 6):
            print(f'{graph_name}, layer-{i_layer}')
            print(f'{graph_name}, layer-{i_layer}', file=f_out)
            G_i = load_data(graph_name, 'layers', layer_index=i_layer)
            # Ground-truth neighbour sets and degrees.
            v2Nv_GT = dict()
            v2dv_GT = dict()
            for v in G_i:
                Nv = set(G_i[v])
                v2Nv_GT[v] = Nv
                v2dv_GT[v] = len(Nv)
            degrees_GT = list(v2dv_GT.values())
            cns_GT = [len(v2Nv_GT[u] & v2Nv_GT[v]) for u, v in iter_edges(G_i)]
            acc_GT = nx.average_clustering(G_i)
            print('GT acc =', acc_GT)
            print('GT acc =', acc_GT, file=f_out)
            for method in methods:
                p_experiments_method = p_experiments / method
                # PEAR was run with three seeds, every other method once.
                for random_seed in [1, 2, 3] if method == 'PEAR' else [42]:
                    p_Gi = p_experiments_method / f'{graph_name}-seed{random_seed}.G_{i_layer}' if method == 'PEAR' \
                        else p_experiments_method / f'{graph_name}.G_{i_layer}'
                    if not p_Gi.is_file():
                        # A method may not have produced every layer (PEAR
                        # stops early when it finds no valid c*); skip rather
                        # than abort the whole evaluation.
                        print(f'skipping {method}: missing {p_Gi}')
                        continue
                    with open(p_Gi, 'rb') as f:
                        G_i_generated: nx.Graph = pickle.load(f)
                    v2Nv = dict()
                    v2dv = dict()
                    for v in G_i_generated:
                        Nv = set(G_i_generated[v])
                        v2Nv[v] = Nv
                        v2dv[v] = len(Nv)
                    ks_stat, _ = ks_2samp(degrees_GT, list(v2dv.values()))
                    print('KSND', f'seed {random_seed}', ks_stat)
                    print('KSND', f'seed {random_seed}', ks_stat, file=f_out)

                    cns = [len(v2Nv[u] & v2Nv[v]) for u, v in iter_edges(G_i_generated)]
                    ks_stat, _ = ks_2samp(cns_GT, cns)
                    print('KSCN', f'seed {random_seed}', ks_stat)
                    print('KSCN', f'seed {random_seed}', ks_stat, file=f_out)

                    acc = nx.average_clustering(G_i_generated)
                    print('DACC', f'seed {random_seed}', abs(acc - acc_GT))
                    print('DACC', f'seed {random_seed}', abs(acc - acc_GT), file=f_out)

                    # NetSimile is not computed for sx-stackoverflow.
                    if graph_name == 'sx-stackoverflow':
                        continue
                    dist_obj = NetSimile()
                    distance = dist_obj.dist(G_i_generated, G_i)
                    print('NetSimile', f'seed {random_seed}', distance)
                    print('NetSimile', f'seed {random_seed}', distance, file=f_out)
