In [1]:
from networkit import *
import pickle
import networkx as nx
from networkx.generators.community import LFR_benchmark_graph
from datetime import datetime

In [2]:
def nx2nkit(g_nx):
    """Copy a NetworkX graph into a directed NetworKit graph.

    One NetworKit node is added per NetworkX node, and every pair yielded by
    ``g_nx.edges()`` is inserted as an edge, using the NetworkX node labels
    directly as NetworKit node ids.

    Args:
        g_nx: NetworkX graph to convert. Its labels are passed straight to
            ``addEdge``, so they presumably have to be the integers
            0..n-1 (verify against callers).

    Returns:
        The NetworKit ``Graph`` created with ``directed=True``.

    Raises:
        AssertionError: if the node or edge count of the copy differs from
            the source graph.
    """
    # NOTE(review): the copy is always directed, even when g_nx is undirected;
    # each undirected edge then becomes a single arc -- confirm this is intended.
    converted = Graph(directed=True)

    node_total = g_nx.number_of_nodes()
    for _ in range(node_total):
        converted.addNode()

    for source, target in g_nx.edges():
        converted.addEdge(source, target)

    # Sanity checks: the copy must have exactly the same size as the source.
    assert g_nx.number_of_nodes() == converted.numberOfNodes(), "Number of nodes not matching"
    assert g_nx.number_of_edges() == converted.numberOfEdges(), "Number of edges not matching"

    return converted


def cal_exact_bet(g_nkit):
    """Compute normalized betweenness centrality with NetworKit.

    Runs ``centrality.Betweenness`` with ``normalized=True`` on the graph and
    converts its ranking into a dictionary.

    Args:
        g_nkit: NetworKit graph to analyse.

    Returns:
        dict keyed by the first element of each ranking entry (the node id)
        with the second element (the score) as value, in ranking order.
    """
    finished = centrality.Betweenness(g_nkit, normalized=True).run()
    return {entry[0]: entry[1] for entry in finished.ranking()}


def cal_exact_degree(g_nkit):
    """Compute un-normalized degree centrality with NetworKit.

    Runs ``centrality.DegreeCentrality`` with ``normalized=False`` on the
    graph and converts its ranking into a dictionary.

    Args:
        g_nkit: NetworKit graph to analyse.

    Returns:
        dict keyed by the first element of each ranking entry (the node id)
        with the second element (the score) as value, in ranking order.
    """
    finished = centrality.DegreeCentrality(g_nkit, normalized=False).run()
    return {entry[0]: entry[1] for entry in finished.ranking()}

    
def generate_bet_LFR_data(num_of_graphs,output_path,seed=None):
    """Generate LFR benchmark graphs with centrality labels and pickle them.

    For every graph: draw an LFR graph (retrying while the generator fails to
    converge), drop isolated nodes, relabel nodes to consecutive integers,
    convert to NetworKit and compute betweenness and degree centrality. The
    accumulated list is written to ``output_path`` after every graph, so an
    interrupted run still leaves the graphs finished so far on disk.

    Args:
        num_of_graphs: number of graphs to generate.
        output_path: path of the pickle file to (over)write. Its parent
            directory is created if it does not exist.
        seed: optional integer. When given, every generation attempt receives
            a seed derived from it, making the run reproducible. ``None``
            (default) keeps the generator unseeded.

    Returns:
        None. The pickle holds a list of ``[g_nx, bet_dict, deg_dict]``
        entries, one per graph.

    Raises:
        Any exception from ``LFR_benchmark_graph`` other than
        ``nx.ExceededMaxIterations`` (e.g. invalid parameters) propagates
        instead of being retried forever.
    """
    import os
    import random

    # Fail early / create the target directory before the expensive work starts.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # A private RNG hands out a fresh seed per attempt: reusing one fixed seed
    # would make a failed attempt fail identically on every retry.
    seed_source = random.Random(seed) if seed is not None else None

    list_bet_data = list()

    for i in range(num_of_graphs):

        while True:
            print(f"Graph index:{i+1}/{num_of_graphs}, Time: {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}")
            attempt_seed = seed_source.randrange(2**32) if seed_source is not None else None
            try:
                g_nx = LFR_benchmark_graph(n=10000,tau1=3,tau2=1.5,mu=0.05,average_degree=6,min_community=20,seed=attempt_seed)
            except nx.ExceededMaxIterations:
                # The generator did not converge for this draw; try again.
                # Only this error is retried so that Ctrl-C and parameter
                # errors are not swallowed by the loop.
                continue
            else:
                break
        print("removing isolates")

        if nx.number_of_isolates(g_nx)>0:
            g_nx.remove_nodes_from(list(nx.isolates(g_nx)))

        # Relabel to 0..n-1 because nx2nkit uses the labels as NetworKit node ids.
        g_nx = nx.convert_node_labels_to_integers(g_nx)
        g_nkit = nx2nkit(g_nx)
        bet_dict = cal_exact_bet(g_nkit)
        deg_dict = cal_exact_degree(g_nkit)
        list_bet_data.append([g_nx,bet_dict,deg_dict])

        # Checkpoint after every graph (rewrites the whole list each time).
        with open(output_path,"wb") as fopen:
            pickle.dump(list_bet_data,fopen)


### A set of 15 LFR graphs is created

In [3]:
# Number of LFR graphs to generate.
num_of_graphs = 15

# Pickle file that receives the generated graphs and their centrality labels.
# The "10000_nodes" part mirrors the n=10000 hard-coded in generate_bet_LFR_data.
output_path = f"graphs/LFR_{num_of_graphs}_graphs_10000_nodes.pickle"

# Generate the graphs; progress lines are printed for every generation attempt.
generate_bet_LFR_data(num_of_graphs,output_path)

Graph index:1/15, Time: 11/05/2023 18:25:43
removing isolates
Graph index:2/15, Time: 11/05/2023 18:25:43
removing isolates
Graph index:3/15, Time: 11/05/2023 18:25:43
removing isolates
Graph index:4/15, Time: 11/05/2023 18:25:44
removing isolates
Graph index:5/15, Time: 11/05/2023 18:25:44
removing isolates
Graph index:6/15, Time: 11/05/2023 18:25:44
Graph index:6/15, Time: 11/05/2023 18:25:49
Graph index:6/15, Time: 11/05/2023 18:25:54
removing isolates
Graph index:7/15, Time: 11/05/2023 18:25:54
removing isolates
Graph index:8/15, Time: 11/05/2023 18:25:55
Graph index:8/15, Time: 11/05/2023 18:25:59
removing isolates
Graph index:9/15, Time: 11/05/2023 18:26:00
removing isolates
Graph index:10/15, Time: 11/05/2023 18:26:00
Graph index:10/15, Time: 11/05/2023 18:26:05
removing isolates
Graph index:11/15, Time: 11/05/2023 18:26:05
Graph index:11/15, Time: 11/05/2023 18:26:10
removing isolates
Graph index:12/15, Time: 11/05/2023 18:26:10
removing isolates
Graph index:13/15, Time: 11/05/

### Restart the kernel; from here on we use the original functions defined in `./functions`
### Create the train/test datasets used to evaluate performance on synthetic LFR graphs

In [1]:
import sys
sys.path.append("./functions")

from utils import *
from model_bet import *
import pandas as pd
import os
import time

In [2]:
# Seed the RNG before dataset creation so repeated runs start from the same state.
# NOTE(review): `random` and `pickle` are not imported explicitly in this notebook;
# presumably they are re-exported by the star imports from ./functions -- confirm.
random.seed(10)

param = {
    "size" : [10000],
    "num_train" : 5,
    "num_test" : 10,
    "num_copies": [1,10,20,40]
}


# This pickle is produced by the generation step of this notebook.
# pickle.load executes arbitrary code, so never point it at an untrusted file.
with open(f"./graphs/LFR_15_graphs_10000_nodes.pickle","rb") as fopen:
    list_data = pickle.load(fopen)

num_graph = len(list_data)
assert param["num_train"]+param["num_test"] == num_graph,"Required split size doesn't match number of graphs in pickle file."

# The first `num_train` graphs are used for training, the following `num_test` for testing.
train_graphs = list_data[:param["num_train"]]
test_graphs = list_data[param["num_train"]:param["num_train"]+param["num_test"]]

# open(..., "wb") does not create missing directories.
os.makedirs("./data_splits/train", exist_ok=True)
os.makedirs("./data_splits/test", exist_ok=True)

for size in param["size"]:

    #For training split: one file per (number of copies, size) combination
    if param["num_train"] > 0:
        for c in param["num_copies"]:
            list_graph, list_n_sequence, list_node_num, cent_mat, deg_mat = create_dataset(train_graphs,num_copies = c,adj_size=size)

            with open(f"./data_splits/train/LFR_5_graphs_{c}_copies_{size}_size.pickle","wb") as fopen:
                pickle.dump([list_graph,list_n_sequence,list_node_num,cent_mat, deg_mat],fopen)

    #For test split: a single copy per graph, built for the size of the current
    #iteration (previously it was pinned to param["size"][0] on every iteration,
    #so other sizes never got a test file).
    if param["num_test"] > 0:
        list_graph, list_n_sequence, list_node_num, cent_mat, deg_mat = create_dataset(test_graphs,num_copies = 1,adj_size=size)

        with open(f"./data_splits/test/LFR_10_graphs_{size}_size.pickle","wb") as fopen:
            pickle.dump([list_graph,list_n_sequence,list_node_num,cent_mat, deg_mat],fopen)

### Train and save the LFR models

In [4]:

param = {
    "size" : [10000],
    "num_copies": [1,10,20,40],
    "model_seed": 15,
    "num_epochs": 15
}

#Model parameters and device do not depend on the loop variables, so set them once
hidden = 20
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epoch = param["num_epochs"]

# torch.save does not create missing parent directories.
os.makedirs("./models/LFR", exist_ok=True)

for size in param["size"]:
    for c in param["num_copies"]:

        data_train = f"LFR_5_graphs_{c}_copies_{size}_size.pickle"

        #Load training data (a pickle written by the dataset-creation cell of this notebook)
        print(f"Loading data...")
        with open("./data_splits/train/"+data_train,"rb") as fopen:
            list_graph_train,list_n_seq_train,list_num_node_train,bc_mat_train, deg_mat_train = pickle.load(fopen)

        list_adj_train,list_adj_t_train = graph_to_adj_bet(list_graph_train,list_n_seq_train,list_num_node_train,size)

        # Re-seed before building each model so every configuration starts
        # from the same torch RNG state.
        torch.manual_seed(param["model_seed"])

        model = GNN_Bet(ninput=size,nhid=hidden,dropout=0.6)
        model.to(device)

        optimizer = torch.optim.Adam(model.parameters(),lr=0.0005)

        for e in range(num_epoch):
            print(f"{c}_copies_{size}_size_{e}_epoch_{datetime.now().strftime('%d/%m/%Y %H:%M:%S')}")
            train(list_adj_train,list_adj_t_train,list_num_node_train,bc_mat_train,model,device,optimizer,size)

            # One checkpoint per epoch; the evaluation cell loads them by this name.
            saving_path = f"./models/LFR/LFR_5_graphs_{c}_copies_{size}_size_{e}_epoch"
            torch.save(model.state_dict(), saving_path)

Loading data...
Processing 5 graphs...
1_copies_1000_size_0_epoch_11/05/2023 18:46:38
1_copies_1000_size_1_epoch_11/05/2023 18:46:38
1_copies_1000_size_2_epoch_11/05/2023 18:46:39
1_copies_1000_size_3_epoch_11/05/2023 18:46:39
1_copies_1000_size_4_epoch_11/05/2023 18:46:39
1_copies_1000_size_5_epoch_11/05/2023 18:46:39
1_copies_1000_size_6_epoch_11/05/2023 18:46:39
1_copies_1000_size_7_epoch_11/05/2023 18:46:39
1_copies_1000_size_8_epoch_11/05/2023 18:46:39
1_copies_1000_size_9_epoch_11/05/2023 18:46:40
1_copies_1000_size_10_epoch_11/05/2023 18:46:40
1_copies_1000_size_11_epoch_11/05/2023 18:46:40
1_copies_1000_size_12_epoch_11/05/2023 18:46:40
1_copies_1000_size_13_epoch_11/05/2023 18:46:40
1_copies_1000_size_14_epoch_11/05/2023 18:46:40
Loading data...
Processing 50 graphs...
10_copies_1000_size_0_epoch_11/05/2023 18:46:42
10_copies_1000_size_1_epoch_11/05/2023 18:46:43
10_copies_1000_size_2_epoch_11/05/2023 18:46:45
10_copies_1000_size_3_epoch_11/05/2023 18:46:46
10_copies_1000_size

KeyboardInterrupt: 

### Evaluate the LFR-trained models on the LFR test graphs

In [10]:
size = 10000

# One row per evaluated checkpoint; re-written to CSV after every evaluation.
Results = {"graph":[],
            "size": [],
            "copies":[],
            "epochs": [],
            "kendalltau":[],
            "std":[]}

data_path_test = f'LFR_10_graphs_{size}_size.pickle'

#Load test data (a pickle written by the dataset-creation cell of this notebook)
with open("./data_splits/test/"+data_path_test,"rb") as fopen:
    list_graph_test,list_n_seq_test,list_num_node_test,bc_mat_test,deg_mat_test = pickle.load(fopen)

list_adj_test,list_adj_t_test = graph_to_adj_bet(list_graph_test,list_n_seq_test,list_num_node_test,size)

#Model parameters and device do not depend on the loop variables, so set them once
hidden = 20
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# DataFrame.to_csv does not create missing directories.
os.makedirs("outputs", exist_ok=True)

# The copies / epoch ranges must match the checkpoints written by the training cell.
for c in [1,10,20,40]:
    for e in range(15):

        print(f"copies: {c}, epoch {e}")

        model_path = f"./models/LFR/LFR_5_graphs_{c}_copies_{size}_size_{e}_epoch"

        model = GNN_Bet(ninput=size,nhid=hidden,dropout=0.6)

        # map_location lets checkpoints saved on a GPU be loaded on a CPU-only machine.
        model.load_state_dict(torch.load(model_path, map_location=device))

        model.to(device)
        # Switch to inference mode so the 0.6 dropout is disabled while scoring
        # (a no-op if test() already does this itself).
        model.eval()


        with torch.no_grad():
            r = test(list_adj_test,list_adj_t_test,list_num_node_test,bc_mat_test,deg_mat_test,model,device,size)

        Results["graph"].append(f"LFR_10_graphs_{size}_size")
        Results["size"].append(size)
        Results["copies"].append(c)
        Results["epochs"].append(e)
        Results["kendalltau"].append(r["kt"])
        Results["std"].append(r["std"])


        # Written inside the loop on purpose: partial results survive an interrupted run.
        df = pd.DataFrame.from_dict(Results)
        df.to_csv("outputs/output_LFR_synthetic_graphs_peformance.csv")


Processing 10 graphs...
copies: 1, epoch 0
   Average KT score on test graphs is: 0.7613041872530718 and std: 0.010222573572815108
copies: 1, epoch 1
   Average KT score on test graphs is: 0.770461461009967 and std: 0.00975405434186813
copies: 1, epoch 2
   Average KT score on test graphs is: 0.7778913929244851 and std: 0.010010685480803984
copies: 1, epoch 3
   Average KT score on test graphs is: 0.7828547949847037 and std: 0.010147059202458808
copies: 1, epoch 4
   Average KT score on test graphs is: 0.7870576300286561 and std: 0.010084159779585782
copies: 1, epoch 5
   Average KT score on test graphs is: 0.7919222450060553 and std: 0.010239235987767855
copies: 1, epoch 6
   Average KT score on test graphs is: 0.7966836206702638 and std: 0.010165061896204811
copies: 1, epoch 7
   Average KT score on test graphs is: 0.8014777542024379 and std: 0.010192258162592603
copies: 1, epoch 8
   Average KT score on test graphs is: 0.8065362333751412 and std: 0.010399032404370866
copies: 1, epoc

FileNotFoundError: [Errno 2] No such file or directory: './models/LFR/LFR_5_graphs_40_copies_1000_size_0_epoch'