## Training with LFR and testing over real networks 2

This notebook contains code for running the experiments shown in Section 3.3.2, in which the models are trained on synthetic networks (LFR and SF) and tested on real networks.

Four training sets are considered: 5 LFR graphs, 5 SF graphs, 50 LFR graphs and 50 SF graphs.

In [None]:
from networkit import *
import pickle
import networkx as nx
from networkx.generators.community import LFR_benchmark_graph
from datetime import datetime

In [None]:
def nx2nkit(g_nx):
    """Copy a networkx graph into a new directed networkit ``Graph``.

    One networkit node is added per networkx node, then every networkx edge
    ``(u, v)`` is added as ``addEdge(u, v)``.

    NOTE(review): the edge endpoints are passed through unchanged, so this
    presumably expects nodes labelled 0..n-1 (the callers in this notebook
    relabel with ``nx.convert_node_labels_to_integers`` first) — confirm
    before using it on arbitrarily labelled graphs.

    Parameters
    ----------
    g_nx : networkx graph to convert.

    Returns
    -------
    The populated networkit graph.

    Raises
    ------
    AssertionError
        If the node or edge counts of the two graphs differ after the copy.
    """
    expected_nodes = g_nx.number_of_nodes()
    converted = Graph(directed=True)

    # Create the nodes first so every edge endpoint exists when edges are added.
    for _ in range(expected_nodes):
        converted.addNode()

    for source, target in g_nx.edges():
        converted.addEdge(source, target)

    # Sanity checks: the copy must have exactly the same size as the input.
    assert expected_nodes==converted.numberOfNodes(),"Number of nodes not matching"
    assert g_nx.number_of_edges()==converted.numberOfEdges(),"Number of edges not matching"

    return converted


def cal_exact_bet(g_nkit):
    """Compute normalized betweenness for ``g_nkit`` as a dictionary.

    Runs ``centrality.Betweenness`` with ``normalized=True`` and converts the
    resulting ranking (a sequence of ``(node, score)`` entries) into a
    ``{node: score}`` mapping.
    """
    ranking = centrality.Betweenness(g_nkit,normalized=True).run().ranking()
    return {entry[0]: entry[1] for entry in ranking}


def cal_exact_degree(g_nkit):
    """Compute unnormalized degree centrality for ``g_nkit`` as a dictionary.

    Runs ``centrality.DegreeCentrality`` with ``normalized=False`` and turns
    the ranking entries into a ``{node: degree}`` mapping.
    """
    degree_by_node = {}
    ranking = centrality.DegreeCentrality(g_nkit,normalized=False).run().ranking()
    for entry in ranking:
        node, degree = entry[0], entry[1]
        degree_by_node[node] = degree
    return degree_by_node

    
def generate_bet_LFR_data(num_nodes, num_of_graphs,output_path):
    """Generate LFR benchmark graphs with exact centralities and pickle them.

    For each of the ``num_of_graphs`` graphs, an LFR graph is generated
    (retrying until generation succeeds), isolated nodes are removed, nodes
    are relabelled to consecutive integers, and exact betweenness and degree
    are computed through networkit. The accumulated list of
    ``[g_nx, bet_dict, deg_dict]`` entries is written to ``output_path``
    after every graph, so a partial result is on disk if the run is stopped.

    Parameters
    ----------
    num_nodes : number of nodes requested from ``LFR_benchmark_graph``.
    num_of_graphs : how many graphs to generate.
    output_path : path of the pickle file to (over)write.

    Returns
    -------
    None. The result is only written to ``output_path``.
    """
    # Local import: ``os`` is not imported by the cells above this definition.
    import os

    # Create the target directory up front so a missing folder is not
    # discovered only after the first (expensive) graph has been computed.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    list_bet_data = list()

    for i in range(num_of_graphs):

        # LFR generation can fail for a given random draw; retry until it
        # succeeds. Only ``Exception`` is caught so that KeyboardInterrupt /
        # SystemExit still stop the loop (a bare ``except`` made it
        # uninterruptible).
        while True:
            print(f"Graph index:{i+1}/{num_of_graphs}, Time: {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}")
            try:
                g_nx = LFR_benchmark_graph(n=num_nodes,tau1=3,tau2=1.5,mu=0.05,average_degree=6,min_community=20)
            except Exception as err:
                print(f"LFR generation failed ({err!r}), retrying")
                continue
            else:
                break
        print("removing isolates")

        if nx.number_of_isolates(g_nx)>0:
            g_nx.remove_nodes_from(list(nx.isolates(g_nx)))

        # Relabel to 0..n-1 after removing nodes, before handing the graph
        # to the networkit conversion.
        g_nx = nx.convert_node_labels_to_integers(g_nx)
        g_nkit = nx2nkit(g_nx)
        bet_dict = cal_exact_bet(g_nkit)
        deg_dict = cal_exact_degree(g_nkit)
        list_bet_data.append([g_nx,bet_dict,deg_dict])

        # Checkpoint: rewrite the whole list after every graph.
        with open(output_path,"wb") as fopen:
            pickle.dump(list_bet_data,fopen)


### The sets of 5 and 50 LFR graphs are created

In [None]:
# Build the two LFR graph sets (5 and 50 graphs); both use the same node count
# and the same file-name pattern.
num_nodes = 10000

for num_of_graphs in (5, 50):
    output_path = f"graphs/LFR_{num_of_graphs}_graphs_{num_nodes}_nodes.pickle"
    generate_bet_LFR_data(num_nodes, num_of_graphs,output_path)

In [1]:
import sys
sys.path.append("./functions")

from utils import *
from model_bet import *
import pandas as pd
import os
import time

### The sets of 5 and 50 SF graphs are created

In [None]:
# Generate the 5-graph SF set: create each graph, drop isolated nodes, relabel
# nodes to consecutive integers, compute exact betweenness/degree, and pickle
# the list of [graph, betweenness, degree] entries.
random.seed(10)

param = dict(
    min_nodes=10000,
    max_nodes=10001,
    num_of_graphs=5,
    graph_types=["SF"],
)

for graph_type in param["graph_types"]:

    print("Generating graphs and calculating centralities...")
    list_bet_data = []

    for i in range(param['num_of_graphs']):
        timestamp = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print(f"{timestamp}: Graph index:{i+1}/{param['num_of_graphs']}")
        g_nx = create_graph(graph_type,param['min_nodes'],param['max_nodes'])

        # Only touch the graph when it actually contains isolated nodes.
        isolated_nodes = list(nx.isolates(g_nx))
        if isolated_nodes:
            g_nx.remove_nodes_from(isolated_nodes)

        g_nx = nx.convert_node_labels_to_integers(g_nx)

        g_nkit = nx2nkit(g_nx)
        bet_dict = cal_exact_bet(g_nkit)
        deg_dict = cal_exact_degree(g_nkit)
        list_bet_data.append([g_nx,bet_dict,deg_dict])

    fname_bet = f"./graphs/{graph_type}_{param['num_of_graphs']}_graphs_{param['min_nodes']}_nodes.pickle"
    with open(fname_bet,"wb") as fopen:
        pickle.dump(list_bet_data,fopen)

print("")
print("Graphs saved")

In [None]:
# Generate the 50-graph SF set with the same pipeline as the 5-graph set:
# create, remove isolates, relabel, compute exact centralities, pickle.
random.seed(10)

param = dict(
    min_nodes=10000,
    max_nodes=10001,
    num_of_graphs=50,
    graph_types=["SF"],
)

for graph_type in param["graph_types"]:

    print("Generating graphs and calculating centralities...")
    list_bet_data = []

    for i in range(param['num_of_graphs']):
        timestamp = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print(f"{timestamp}: Graph index:{i+1}/{param['num_of_graphs']}")
        g_nx = create_graph(graph_type,param['min_nodes'],param['max_nodes'])

        # Only touch the graph when it actually contains isolated nodes.
        isolated_nodes = list(nx.isolates(g_nx))
        if isolated_nodes:
            g_nx.remove_nodes_from(isolated_nodes)

        g_nx = nx.convert_node_labels_to_integers(g_nx)

        g_nkit = nx2nkit(g_nx)
        bet_dict = cal_exact_bet(g_nkit)
        deg_dict = cal_exact_degree(g_nkit)
        list_bet_data.append([g_nx,bet_dict,deg_dict])

    fname_bet = f"./graphs/{graph_type}_{param['num_of_graphs']}_graphs_{param['min_nodes']}_nodes.pickle"
    with open(fname_bet,"wb") as fopen:
        pickle.dump(list_bet_data,fopen)

print("")
print("Graphs saved")

### The SF and LFR training datasets for testing over real networks are created

In [4]:
# Turn every pickled graph set (LFR/SF x 5/50 graphs) into a training split.
random.seed(10)

param = {
    "graph_types": ["LFR","SF"],
    "set_graphs": [5,50],
    "num_train_set_graph": [5,50],
    "nodes": 10000,
    "size" : [10000],#[10000,100000,300000,900000],
    "num_test" : 0,
    "num_copies": [1]
}

for graph_type in param["graph_types"]:
    for idx,set_graph in enumerate(param["set_graphs"]):

        graphs_file = f"./graphs/{graph_type}_{set_graph}_graphs_{param['nodes']}_nodes.pickle"
        with open(graphs_file,"rb") as fopen:
            list_data = pickle.load(fopen)

        # The i-th graph set is paired with the i-th requested training size.
        num_graph = len(list_data)
        num_train = param["num_train_set_graph"][idx]
        assert num_train+param["num_test"] == num_graph,"Required split size doesn't match number of graphs in pickle file."

        # Nothing to write when no training graphs are requested.
        if not num_train > 0:
            continue

        for size in param["size"]:
            for c in param["num_copies"]:
                #For training split
                dataset = create_dataset(list_data[:num_train],num_copies = c,adj_size=size)
                list_graph, list_n_sequence, list_node_num, cent_mat, deg_mat = dataset

                split_file = f"./data_splits/train/{graph_type}_{num_train}_graphs_{param['nodes']}_nodes_{c}_copies_{size}_size.pickle"
                with open(split_file,"wb") as fopen:
                    pickle.dump([list_graph,list_n_sequence,list_node_num,cent_mat, deg_mat],fopen)

### Training and saving models with the generated datasets

In [None]:

# Train one model per (graph type, training-set size, adjacency size, copies,
# seed) combination and store a checkpoint after every epoch.
param = {
    "graph_types": ["LFR","SF"],
    "num_train_graphs": [5,50],
    "nodes": 10000,
    "size" : [10000],#[10000,100000,300000,900000],
    "num_copies": [1],#10,20,40],
    "model_seeds": [j for j in range(15)],
    "num_epochs": 10
}

#Model parameters (identical for every run, so set once)
hidden = 20
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epoch = param["num_epochs"]

for graph_type in param["graph_types"]:
    for num_train_graphs in param["num_train_graphs"]:
        for size in param["size"]:
            for c in param["num_copies"]:

                # Common stem of the training-split file and of all checkpoints.
                run_stem = f"{graph_type}_{num_train_graphs}_graphs_{param['nodes']}_nodes_{c}_copies_{size}_size"
                data_train = run_stem + ".pickle"

                #Load training data
                print("Loading data...")
                with open("./data_splits/train/"+data_train,"rb") as fopen:
                    loaded_split = pickle.load(fopen)
                list_graph_train,list_n_seq_train,list_num_node_train,bc_mat_train, deg_mat_train = loaded_split

                list_adj_train,list_adj_t_train = graph_to_adj_bet(list_graph_train,list_n_seq_train,list_num_node_train,size)

                for seed in param["model_seeds"]:

                    # Seed before building the model so its initialisation is
                    # tied to this seed.
                    torch.manual_seed(seed)

                    model = GNN_Bet(ninput=size,nhid=hidden,dropout=0.6)
                    model.to(device)
                    optimizer = torch.optim.Adam(model.parameters(),lr=0.0005)

                    for e in range(num_epoch):
                        timestamp = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                        print(f"{graph_type}_{num_train_graphs}_{c}_copies_{size}_size_{e}_epoch_{seed}_seed_{timestamp}")
                        train(list_adj_train,list_adj_t_train,list_num_node_train,bc_mat_train,model,device,optimizer,size)

                        # One checkpoint per epoch.
                        saving_path = f"./models/{graph_type}/{run_stem}_{seed}_seed_{e}_epoch"
                        torch.save(model.state_dict(), saving_path)

### Reading the generated models and testing over real networks when using 5 training SF and LFR graphs

In [None]:
# Evaluate every saved checkpoint of the models trained on 5 LFR / 5 SF graphs
# against the real test graphs, and store predictions and Kendall-tau ('kt')
# values in ``Results``.
param = {
    "graphs": ['1-wiki-Vote'],#'2-soc-Epinions'],
    "sizes" : [10000],#100000],
    "train_graph_nodes": 10000,
    "copies": [1],#10,20,40],
    "seeds": [j for j in range(15)],
    "epochs": [j for j in range(10)]
}

# Test graphs are paired positionally with their adjacency size.
assert len(param["sizes"]) >= len(param["graphs"]), "Each test graph needs a matching entry in 'sizes'."

train_set_size = 5            # number of training graphs the evaluated models were trained on
families = ("LFR", "SF")      # evaluation order per (epoch, seed): LFR first, then SF
hidden = 20
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
results_path = f"./outputs/LFR_SF_real_performance_{train_set_size}_traingraphs_{len(param['epochs'])}_epochs_{len(param['seeds'])}_seeds.pickle"

Results = {j:{'LFR': {}, 'SF': {}} for j in param["graphs"]}

for g, size in zip(param["graphs"], param["sizes"]):

    data_test = f'{g}_{size}_size.pickle'
    size_key = f"{size}_size"

    #Load test data
    # NOTE: pickle.load runs arbitrary code; only load files produced by this project.
    with open("./data_splits/test/"+data_test,"rb") as fopen:
        list_graph_test,list_n_seq_test,list_num_node_test,bc_mat_test,deg_mat_test = pickle.load(fopen)

    list_adj_test,list_adj_t_test = graph_to_adj_bet(list_graph_test,list_n_seq_test,list_num_node_test,size)

    for family in families:
        Results[g][family][size_key] = {'test_graph': data_test, 'real': [], 'deg': []}

    for c in param["copies"]:

        copies_key = f"{c}_copies"

        # Checkpoint names share the stem of the training-split file name.
        train_stems = {
            family: f"{family}_{train_set_size}_graphs_{param['train_graph_nodes']}_nodes_{c}_copies_{size}_size"
            for family in families
        }
        for family in families:
            Results[g][family][size_key][copies_key] = {'data_train' : train_stems[family] + ".pickle",'pred':{}}

        for epoch in param["epochs"]:

            epoch_key = f'{epoch}_epoch'
            for family in families:
                Results[g][family][size_key][copies_key]['pred'][epoch_key] = {}

            for seed in param["seeds"]:
                for family in families:

                    model_path = f'{train_stems[family]}_{seed}_seed_{epoch}_epoch'
                    print(model_path)

                    model = GNN_Bet(ninput=size,nhid=hidden,dropout=0.6)
                    # map_location lets checkpoints saved on a GPU be loaded
                    # on a CPU-only machine (and vice versa).
                    state_dict = torch.load(f'models/{family}/{model_path}', map_location=device)
                    model.load_state_dict(state_dict)
                    model.to(device)
                    # Inference mode so dropout is disabled; harmless if
                    # test_onegraph already switches the model to eval.
                    model.eval()

                    with torch.no_grad():
                        r = test_onegraph(list_adj_test,list_adj_t_test,list_num_node_test,bc_mat_test,deg_mat_test,model=model,device=device,size=size)

                    family_results = Results[g][family][size_key]
                    family_results[copies_key]['pred'][epoch_key][f'{seed}_seed'] = {'pred':r['pred'],'kt':r['kt']}

                    # Ground truth and degree are the same for every model;
                    # store them once per family.
                    if len(family_results['real']) == 0:
                        family_results['real'] = r['true']
                        family_results['deg'] = r['deg']

                    # Checkpoint the results after every evaluated model.
                    with open(results_path,"wb") as fopen:
                        pickle.dump(Results,fopen)



### Reading the generated models and testing over real networks when using 50 training SF and LFR graphs

In [None]:
# Evaluate every saved checkpoint of the models trained on 50 LFR / 50 SF
# graphs against the real test graphs, and store predictions and Kendall-tau
# ('kt') values in ``Results``.
param = {
    "graphs": ['1-wiki-Vote'],#'2-soc-Epinions'],
    "sizes" : [10000],#100000],
    "train_graph_nodes": 10000,
    "copies": [1],#10,20,40],
    "seeds": [j for j in range(15)],
    "epochs": [j for j in range(10)]
}

# Test graphs are paired positionally with their adjacency size.
assert len(param["sizes"]) >= len(param["graphs"]), "Each test graph needs a matching entry in 'sizes'."

train_set_size = 50           # number of training graphs the evaluated models were trained on
families = ("LFR", "SF")      # evaluation order per (epoch, seed): LFR first, then SF
hidden = 20
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
results_path = f"./outputs/LFR_SF_real_performance_{train_set_size}_traingraphs_{len(param['epochs'])}_epochs_{len(param['seeds'])}_seeds.pickle"

Results = {j:{'LFR': {}, 'SF': {}} for j in param["graphs"]}

for g, size in zip(param["graphs"], param["sizes"]):

    data_test = f'{g}_{size}_size.pickle'
    size_key = f"{size}_size"

    #Load test data
    # NOTE: pickle.load runs arbitrary code; only load files produced by this project.
    with open("./data_splits/test/"+data_test,"rb") as fopen:
        list_graph_test,list_n_seq_test,list_num_node_test,bc_mat_test,deg_mat_test = pickle.load(fopen)

    list_adj_test,list_adj_t_test = graph_to_adj_bet(list_graph_test,list_n_seq_test,list_num_node_test,size)

    for family in families:
        Results[g][family][size_key] = {'test_graph': data_test, 'real': [], 'deg': []}

    for c in param["copies"]:

        copies_key = f"{c}_copies"

        # Checkpoint names share the stem of the training-split file name.
        train_stems = {
            family: f"{family}_{train_set_size}_graphs_{param['train_graph_nodes']}_nodes_{c}_copies_{size}_size"
            for family in families
        }
        for family in families:
            Results[g][family][size_key][copies_key] = {'data_train' : train_stems[family] + ".pickle",'pred':{}}

        for epoch in param["epochs"]:

            epoch_key = f'{epoch}_epoch'
            for family in families:
                Results[g][family][size_key][copies_key]['pred'][epoch_key] = {}

            for seed in param["seeds"]:
                for family in families:

                    model_path = f'{train_stems[family]}_{seed}_seed_{epoch}_epoch'
                    print(model_path)

                    model = GNN_Bet(ninput=size,nhid=hidden,dropout=0.6)
                    # map_location lets checkpoints saved on a GPU be loaded
                    # on a CPU-only machine (and vice versa).
                    state_dict = torch.load(f'models/{family}/{model_path}', map_location=device)
                    model.load_state_dict(state_dict)
                    model.to(device)
                    # Inference mode so dropout is disabled; harmless if
                    # test_onegraph already switches the model to eval.
                    model.eval()

                    with torch.no_grad():
                        r = test_onegraph(list_adj_test,list_adj_t_test,list_num_node_test,bc_mat_test,deg_mat_test,model=model,device=device,size=size)

                    family_results = Results[g][family][size_key]
                    family_results[copies_key]['pred'][epoch_key][f'{seed}_seed'] = {'pred':r['pred'],'kt':r['kt']}

                    # Ground truth and degree are the same for every model;
                    # store them once per family.
                    if len(family_results['real']) == 0:
                        family_results['real'] = r['true']
                        family_results['deg'] = r['deg']

                    # Checkpoint the results after every evaluated model.
                    with open(results_path,"wb") as fopen:
                        pickle.dump(Results,fopen)

