In [1]:
import sys
sys.path.append("./functions")

from utils import *
from model_bet import *
from layer import *

### Reading real graphs and computing the betweenness centrality

In [2]:
def read_compute_target(in_path,out_path):
    """Read an edge list, compute its centrality targets and pickle the result.

    The graph is loaded from ``in_path`` with ``nx.read_edgelist`` (lines
    starting with ``#`` are treated as comments, node ids are parsed as
    ``int``). Isolated nodes are removed and the remaining nodes are
    relabelled with ``nx.convert_node_labels_to_integers``. The relabelled
    graph is converted with ``nx2nkit`` and handed to ``cal_exact_bet`` and
    ``cal_exact_degree``.

    Parameters
    ----------
    in_path : str
        Path of the edge-list text file to read.
    out_path : str
        Path of the pickle file to write. Its parent directory is created
        when it does not exist yet.

    Returns
    -------
    None
        The one-element list ``[[g_nx, bet_dict, deg_dict]]`` is pickled to
        ``out_path`` and a confirmation line is printed.
    """
    # Function-scope import so this cell does not depend on `os` being
    # re-exported by the star imports at the top of the notebook.
    import os

    # Create the destination directory *before* the centrality computation:
    # otherwise a missing directory is only discovered by open() below, after
    # all the work has been done, and the result is lost.
    out_dir = os.path.dirname(out_path)
    if out_dir:  # dirname is "" for a bare file name; makedirs("") would raise
        os.makedirs(out_dir, exist_ok=True)

    g_nx = nx.read_edgelist(in_path, comments='#', nodetype=int)

    # Drop degree-0 nodes before relabelling.
    if nx.number_of_isolates(g_nx)>0:
        g_nx.remove_nodes_from(list(nx.isolates(g_nx)))

    g_nx = nx.convert_node_labels_to_integers(g_nx)
    g_nkit = nx2nkit(g_nx)
    bet_dict = cal_exact_bet(g_nkit)
    deg_dict = cal_exact_degree(g_nkit)

    # Stored as a list of [graph, bet, deg] triples; the dataset cells further
    # down call len() on the loaded list and slice it.
    list_bet_data = [[g_nx,bet_dict,deg_dict]]

    with open(out_path,"wb") as fopen:
        pickle.dump(list_bet_data,fopen)

    print(f"Bet computed and graph saved: {out_path}")


# Base names (no extension) of the edge-list files read from
# ./real_graphs/original/.
networks = [
    '1-wiki-Vote',
    '2-soc-Epinions',
    '3-email-EuAll',
    '4-web-Google',
]

# One pickle per network is written to ./real_graphs/bet_real_graphs/.
for net in networks:
    in_path_graph, out_path_graph = (
        f"./real_graphs/original/{net}.txt",
        f"./real_graphs/bet_real_graphs/{net}.pickle",
    )
    read_compute_target(in_path_graph, out_path_graph)

Bet computed and graph saved: ./real_graphs/bet_real_graphs/1-wiki-Vote.pickle


### Creating the test datasets for the real graphs

In [2]:
# Settings for the real-graph test sets. "adj_size" and "graph_files" are
# parallel lists: entry i of "adj_size" is used for graph file i.
# NOTE(review): "num_copies" is declared here but never read; the loop below
# passes the literal num_copies = 1 to create_dataset.
param = {
    "adj_size" : [10000,100000,300000,900000],
    "graph_files": ['1-wiki-Vote','2-soc-Epinions','3-email-EuAll','4-web-Google'],
    "split_seeds": [10],

    "num_train" : 0,
    "num_test" : 1,
    "num_copies": [1],
}


for idx,file in enumerate(param["graph_files"]):
    for splitseed in param["split_seeds"]:
        adj_size = param["adj_size"][idx]

        # Seed Python's RNG before anything else happens for this split.
        random.seed(splitseed)

        source_pickle = f"./real_graphs/bet_real_graphs/{file}.pickle"
        with open(source_pickle,"rb") as fopen:
            list_data = pickle.load(fopen)

        # The requested train/test sizes must account for every stored graph.
        num_graph = len(list_data)
        requested_total = param["num_train"]+param["num_test"]
        assert requested_total == num_graph,"Required split size doesn't match number of graphs in pickle file."

        # Nothing to write when no test graphs are requested.
        if not param["num_test"] > 0:
            continue

        # The test graphs are the ones stored right after the training graphs.
        test_begin = param["num_train"]
        test_end = test_begin+param["num_test"]
        list_graph, list_n_sequence, list_node_num, cent_mat, deg_mat = create_dataset(
            list_data[test_begin:test_end],
            num_copies = 1,
            adj_size=adj_size,
        )

        split_pickle = f"./data_splits/test/{file}_{param['num_test']}_test_{adj_size}_size_{splitseed}_splitseed.pickle"
        with open(split_pickle,"wb") as fopen:
            pickle.dump([list_graph,list_n_sequence,list_node_num,cent_mat, deg_mat],fopen)

### Generating scale-free graphs for training

In [3]:
# Settings for the synthetic training graphs: one pickle is written per
# (graph type, node count, generation seed) combination, each holding
# "num_of_graphs" graphs.
param = {
    "nodes": [10,100,1000,10000],
    "num_of_graphs": 5,
    "graph_types": ["SF"],
    "generation_seeds": [10]
}


for graph_type in param["graph_types"]:
    for nodes in param["nodes"]:
        for seed in param["generation_seeds"]:

            # Re-seed Python's RNG at the start of every batch of graphs.
            random.seed(seed)

            print(f"Generating {param['num_of_graphs']} {graph_type} graphs of {nodes} nodes")
            list_bet_data = []
            for i in range(param['num_of_graphs']):
                print(f"{datetime.now().strftime('%d/%m/%Y %H:%M:%S')}: Graph index:{i+1}/{param['num_of_graphs']}")

                # NOTE(review): the meaning of the third argument (nodes+1) is
                # defined by create_graph, which is not visible here. Confirm.
                g_nx = create_graph(graph_type,nodes,nodes+1)

                # Relabel only when isolated nodes were actually removed.
                isolated_nodes = list(nx.isolates(g_nx))
                if isolated_nodes:
                    g_nx.remove_nodes_from(isolated_nodes)
                    g_nx = nx.convert_node_labels_to_integers(g_nx)

                g_nkit = nx2nkit(g_nx)
                bet_dict = cal_exact_bet(g_nkit)
                deg_dict = cal_exact_degree(g_nkit)
                list_bet_data.append([g_nx,bet_dict,deg_dict])

            # All graphs of this batch go into a single pickle file.
            fname_bet = f"./graphs/{graph_type}_{param['num_of_graphs']}_graphs_{nodes}_nodes_{seed}_genseed.pickle"
            with open(fname_bet,"wb") as fopen:
                pickle.dump(list_bet_data,fopen)

print("Graphs saved")

Generating 5 SF graphs of 10 nodes
10/05/2023 19:41:49: Graph index:1/5
10/05/2023 19:41:49: Graph index:2/5
10/05/2023 19:41:49: Graph index:3/5
10/05/2023 19:41:49: Graph index:4/5
10/05/2023 19:41:49: Graph index:5/5
Generating 5 SF graphs of 100 nodes
10/05/2023 19:41:49: Graph index:1/5
10/05/2023 19:41:49: Graph index:2/5
10/05/2023 19:41:49: Graph index:3/5
10/05/2023 19:41:49: Graph index:4/5
10/05/2023 19:41:49: Graph index:5/5
Generating 5 SF graphs of 1000 nodes
10/05/2023 19:41:49: Graph index:1/5
10/05/2023 19:41:49: Graph index:2/5
10/05/2023 19:41:50: Graph index:3/5
10/05/2023 19:41:50: Graph index:4/5
10/05/2023 19:41:50: Graph index:5/5
Generating 5 SF graphs of 10000 nodes
10/05/2023 19:41:51: Graph index:1/5


RuntimeError: Received CTRL+C/SIGINT

### Generating the different training splits using the SF graphs

In [4]:
# Settings for the training splits: one split file is written for every
# combination of adjacency size, node count, copy count, generation seed and
# split seed.
param = {
    "nodes": [10,100,1000,10000],
    "adj_size" : [10000,100000,300000,900000],
    "num_train" : 5,
    "num_test" : 0,
    "num_copies": [1,10,20,40],
    "split_seeds": [0],
    "generation_seeds": [10]
}


for size in param["adj_size"]:
    for nodes in param["nodes"]:
        for num_copies in param["num_copies"]:
            for genseed in param["generation_seeds"]:
                for splitseed in param["split_seeds"]:
                    # Seed Python's RNG before the data for this split is read.
                    random.seed(splitseed)

                    graphs_pickle = f"./graphs/SF_5_graphs_{nodes}_nodes_{genseed}_genseed.pickle"
                    with open(graphs_pickle,"rb") as fopen:
                        list_data = pickle.load(fopen)

                    # The requested train/test sizes must account for every stored graph.
                    num_graph = len(list_data)
                    requested_total = param["num_train"]+param["num_test"]
                    assert requested_total == num_graph,"Required split size doesn't match number of graphs in pickle file."

                    # Nothing to write when no training graphs are requested.
                    if not param["num_train"] > 0:
                        continue

                    # The training graphs are the first "num_train" entries.
                    train_end = param["num_train"]
                    list_graph, list_n_sequence, list_node_num, cent_mat, deg_mat = create_dataset(
                        list_data[:train_end],
                        num_copies = num_copies,
                        adj_size=size,
                    )

                    split_pickle = f"./data_splits/train/SF_5_graphs_{nodes}_nodes_{genseed}_genseed_{param['num_train']}_train_{num_copies}_copies_{size}_size_{splitseed}_splitseed.pickle"
                    with open(split_pickle,"wb") as fopen:
                        pickle.dump([list_graph,list_n_sequence,list_node_num,cent_mat, deg_mat],fopen)


FileNotFoundError: [Errno 2] No such file or directory: './graphs/SF_5_graphs_10000_nodes_10_genseed.pickle'

### Train and test using the different graph sets created

In [3]:
 
import pandas as pd

# Experiment grid. "adj_size" and "graph_files" are parallel lists: test graph
# i is paired with adjacency size adj_size[i], and only training splits built
# with that same size are loaded for it.
param = {
    "adj_size" : [10000,100000,300000,900000],
    "graph_files": ['1-wiki-Vote','2-soc-Epinions','3-email-EuAll','4-web-Google'],
    "nodes": [10,100,1000,10000],
    "num_train" : 5,
    "num_test" : 0,
    "num_copies": [1,10,20,40],
    "generation_seeds": [10],
    "split_seeds": [0],
    "test_split_seed": 10,
    "model_seeds": [15],
    "num_epochs": 15,
}


# Column-oriented log: one entry is appended to every list after each epoch.
# NOTE(review): the key "splilt_seed" looks like a typo of "split_seed"; it is
# kept as-is because it becomes a column name of the CSV written below.
Results = {
    "test_graph": [],
    "adj_size": [],
    "train_file": [],
    "nodes": [],
    "copies": [],
    "generation_seed": [],
    "splilt_seed": [],
    "model_seed": [],
    "epochs": [],
    "kendalltau": [],
    "std": [],
}


for idx,graph_file in enumerate(param["graph_files"]):
    adj_size = param["adj_size"][idx]

    # The test split is loaded once per real graph and reused for every
    # training configuration below.
    test_file = f"{graph_file}_1_test_{adj_size}_size_{param['test_split_seed']}_splitseed.pickle"
    with open("./data_splits/test/"+test_file,"rb") as fopen:
        list_graph_test,list_n_seq_test,list_num_node_test,bc_mat_test,deg_mat_test = pickle.load(fopen)

    list_adj_test,list_adj_t_test = graph_to_adj_bet(
        list_graph_test,
        list_n_seq_test,
        list_num_node_test,
        adj_size,
    )

    for nodes in param["nodes"]:
        for num_copies in param["num_copies"]:
            for genseed in param["generation_seeds"]:
                for splitseed in param["split_seeds"]:

                    train_file = f"SF_5_graphs_{nodes}_nodes_{genseed}_genseed_{param['num_train']}_train_{num_copies}_copies_{adj_size}_size_{splitseed}_splitseed.pickle"

                    print(f"Loading data...")
                    with open("./data_splits/train/"+train_file,"rb") as fopen:
                        list_graph_train,list_n_seq_train,list_num_node_train,bc_mat_train,deg_mat_train = pickle.load(fopen)

                    # The first dimension of the training target matrix must
                    # equal the adjacency size chosen for this test graph.
                    model_size = bc_mat_train.shape[0]
                    assert model_size == adj_size

                    list_adj_train,list_adj_t_train = graph_to_adj_bet(
                        list_graph_train,
                        list_n_seq_train,
                        list_num_node_train,
                        adj_size,
                    )

                    for model_seed in param["model_seeds"]:
                        # A fresh model and optimizer are built for every seed.
                        torch.manual_seed(model_seed)
                        hidden = 20
                        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                        model = GNN_Bet(ninput=model_size,nhid=hidden,dropout=0.6)
                        model.to(device)

                        optimizer = torch.optim.Adam(model.parameters(),lr=0.0005)
                        num_epoch = param["num_epochs"]

                        print(f"Training, total Number of epoches: {num_epoch}")
                        for e in range(num_epoch):
                            print(f"Epoch number: {e+1}/{num_epoch}")
                            train(
                                list_adj_train,
                                list_adj_t_train,
                                list_num_node_train,
                                bc_mat_train,
                                model,
                                device,
                                optimizer,
                                model_size,
                            )

                            # Evaluate on the real graph after every epoch,
                            # with gradient tracking switched off.
                            with torch.no_grad():
                                r = test(
                                    list_adj_test,
                                    list_adj_t_test,
                                    list_num_node_test,
                                    bc_mat_test,
                                    deg_mat_test,
                                    model,
                                    device,
                                    model_size,
                                )

                            # "epochs" stores the zero-based loop index e.
                            epoch_record = {
                                "test_graph": graph_file,
                                "adj_size": adj_size,
                                "train_file": train_file,
                                "nodes": nodes,
                                "copies": num_copies,
                                "generation_seed": genseed,
                                "splilt_seed": splitseed,
                                "model_seed": model_seed,
                                "epochs": e,
                                "kendalltau": r["kt"],
                                "std": r["std"],
                            }
                            for column, value in epoch_record.items():
                                Results[column].append(value)

                            # The whole table is rewritten after every epoch,
                            # so the file always holds all rows recorded so far.
                            df = pd.DataFrame.from_dict(Results)
                            df.to_csv("./outputs/SF_real_graphs_performance.csv")


Processing 1 graphs...
Loading data...
Processing 5 graphs...
Training, total Number of epoches: 15
Epoch number: 1/15
   Average KT score on test graphs is: 0.8951491867235485 and std: 0.0
Epoch number: 2/15
   Average KT score on test graphs is: 0.9128888595587135 and std: 0.0
Epoch number: 3/15
   Average KT score on test graphs is: 0.9211252073051766 and std: 0.0
Epoch number: 4/15
   Average KT score on test graphs is: 0.9253811169863474 and std: 0.0
Epoch number: 5/15
   Average KT score on test graphs is: 0.927627830928001 and std: 0.0
Epoch number: 6/15
   Average KT score on test graphs is: 0.9289317889725495 and std: 0.0
Epoch number: 7/15
   Average KT score on test graphs is: 0.9293916287146002 and std: 0.0
Epoch number: 8/15
   Average KT score on test graphs is: 0.92890125493827 and std: 0.0
Epoch number: 9/15
   Average KT score on test graphs is: 0.9284797912879172 and std: 0.0
Epoch number: 10/15
   Average KT score on test graphs is: 0.9281588698467585 and std: 0.0
Ep

KeyboardInterrupt: 