In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import optuna
from sklearn.metrics import f1_score,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import torch
import torch_geometric.transforms as T
from torch_geometric.data import Data
from torch_geometric.loader import NeighborSampler
from torch.optim import lr_scheduler
from torch_geometric.datasets import Planetoid, WikipediaNetwork, Actor, WebKB

from modules.model import Net
from modules.sampling import SamplerContextMatrix, SamplerRandomWalk, SamplerFactorization, SamplerAPP

from catboost import CatBoostClassifier


In [2]:
analysis = pd.read_csv('../results/supervised.csv') #pd.read_csv('../classification_on_features.csv')
analysis = analysis.drop(columns='Unnamed: 0') 
analysis

Unnamed: 0,conv,dataset,train acc micro,test acc micro,train acc macro,test acc macro,label assortativity,feature assortativity,cluster coefficient,average shortest path,average degree
0,GCN,0.10.10.225,0.788873,0.125628,0.788873,0.125628,0.1,0.1,0.2,2,5
1,SAGE,0.10.10.225,0.784593,0.095477,0.784593,0.095477,0.1,0.1,0.2,2,5
2,GCN,0.10.10.2220,0.684736,0.130653,0.684736,0.130653,0.1,0.1,0.2,2,20
3,SAGE,0.10.10.2220,0.532097,0.080402,0.532097,0.080402,0.1,0.1,0.2,2,20
4,GCN,0.10.10.2240,0.850214,0.170854,0.850214,0.170854,0.1,0.1,0.2,2,40
...,...,...,...,...,...,...,...,...,...,...,...
544,GAT,0.90.90.5220,0.748930,0.748744,0.748930,0.748744,0.9,0.9,0.5,2,20
545,GAT,0.90.90.5240,0.800285,0.658291,0.800285,0.658291,0.9,0.9,0.5,2,40
546,GAT,0.90.90.535,0.699001,0.376884,0.699001,0.376884,0.9,0.9,0.5,3,5
547,GAT,0.90.90.5320,0.644793,0.180905,0.644793,0.180905,0.9,0.9,0.5,3,20


In [3]:
synthetic = True

In [4]:
benchmark_data_dir = "../data_benchmark/"
help_data = "../data_help/"

In [5]:
import random

In [6]:
if synthetic:
    datasets_names=[]
    for l_a_trgt in [0.1,0.5,0.9]:
                for f_a_trgt in [0.1,0.5,0.9]:
                    for cl_trgt in [0.01,0.1,0.2,0.3,0.5]:
                        for asp_trgt in [2,3,4,5,6,7]:
                            for a_deg_trgt in [2,5,10,15,20,25,30,35,40]:
                                datasets_names.append((l_a_trgt,f_a_trgt,cl_trgt,asp_trgt,a_deg_trgt))
    def data_load(name):
        x = torch.tensor(np.load(f'{benchmark_data_dir}/graph_'+str(name)+'_attr.npy'),dtype=torch.float)
        edge_list = torch.tensor(np.load(f'{benchmark_data_dir}/graph_'+str(name)+'_edgelist.npy')).t()
        y =  torch.tensor(np.load(f'{benchmark_data_dir}/graph_'+str(name)+'_labels.npy'))
        data=Data(x=x,edge_index=edge_list,y=y)
        indices=list(range(len(data.x)))

        train_indices = torch.tensor(indices[:int(0.7*len(indices)+1)])
        val_indices = torch.tensor(indices[int(0.7*len(indices)+1):int(0.8*len(indices)+1)])
        test_indices = torch.tensor(indices[int(0.8*len(indices)+1):])
        train_mask = torch.tensor([False]*len(indices))
        test_mask = torch.tensor([False]*len(indices))
        val_mask = torch.tensor([False]*len(indices))
        train_mask[train_indices] =True
        test_mask[test_indices]=True
        val_mask[val_indices]=True
        return data, train_indices,val_indices,test_indices,train_mask,val_mask,test_mask
else:
    datasets_names = ['Cornell','Texas','Wisconsin','Actor','Pubmed','squirrel']

    def data_load(name):
        if name == 'Cora' or name == 'Citeseer' or name == 'Pubmed':
            data = Planetoid(root='/tmp/'+str(name), name=name,transform=T.NormalizeFeatures())[0]
        elif name == 'Actor':
            data = Actor(root='/tmp/actor',transform=T.NormalizeFeatures())[0]
        elif name == "Cornell" or name=="Texas" or name=="Wisconsin":
            data = WebKB(root='/tmp/'+str(name),name=name,transform=T.NormalizeFeatures())[0]
        elif name == 'squirrel' or name=='chameleon':
            data = WikipediaNetwork(root='/tmp/'+str(name), name=name,transform=T.NormalizeFeatures())[0]

        indices=list(range(len(data.x)))

        train_indices = torch.tensor(indices[:int(0.7*len(indices)+1)])
        val_indices = torch.tensor(indices[int(0.7*len(indices)+1):int(0.8*len(indices)+1)])
        test_indices = torch.tensor(indices[int(0.8*len(indices)+1):])
        train_mask = torch.tensor([False]*len(indices))
        test_mask = torch.tensor([False]*len(indices))
        val_mask = torch.tensor([False]*len(indices))
        train_mask[train_indices] =True
        test_mask[test_indices]=True
        val_mask[val_indices]=True
        return data, train_indices,val_indices,test_indices,train_mask,val_mask,test_mask

In [11]:
import pickle
import os
import collections

class Main:
    def __init__(self,name, conv, device, loss_function, mode):
        data, train_indices, val_indices, test_indices, train_mask, val_mask, test_mask = data_load(name)
        self.Conv = conv
        self.device = device
        self.x = data.x
        self.y = data.y.squeeze()
        self.data=data.to(device)
        self.loss = loss_function
        self.mode = mode
        self.datasetname=name
        self.train_indices =train_indices# torch.tensor(indices[:int(0.7*len(indices)+1)])
        self.val_indices =val_indices# torch.tensor(indices[int(0.7*len(indices)+1):int(0.8*len(indices)+1)])
        self.test_indices = test_indices#torch.tensor(indices[int(0.8*len(indices)+1):])
        self.train_mask = train_mask#torch.tensor([False]*len(indices))
        self.test_mask = test_mask#torch.tensor([False]*len(indices))
        self.val_mask =val_mask# torch.tensor([False]*len(indices))
        super(Main, self).__init__()
    def train(self, model,data,optimizer,train_loader,dropout,epoch):
        model.train()   
        total_loss = 0
        optimizer.zero_grad()
       # print('train loader',len(train_loader))
        
        if model.mode == 'unsupervised':
            pass
        elif model.mode== 'supervised':
            if model.conv=='GCN':
                out = model.inference(data.to(self.device),dp=dropout)
                y = self.y.type(torch.LongTensor)
                y = y.to(self.device)
                loss = model.loss_sup(out[self.train_mask],y[self.train_mask])
                total_loss+=loss
            else:
                for batch_size, n_id, adjs in train_loader:
                    if len(train_loader.sizes) == 1:
                        adjs = [adjs]
                    adjs = [adj.to(self.device) for adj in adjs]
                    out = model.forward(data.x[n_id].to(self.device), adjs)
                    y = self.y.type(torch.LongTensor)
                    y = y.to(self.device)
                    loss = model.loss_sup(out,y[n_id[:batch_size]])
                    total_loss += loss
            total_loss.backward(retain_graph=True)
            optimizer.step()      
            return total_loss /len(train_loader)       

    @torch.no_grad()
    def test(self, model, data):
        model.eval()
        out = model.inference(data.to(self.device))

        y_true = self.y.cpu().detach().numpy()
        self.y=self.y.cpu()
        if model.mode == 'supervised':
            y_true = self.y.unsqueeze(-1)
            y_pred = out.cpu().argmax(dim=-1, keepdim=True)

            accs_micro = []
            accs_macro = []
            for mask in [self.train_mask,self.test_mask,self.val_mask]:
                accs_micro += [accuracy_score(self.y.detach()[mask].cpu().numpy(),y_pred[mask])]
                accs_macro += [accuracy_score(self.y.detach()[mask].cpu().numpy(),y_pred[mask])]

            return out,accs_micro,accs_macro

        elif model.mode == 'unsupervised':
              pass

    def run(self,params):

        hidden_layer=params['hidden_layer']
        out_layer=params['out_layer']
        dropout=params['dropout']
        size=params['size of network, number of convs']
        learning_rate=params['lr']
        hidden_layer_for_classifier = params['hidden_layer_for_classifier']
        number_of_layers_for_classifier = params['number_of_layers_for_classifier']
        heads = params['heads']

        #hidden_layer=64,out_layer=128,dropout=0.0,size=1,learning_rate=0.001,c=100

        train_loader = NeighborSampler(self.data.edge_index, node_idx=self.train_mask, batch_size = int(sum(self.train_mask)), sizes=[-1]*size)
        model = Net(dataset = self.data,mode=self.mode,conv=self.Conv,loss_function=self.loss,device=device,hidden_layer=hidden_layer,out_layer =out_layer,num_layers = (size),dropout = dropout,number_of_layers_for_classifier=number_of_layers_for_classifier, hidden_layer_for_classifier=hidden_layer_for_classifier,heads=heads)
        model.to(self.device)

        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,weight_decay = 1e-5)
                #scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, threshold=0.01, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08, verbose=False)
        scheduler=lr_scheduler.StepLR(optimizer, step_size=25, gamma=0.1)
        losses=[]
        train_accs_mi=[]
        test_accs_mi=[]
        val_accs=[]
        name_of_plot='conv: '+model.conv
        train_accs_ma = []
        test_accs_ma = []
        print(name_of_plot)
        log = 'Loss: {:.4f}, Epoch: {:03d}, Train acc micro: {:.4f}, Test acc micro: {:.4f},Train acc macro: {:.4f}, Test acc macro: {:.4f}'

        for epoch in range(100):
                    print(epoch)
                    loss = self.train(model,self.data,optimizer,train_loader,dropout,epoch)
                    losses.append(loss.detach().cpu())
                    out, [train_acc_mi, test_acc_mi,val_acc_mi],[train_acc_ma, test_acc_ma,val_acc_ma] = self.test(model,self.data)
                    train_accs_mi.append(train_acc_mi)
                    test_accs_mi.append(test_acc_mi)
                    train_accs_ma.append(train_acc_ma)
                    test_accs_ma.append(test_acc_ma)
                    print(log.format(loss, epoch, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma))
        #np.save('../data_help/embedings_'+str(self.datasetname)+str(self.loss['name'])+'.npy', out.cpu().numpy())
                    
                     #scheduler.step()
        print(epoch, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma)
        plt.plot(losses)
        plt.title(name_of_plot+' loss')
        plt.xlabel('epoch')
        plt.ylabel('loss')
        plt.show()
        plt.plot(test_accs_mi)
        plt.title(name_of_plot+' test f1 micro')
        plt.xlabel('epoch')
        plt.ylabel('loss')
        plt.show()
                  
        plt.plot(test_accs_ma)
        plt.title(name_of_plot+' test f1 macro')
        plt.xlabel('epoch')
        plt.ylabel('loss')
        plt.show()
        return train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma

In [15]:
class MainOptuna(Main):
    def objective(self,trial):
        # Integer parameter
        hidden_layer = trial.suggest_categorical("hidden_layer", [32,64,128,256])
        out_layer = trial.suggest_categorical("out_layer", [32,64,128])
        dropout = trial.suggest_float("dropout", 0.0,0.5,step = 0.1)
        size = trial.suggest_categorical("size of network, number of convs", [1,2,3])
        Conv = self.Conv
        learning_rate= trial.suggest_float("lr",5e-3,1e-2)
        heads = trial.suggest_categorical('heads',[2])
        #c =trial.suggest_categorical("c",  [0.001, 0.01, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,10,20,30,100])
        hidden_layer_for_classifier = trial.suggest_categorical("hidden_layer_for_classifier", [32,64,128,256])
        number_of_layers_for_classifier = trial.suggest_categorical("number_of_layers_for_classifier", [1,2,3])
        # варьируем параметры
        model = Net(dataset = self.data,mode=self.mode,conv=Conv,loss_function={'name':'supervised'},device=device,hidden_layer=hidden_layer,out_layer =out_layer,num_layers = size,dropout = dropout,heads = heads)
        train_loader = NeighborSampler(self.data.edge_index, batch_size = int(sum(self.train_mask)),node_idx=self.train_mask, sizes=[-1]*size)
        print('after train loader', len(self.data.x), sum(self.train_mask), len(collections.Counter(self.data.edge_index[0].tolist())))

        model.to(self.device)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,weight_decay = 1e-5)

        for epoch in range(50):
            loss = self.train(model,self.data,optimizer,train_loader,dropout,epoch)
        _, [train_acc_mi, test_acc_mi,val_acc_mi], [train_acc_ma, test_acc_ma,val_acc_ma] = self.test(model=model, data=self.data)
        trial.report(np.sqrt(val_acc_mi*val_acc_ma), epoch)
        return np.sqrt(val_acc_mi*val_acc_ma)

    def run(self,number_of_trials):

        study = optuna.create_study(direction="maximize")
        study.optimize(self.objective,n_trials = number_of_trials)
        trial = study.best_trial
        return trial.params

In [16]:
analysis = pd.DataFrame(columns=analysis.columns)

In [19]:
#analysis=pd.read_csv('../results/supervised.csv')
#analysis=analysis.drop(columns=['Unnamed: 0'])

device =torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for (l,f,cl,asp,ad) in datasets_names:
    name =  "".join(list(map(lambda x:str(x),  [l,f,cl,asp,ad])))
    if os.path.exists('../data_benchmark/graph_'+str(name)+'_attr.npy'):
        for conv in ['GCN']:
            if len(analysis[(analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                MO = MainOptuna(name=name,conv=conv, device=device, loss_function='HOPE_RPR',mode= 'supervised')
                best_values = MO.run(number_of_trials=500)

                M = Main(name=name,conv=conv, device=device, loss_function={'name':'HOPE_RPR'},mode= 'supervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma,l,f,cl,asp,ad],index = analysis.columns)
                #analysis = analysis.append(to_append,ignore_index=True)
               # analysis.to_csv('../results/supervised.csv')

[32m[I 2022-12-07 16:58:53,025][0m A new study created in memory with name: no-name-b01188af-19ad-4840-988e-82cae3878f9d[0m


after train loader 1000 tensor(701) 989
(701, 701)
(701, 701)


[33m[W 2022-12-07 16:58:53,431][0m Trial 0 failed because of the following error: RuntimeError('mat1 and mat2 shapes cannot be multiplied (701x128 and 256x128)')[0m
Traceback (most recent call last):
  File "C:\Users\User\Desktop\Benchmarking-Loss-Functions\venv\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\User\AppData\Local\Temp\ipykernel_3008\3462569007.py", line 23, in objective
    loss = self.train(model,self.data,optimizer,train_loader,dropout,epoch)
  File "C:\Users\User\AppData\Local\Temp\ipykernel_3008\1264338756.py", line 43, in train
    out = model.forward(data.x[n_id].to(self.device), adjs)
  File "C:\Users\User\Desktop\Benchmarking-Loss-Functions\modules\model.py", line 115, in forward
    for j in range(self.number_of_layers_for_classifier):
  File "C:\Users\User\Desktop\Benchmarking-Loss-Functions\venv\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
    return forward_

RuntimeError: mat1 and mat2 shapes cannot be multiplied (701x128 and 256x128)

In [None]:
len(analysis)

In [None]:
analysis = pd.read_csv('../results/final_data.csv')
analysis = analysis.drop(columns=['Unnamed: 0'])
analysis = analysis[(analysis['loss'] == 'features') & (analysis['label assortativity']==0.1) & (analysis['cluster coefficient']==0.2) & (analysis['feature assortativity']==0.1) & (analysis['average shortest path']==3) & (analysis['average degree']==5)]
analysis['test acc micro']

In [None]:
analysis

In [None]:
loss = LINE
loss_name = 'LINE'
device =torch.device('cuda' if torch.cuda.is_available() else 'cpu')
analysis = pd.read_csv('../results/classification_catboost.csv')
analysis = analysis.drop(columns=['Unnamed: 0'])
conv = 'GCN'
for (l,f,cl,asp,ad) in datasets_names:
    name =  "".join(list(map(lambda x:str(x),  [l,f,cl,asp,ad])))
    if os.path.exists('../data_benchmark/graph_'+str(name)+'_attr.npy'):
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]

                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('../results/classification_catboost.csv')


In [None]:
datasets_names=[(0.9, 0.5, 0.01, 2, 5)]

In [None]:
datasets_names

In [None]:
loss = LapEigen
loss_name = 'LapEigen'
device =torch.device('cuda' if torch.cuda.is_available() else 'cpu')
analysis = pd.read_csv('../results/classification_catboost.csv')
analysis = analysis.drop(columns=['Unnamed: 0'])
conv = 'GCN'
for (l,f,cl,asp,ad) in datasets_names:
    name =  "".join(list(map(lambda x:str(x),  [l,f,cl,asp,ad])))
    if os.path.exists('../data_benchmark/graph_'+str(name)+'_attr.npy'):
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:

                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]


                loss_trgt["lmbda"] = best_values['lmbda']


                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('../results/classification_catboost.csv')


In [None]:
loss = HOPE_CN
loss_name = 'HOPE_CN'
device =torch.device('cuda' if torch.cuda.is_available() else 'cpu')
analysis = pd.read_csv('../results/classification_catboost.csv')
analysis=analysis.drop(columns=['Unnamed: 0'])
conv = 'GCN'
for (l,f,cl,asp,ad) in datasets_names:
    name =  "".join(list(map(lambda x:str(x),  [l,f,cl,asp,ad])))
    if os.path.exists('../data_benchmark/graph_'+str(name)+'_attr.npy'):
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:

                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]


                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('../results/classification_catboost.csv')


In [None]:
loss = Node2Vec
loss_name = 'Node2Vec'
device= 'cpu'
analysis = pd.read_csv('../results/classification_catboost.csv')
analysis = analysis.drop(columns=['Unnamed: 0'])

for name in ['Cornell']:
    for conv in ['GCN']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:

                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]


                loss_trgt["walks_per_node"] = best_values['walk_length']
                loss_trgt["walk_length"] = best_values['walk_length']
                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["context_size"] = best_values['context_size']
                loss_trgt["p"] = best_values['p']
                loss_trgt["q"] = best_values['q']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('../results/classification_catboost.csv')


In [None]:
loss = VERSE_PPR
loss_name = 'VERSE_PPR'

device =torch.device('cuda' if torch.cuda.is_available() else 'cpu')
analysis = pd.read_csv('../results/classification_catboost.csv')
analysis = analysis.drop(columns=['Unnamed: 0'])
for name in datasets_names[:2]:
    for conv in ['GCN']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:

                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]


                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["alpha"] = best_values['alpha']
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('../results/classification_catboost.csv')


In [None]:
loss = VERSE_Adj
loss_name = 'VERSE_Adj'
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
analysis = pd.read_csv('../results/classification_catboost.csv')
analysis = analysis.drop(columns=['Unnamed: 0'])
for name in datasets_names[:2]:
    for conv in ['GCN']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:

                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]

                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('../results/classification_catboost.csv')


In [None]:
#on real graphs
loss = Force2Vec
loss_name = 'Force2Vec'

device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device='cpu'
for name in ['Cornell']:
    for conv in ['GCN']:
         if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                MO = MainOptuna(name = name, conv = conv, device = device, loss_function = loss, mode = 'unsupervised')
                best_values = MO.run(number_of_trials =500)

                loss_trgt = dict()
                for par in loss:
                    loss_trgt[par]=loss[par]

                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name, conv=conv, device=device, loss_function=loss_trgt, mode='unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('../results/classification_catboost.csv')


In [None]:
loss = VERSE_SR
loss_name = 'VERSE_SR'
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
analysis = pd.read_csv('../results/classification_catboost.csv')
analysis = analysis.drop(columns=['Unnamed: 0'])
for name in ['Cornell']:
    for conv in ['GCN']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:

                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]


                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('../results/classification_catboost.csv')

In [None]:
loss = VERSE_Adj
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
analysis = pd.read_csv('../results/classification_catboost.csv')
analysis = analysis.drop(columns=['Unnamed: 0'])
for name in ['Cornell']:
    for conv in ['GCN']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:

                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]

                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('classification_catboost.csv')


In [None]:
modkdjfjf

In [None]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device='cpu'
number_of_trials = 100
import os
for (l,f,cl,asp,ad) in datasets_names:
    name =  "".join(list(map(lambda x:str(x),  [l,f,cl,asp,ad])))
    if os.path.exists('../data_benchmark/graph_'+str(name)+'_attr.npy'):
        if len(analysis[(analysis['la'] == l)&(analysis['fa']==f)&(analysis['cl']==cl)&(analysis['asp']==asp)&(analysis['ad']==ad)] ) == 0:
            data, train_indices,val_indices,test_indices,train_mask,val_mask,test_mask = data_load(name)
            x = data.x.detach()
            y = data.y.detach()
            def objective(trial):
            # Integer parameter
                c = trial.suggest_categorical("c",  [0.001, 0.01, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,10,20,30,100])
                clf = LogisticRegression(max_iter = 3000, C=c).fit(x[train_mask].numpy(), y[train_mask].numpy())

                accs_micro = []
                accs_macro = []
                for mask in [train_mask,test_mask,val_mask]:
                    accs_micro += [accuracy_score(data.y.detach()[mask].numpy(),clf.predict(x[mask].numpy()))]
                    accs_macro += [accuracy_score(data.y.detach()[mask].numpy(),clf.predict(x[mask].numpy()))]

                return np.sqrt(accs_micro[2]*accs_macro[2])

            study = optuna.create_study(direction="maximize")
            study.optimize(objective, n_trials = number_of_trials)
            trial = study.best_trial
            c=trial.params['c']
            clf = LogisticRegression(max_iter = 3000, C=c).fit(x[train_mask].numpy(), y[train_mask].numpy())
            accs_micro = []
            accs_macro = []
            for mask in [train_mask,test_mask,val_mask]:
                accs_micro += [f1_score(y[mask].numpy(),clf.predict(x[mask].numpy()), average='micro')]
                accs_macro += [f1_score(y[mask].numpy(),clf.predict(x[mask].numpy()), average='macro')]

            to_append = pd.Series([l,f,cl,asp,ad, accs_micro[0],accs_micro[1], accs_macro[0] , accs_macro[1]],index = analysis.columns)
            analysis = analysis.append(to_append, ignore_index=True)
            analysis.to_csv('classification_on_features.csv')


In [None]:
loss = Force2Vec
loss_name = 'Force2Vec'
for (l,f,cl,asp,ad) in datasets_names:
    name =  "".join(list(map(lambda x:str(x),  [l,f,cl,asp,ad])))
    if os.path.exists('../data_benchmark/graph_'+str(name)+'_attr.npy'):
        print('hey')
        for conv in ['GCN','GAT','SAGE']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:

                MO = MainOptuna(name = name, conv = conv, device = device, loss_function = loss, mode = 'unsupervised')
                best_values = MO.run(number_of_trials = 500)

                loss_trgt = dict()
                for par in loss:
                    loss_trgt[par]=loss[par]

                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('data_analysis_force2vec.csv')


In [None]:
loss = VERSE_Adj
loss_name = 'VERSE_Adj'

for name in datasets_names:
    for conv in ['GCN']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:

                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]

                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('data_analysis_realdata.csv')


In [None]:
loss = VERSE_SR
loss_name = 'VERSE_SR'

for name in datasets_names:
    for conv in ['GCN']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                
                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]
   
                
                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('data_analysis_realdata.csv')

In [None]:
loss = VERSE_PPR
loss_name = 'VERSE_PPR'

for name in datasets_names:
    for conv in ['GCN']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                
                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]
   
                
                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["alpha"] = best_values['alpha']
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('data_analysis_realdata.csv')


In [None]:
loss = LapEigen
loss_name = 'LapEigen'

for name in datasets_names:
    for conv in ['GCN','GAT','SAGE']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                
                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]
   
                
                loss_trgt["lmbda"] = best_values['lmbda']


                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('data_analysis_realdata.csv')
                

In [None]:
loss = LINE
loss_name = 'LINE'

for name in datasets_names:
    for conv in ['GCN','GAT','SAGE']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                
                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]
   
                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('data_analysis_realdata.csv')
                

In [None]:
loss = GraphFactorization
loss_name = 'GraphFactorization'

for name in datasets_names:
    for conv in ['GCN','GAT','SAGE']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                
                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]
   
                
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi, train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('data_analysis_realdata.csv')
                

In [None]:
loss = HOPE_CN
loss_name = 'HOPE_CN'

for name in datasets_names:
    for conv in ['GCN','GAT','SAGE']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                
                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]
   
                
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('data_analysis_realdata.csv')
                

In [None]:
loss = HOPE_AA
loss_name = 'HOPE_AA'

for name in datasets_names:
    for conv in ['GCN','GAT','SAGE']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                
                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]
   
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('data_analysis_realdata.csv')
                

In [None]:
loss = HOPE_RPR
loss_name = 'HOPE_RPR'

for name in datasets_names:
    for conv in ['GCN','GAT','SAGE']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                
                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]
   
                
                loss_trgt["alpha"] = best_values['alpha']
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('data_analysis_realdata.csv')
                

In [None]:
loss = HOPE_Katz
loss_name = 'HOPE_Katz'

for name in datasets_names:
    for conv in ['GCN','GAT','SAGE']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                
                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]
   
                
                loss_trgt["betta"] = best_values['betta']
                loss_trgt["lmbda"] = best_values['lmbda']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('data_analysis_realdata.csv')
                

In [None]:
loss = Node2Vec
loss_name = 'Node2Vec'
device = 'cpu'
for name in ['chameleon']:
    for conv in ['GCN']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                
                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]
   
                
                loss_trgt["walks_per_node"] = best_values['walk_length']
                loss_trgt["walk_length"] = best_values['walk_length']
                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["context_size"] = best_values['context_size']
                loss_trgt["p"] = best_values['p']
                loss_trgt["q"] = best_values['q']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('data_analysis_realdata.csv')
                

In [None]:
loss = DeepWalk
loss_name = 'DeepWalk'
device='cpu'
for name in ['Citeseer']:
    for conv in ['GCN','GAT','SAGE']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                
                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]
   
                
                loss_trgt["walks_per_node"] = best_values['walk_length']
                loss_trgt["walk_length"] = best_values['walk_length']
                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["context_size"] = best_values['context_size']
                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('data_analysis_realdata.csv')
                

In [None]:
loss = APP
loss_name = 'APP'

for name in datasets_names:
    for conv in ['GCN']:
            if len(analysis[ (analysis['loss'] == loss_name) & (analysis['conv'] == conv) & (analysis['dataset'] == name)] ) == 0:
                
                MO = MainOptuna(name=name,conv=conv, device=device, loss_function=loss,mode= 'unsupervised')
                best_values=MO.run(number_of_trials=500)

                loss_trgt=dict()
                for par in loss:
                    loss_trgt[par]=loss[par]
   
                
                loss_trgt["num_negative_samples"] = best_values['num_negative_samples']
                loss_trgt["alpha"] = best_values['alpha']

                M = Main(name=name,conv=conv, device=device, loss_function=loss_trgt,mode= 'unsupervised')
                train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma = M.run(best_values)

                to_append=pd.Series([loss_name, conv,name, train_acc_mi, test_acc_mi,train_acc_ma , test_acc_ma],index = analysis.columns)
                analysis = analysis.append(to_append,ignore_index=True)
                analysis.to_csv('data_analysis_realdata.csv')
                