In [None]:
import collections
import os
import numpy as np
import optuna
from sklearn.metrics import f1_score
import pandas as pd
import matplotlib.pyplot as plt


import torch
from torch.nn import Linear
import torch.nn as nn
from torch.nn import Sequential as Seq

from torch_geometric.nn import GCNConv, SAGEConv,GATConv, APPNP, FAConv, SuperGATConv,GINConv
import torch.nn.functional as F
from torch_geometric.data import NeighborSampler
from torch_geometric.data import Data




In [None]:
# Loads the data produced by our generator (for example via examples/example_generate_graph or examples/example_generate_tuning)
def data_load(name, train_frac=0.7, val_frac=0.2, dataset_dir='../dataset'):
    """Load one generated graph and draw a random train/validation/test node split.

    Reads ``graph_<name>_attr.npy``, ``graph_<name>_edgelist.npy`` and
    ``graph_<name>_labels.npy`` from ``dataset_dir`` and wraps them in a
    ``torch_geometric.data.Data`` object.

    Args:
        name: Identifier inserted into the three file names (converted with ``str``).
        train_frac: Fraction of the nodes sampled (without replacement) for training.
        val_frac: Fraction of the nodes sampled for validation out of the nodes
            that were not chosen for training.
        dataset_dir: Directory that contains the ``.npy`` files.

    Returns:
        Tuple ``(data, train_indices, val_indices, test_indices, train_mask,
        val_mask, test_mask)``. The index tensors are ``torch.long``; the masks
        are boolean tensors with one entry per row of ``data.x``. Every node
        that is neither a training nor a validation node is a test node.

    Raises:
        ValueError: If the fractions are negative or sum to more than 1.

    The split is drawn from NumPy's global random state, so call
    ``np.random.seed(...)`` beforehand for a reproducible split.
    """
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            'train_frac and val_frac must be non-negative and sum to at most 1, got '
            + str(train_frac) + ' and ' + str(val_frac))

    prefix = os.path.join(dataset_dir, 'graph_' + str(name))
    x = torch.DoubleTensor(np.load(prefix + '_attr.npy'))
    # The file is transposed into the (2, num_edges) layout; .contiguous() is
    # what the PyG docs recommend after .t().
    edge_list = torch.tensor(np.load(prefix + '_edgelist.npy')).t().contiguous()
    y = torch.tensor(np.load(prefix + '_labels.npy'), dtype=torch.long)
    data = Data(x=x, edge_index=edge_list, y=y)

    num_nodes = len(data.x)
    n_train = int(num_nodes * train_frac)
    # Never ask for more validation nodes than are left after the training draw.
    n_val = min(int(num_nodes * val_frac), num_nodes - n_train)

    train_idx = np.random.choice(num_nodes, n_train, replace=False)
    # setdiff1d returns sorted indices, so the split depends only on NumPy's
    # random state and not on set iteration order.
    rest_idx = np.setdiff1d(np.arange(num_nodes), train_idx)
    val_idx = np.random.choice(rest_idx, n_val, replace=False)
    test_idx = np.setdiff1d(rest_idx, val_idx)

    train_indices = torch.as_tensor(train_idx, dtype=torch.long)
    val_indices = torch.as_tensor(val_idx, dtype=torch.long)
    test_indices = torch.as_tensor(test_idx, dtype=torch.long)

    def _mask_from(indices):
        # Boolean mask over all nodes that is True exactly at `indices`.
        mask = torch.zeros(num_nodes, dtype=torch.bool)
        mask[indices] = True
        return mask

    train_mask = _mask_from(train_indices)
    val_mask = _mask_from(val_indices)
    test_mask = _mask_from(test_indices)
    return data, train_indices, val_indices, test_indices, train_mask, val_mask, test_mask


In [None]:
# Class that builds the machine-learning model (a graph neural network)
class Net(torch.nn.Module):
    """Node-classification GNN whose layer type is selected by ``conv``.

    Supported values of ``conv`` are listed in ``SUPPORTED_CONVS``. All layers
    are stored in ``self.convs`` in the order in which they are applied.

    Args:
        dataset: Object exposing ``x`` (node features, one row per node) and
            ``y`` (node labels); the number of output classes is the number of
            distinct values in ``y``.
        device: Stored on the instance as ``self.device``.
        conv: Name of the layer family to build.
        hidden_layer: Width of the hidden representations.
        out_layer: Output width of the second linear layer of the APPNP model.
        dropout: Dropout probability used by ``forward`` (and given to APPNP).
        num_layers: Number of graph layers (for APPNP: the ``K`` argument).
        **kwargs: ``alpha`` (APPNP, default 0.2) and ``eps`` (FA, default 0.1).

    Raises:
        ValueError: If ``conv`` is not one of ``SUPPORTED_CONVS``.
    """

    SUPPORTED_CONVS = ('GCN', 'GIN', 'APPNP', 'SAGE', 'GAT', 'FA', 'SuperGAT')
    SUPERGAT_HEADS = 8

    def __init__(self, dataset, device,conv='GCN',hidden_layer=64,out_layer =128,dropout = 0,num_layers=2,**kwargs):
        super(Net, self).__init__()
        if conv not in self.SUPPORTED_CONVS:
            # Fail fast instead of silently building a model with no layers.
            raise ValueError('Unknown conv ' + repr(conv) + '; expected one of '
                             + ', '.join(self.SUPPORTED_CONVS))
        self.conv = conv
        self.num_layers = num_layers
        self.data = dataset
        self.num_features = dataset.x.shape[1]
        self.convs = torch.nn.ModuleList()
        self.hidden_layer = hidden_layer
        self.out_layer = out_layer
        self.dropout = dropout
        self.device = device
        self.alpha = 0.2
        if self.conv == 'APPNP':
            self.alpha = kwargs.get("alpha", self.alpha)

        # Number of distinct labels; torch.unique also copes with a (N, 1) label tensor.
        out_channels = int(torch.unique(self.data.y).numel())
        dims = self._layer_dims(out_channels)
        last = len(dims) - 1

        if self.conv == 'GCN':
            for c_in, c_out in dims:
                self.convs.append(GCNConv(c_in, c_out))

        elif self.conv == 'GIN':
            if self.num_layers == 1:
                # A single GIN layer wraps a two-layer MLP.
                module = Seq(Linear(self.num_features, self.hidden_layer), nn.ReLU(), Linear(self.hidden_layer, out_channels))
                self.convs.append(GINConv(module,train_eps=True))
            else:
                for idx, (c_in, c_out) in enumerate(dims):
                    if idx == last:
                        module = Seq(Linear(c_in, c_out))
                    else:
                        module = Seq(Linear(c_in, c_out), nn.ReLU())
                    self.convs.append(GINConv(module,train_eps=True))

        elif self.conv == 'APPNP':
            # NOTE(review): the propagated representation has width `out_layer`,
            # not the number of classes - confirm that this is intended.
            self.convs.append(Linear(self.num_features, self.hidden_layer))
            self.convs.append(Linear(self.hidden_layer, self.out_layer))
            self.convs.append(APPNP(self.num_layers,self.alpha, self.dropout))

        elif self.conv == 'SAGE':
            for c_in, c_out in dims:
                self.convs.append(SAGEConv(c_in, c_out))

        elif self.conv == 'GAT':
            for c_in, c_out in dims:
                self.convs.append(GATConv(c_in, c_out))

        elif self.conv == 'FA':
            # FAConv's second positional argument is `eps`, not an output width,
            # so it is passed by keyword here.
            eps = kwargs.get("eps", 0.1)
            self.convs.append(Linear(self.num_features, self.hidden_layer))
            for _ in range(self.num_layers):
                self.convs.append(FAConv(self.hidden_layer, eps=eps))
            self.convs.append(Linear(self.hidden_layer, out_channels))

        elif self.conv == 'SuperGAT':
            heads = self.SUPERGAT_HEADS
            for idx, (c_in, c_out) in enumerate(dims):
                # Hidden layers concatenate their heads, so every following
                # layer sees `heads` times as many features; the last layer
                # averages the heads so it emits exactly `out_channels` logits.
                self.convs.append(SuperGATConv(c_in if idx == 0 else c_in * heads, c_out, heads=heads,
                                  concat=(idx != last),
                                  dropout=0.6, attention_type='MX',
                                  edge_sample_ratio=0.8, is_undirected=True))

        self.reset_parameters()

    def _layer_dims(self, out_channels):
        """Return the (in, out) widths of the stacked graph layers."""
        if self.num_layers == 1:
            return [(self.num_features, out_channels)]
        dims = [(self.num_features, self.hidden_layer)]
        dims += [(self.hidden_layer, self.hidden_layer) for _ in range(1, self.num_layers - 1)]
        dims.append((self.hidden_layer, out_channels))
        return dims

    def reset_parameters(self):
        """Re-initialise every layer in ``self.convs``."""
        for conv in self.convs:
            conv.reset_parameters()

    def forward(self,x,adjs):
        """Mini-batch forward pass over sampled neighbourhoods.

        Args:
            x: Features of the sampled nodes.
            adjs: Sequence of ``(edge_index, _, size)`` triples, one per layer
                (``Main.train`` passes the ones yielded by its NeighborSampler).

        Returns:
            Log-probabilities (``log_softmax`` over dimension 1).
        """
        if self.conv == 'FA':
            x = self.convs[0](x)
            x = x.relu()
            # Dropout follows the module's train/eval mode.
            x = F.dropout(x, p=self.dropout, training=self.training)
            x_o = x
            for i, (edge_index, _, size) in enumerate(adjs):
                x = self.convs[i+1](x,x_o, edge_index)
                x = F.dropout(x, p=self.dropout, training=self.training)
            x = self.convs[-1](x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        else:
            for i, (edge_index, _, size) in enumerate(adjs):
                x_target = x[:size[1]]  # Target nodes are always placed first.
                x = self.convs[i]((x,x_target), edge_index)
                if i != self.num_layers - 1:
                    x = F.relu(x)
                    x = F.dropout(x, p=self.dropout, training=self.training)
        return x.log_softmax(dim=1)

    def inference(self,data,dp=0,t=True):
        """Full-graph forward pass.

        Args:
            data: Object exposing ``x`` and ``edge_index``.
            dp: Dropout probability.
            t: Whether dropout is active (passed as ``training`` to ``F.dropout``).

        Returns:
            Log-probabilities (``log_softmax`` over the last dimension).
        """
        x, edge_index = data.x, data.edge_index
        if self.conv == 'APPNP':
            # Linear -> ReLU -> dropout -> Linear, then APPNP propagation.
            x = self.convs[0](x).relu()
            x = F.dropout(x, p=dp, training=t)
            x = self.convs[1](x)
            for conv in self.convs[2:]:
                x = conv(x, edge_index)
        elif self.conv == 'FA':
            x = self.convs[0](x).relu()
            x = F.dropout(x, p=dp, training=t)
            x_o = x  # initial representation fed to every FAConv layer
            for conv in self.convs[1:-1]:
                x = conv(x, x_o, edge_index).relu()
                x = F.dropout(x, p=dp, training=t)
            x = self.convs[-1](x)
            x = F.dropout(x, p=dp, training=t)
        else:
            for i, conv in enumerate(self.convs):
                x = conv(x, edge_index)
                if i != self.num_layers - 1:
                    x = x.relu()
                    x = F.dropout(x, p=dp, training=t)
        return x.log_softmax(dim=-1)

    def loss_sup(self, pred, label):
        """Negative log-likelihood of ``label`` under the log-probabilities ``pred``."""
        return F.nll_loss(pred, label)

In [None]:
# The testing pipelines themselves

class Main():
    """Training/evaluation pipeline for one dataset and one layer type.

    Args:
        name: Dataset identifier forwarded to ``data_load``.
        conv: Layer family forwarded to ``Net`` (e.g. ``'GCN'``, ``'APPNP'``).
        device: Torch device on which the data and the model are placed.
    """

    def __init__(self,name, conv, device):
        super(Main, self).__init__()
        data, train_indices,val_indices,test_indices,train_mask,val_mask,test_mask = data_load(name)
        self.Conv = conv
        self.device = device
        self.x = data.x
        self.y = data.y.squeeze()
        self.data=data.to(device)

        self.train_mask = train_mask
        self.test_mask = test_mask
        self.val_mask = val_mask

    def train(self, model,data,optimizer,train_loader,dropout,epoch,alpha):
        """Run one optimisation step and return the (detached) training loss.

        GCN, APPNP, FA and SuperGAT models are trained full-batch through
        ``model.inference``; every other model is trained on the batches
        yielded by ``train_loader`` through ``model.forward``. ``epoch`` and
        ``alpha`` are accepted for interface compatibility and are not used.

        Returns:
            The summed loss divided by ``len(train_loader)`` as a detached tensor.
        """
        model.train()
        optimizer.zero_grad()
        y = self.y.to(self.device)
        if model.conv in ('GCN', 'APPNP', 'FA', 'SuperGAT'):
            out = model.inference(data.to(self.device),dp=dropout,t=True)
            total_loss = model.loss_sup(out[self.train_mask],y[self.train_mask])
        else:
            total_loss = 0
            for batch_size, n_id, adjs in train_loader:
                # With a single hop the sampler yields one adjacency, not a list.
                if len(train_loader.sizes) == 1:
                    adjs = [adjs]
                adjs = [adj.to(self.device) for adj in adjs]

                out = model.forward(data.x[n_id].to(self.device), adjs)
                # The first `batch_size` entries of n_id are the target nodes.
                total_loss = total_loss + model.loss_sup(out,y[n_id[:batch_size]])
        # The graph is used for a single backward pass, so it need not be retained.
        total_loss.backward()
        optimizer.step()
        return total_loss.detach() /len(train_loader)

    @torch.no_grad()
    def test(self, model,data,**kwargs):
        """Evaluate ``model`` on the full graph.

        Returns:
            ``[train, test, val]`` weighted F1 scores (in that order).
        """
        model.eval()
        out = model.inference(data.to(self.device),t=False)
        y_true = self.y.detach().cpu().numpy()
        y_pred = out.argmax(dim=-1).cpu().numpy()
        accs_micro = []
        for mask in [self.train_mask,self.test_mask,self.val_mask]:
            selected = mask.cpu().numpy()
            accs_micro.append(f1_score(y_true[selected], y_pred[selected], average='weighted'))
        return accs_micro

    def run(self,hidden_layer=64,out_layer=128,dropout=0.0,size=1,learning_rate=0.001,alpha=0,epochs=100):
        """Train a fresh model, plot the loss and test-score curves, return the final test score.

        Args:
            hidden_layer: Hidden width passed to ``Net``.
            out_layer: ``out_layer`` passed to ``Net``.
            dropout: Dropout probability.
            size: Number of layers (also the number of sampled hops).
            learning_rate: Adam learning rate.
            alpha: ``alpha`` passed to ``Net`` (used by APPNP).
            epochs: Number of training epochs (must be at least 1).

        Returns:
            Weighted F1 on the test nodes after the last epoch.

        Raises:
            ValueError: If ``epochs`` is smaller than 1.
        """
        if epochs < 1:
            raise ValueError('epochs must be at least 1')

        train_loader = NeighborSampler(self.data.edge_index, node_idx=self.train_mask, batch_size = int(self.train_mask.sum()), sizes=[-1]*size)
        model = Net(dataset = self.data,conv = self.Conv,device = self.device,hidden_layer = hidden_layer,out_layer = out_layer,num_layers = (size),dropout = dropout,alpha=alpha)
        model.to(self.device)
        model.double()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,weight_decay = 1e-5)
        losses=[]
        train_accs=[]
        test_accs=[]
        val_accs=[]
        name_of_plot='conv: '+model.conv
        print(name_of_plot)
        # The "acc" values in the log are the weighted F1 scores returned by test().
        log = 'Loss: {:.4f}, Epoch: {:03d}, Train acc: {:.4f}, Test acc: {:.4f}'

        for epoch in range(epochs):
            loss = float(self.train(model,self.data,optimizer,train_loader,dropout,epoch,alpha))
            losses.append(loss)
            train_acc, test_acc, val_acc = self.test(model,self.data)
            train_accs.append(train_acc)
            test_accs.append(test_acc)
            val_accs.append(val_acc)
            print(log.format(loss, epoch, train_acc, test_acc ))
        print(log.format(loss, epoch, train_acc, test_acc))

        plt.plot(losses)
        plt.title(name_of_plot+' loss')
        plt.xlabel('epoch')
        plt.ylabel('loss')
        plt.show()
        plt.plot(test_accs)
        plt.title(name_of_plot+' test f1 weighted')
        plt.xlabel('epoch')
        plt.ylabel('weighted f1')
        plt.show()
        return test_acc

In [None]:
class MainOptuna(Main):
    """Hyper-parameter search for the pipeline of ``Main`` using Optuna."""

    def objective(self,trial):
        """Train one model with hyper-parameters suggested by ``trial``.

        Returns:
            The weighted F1 on the validation nodes after the last epoch, which
            the study created in ``run`` maximises.
        """
        hidden_layer = trial.suggest_categorical("hidden_layer", [32,64,128,256])
        out_layer = trial.suggest_categorical("out_layer", [32,64,128])
        dropout = trial.suggest_float("dropout", 0.0,0.5,step = 0.1)
        size = trial.suggest_categorical("size of network, number of convs", [1,2,3])
        Conv = self.Conv
        learning_rate= trial.suggest_float("lr",5e-5,1e-2)
        alpha=0

        if Conv =='APPNP':
            alpha= trial.suggest_float("alpha", 0.1,1,step = 0.1)

        net_kwargs = dict(dataset = self.data,conv=Conv,device=self.device,hidden_layer=hidden_layer,
                          out_layer =out_layer,num_layers = size,dropout = dropout,alpha=alpha)
        if Conv=='FA':
            # `eps` is only suggested (and passed on) for FA models.
            net_kwargs['eps'] = trial.suggest_float("eps", 0.1,1,step = 0.1)
        model = Net(**net_kwargs)
        model.double()
        train_loader = NeighborSampler(self.data.edge_index, batch_size = int(self.train_mask.sum()),node_idx=self.train_mask, sizes=[-1]*size)
        model.to(self.device)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,weight_decay = 1e-5)

        for epoch in range(100):
            self.train(model,self.data,optimizer,train_loader,dropout,epoch,alpha)
        train_acc, test_acc, val_acc = self.test(model,self.data)
        # Reported once, at the index of the last epoch.
        trial.report( val_acc ,epoch)
        # Return the metric itself so the study value equals the validation
        # score; the ranking of trials is the same as with a monotone transform.
        return val_acc


    def run(self,number_of_trials):
        """Run the study for ``number_of_trials`` trials.

        Prints the best trial and returns its parameter dictionary.
        """
        study = optuna.create_study(direction="maximize",study_name=str(self.Conv)+" conv")
        study.optimize(self.objective,n_trials = number_of_trials)

        print('Best trial:')
        trial = study.best_trial
        print(" Value: ", trial.value)
        print(" Params: ")
        for key, value in trial.params.items():
            print(" {}: {}".format(key,value))
        return study.best_trial.params

In [None]:
# Collect every parameter combination of the generator grid for which a
# labels file exists on disk. The file name is the plain concatenation of the
# five parameter values.
datasets_names = [
    (l_a_trgt, f_a_trgt, cl_trgt, asp_trgt, a_deg_trgt)
    for l_a_trgt in [0.1,0.2,0.3,0.4,0.6,0.7,0.8,0.5,0.9]
    for f_a_trgt in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.75,0.8,0.85,0.9,0.95]
    for cl_trgt in [0.01,0.1,0.2,0.25,0.5]
    for asp_trgt in [2,3,4,4.5,5,5.5,6,6.5,7]
    for a_deg_trgt in [2,5,20,40]
    if os.path.exists(
        '../dataset/graph_'
        + "".join(str(value) for value in (l_a_trgt, f_a_trgt, cl_trgt, asp_trgt, a_deg_trgt))
        + '_labels.npy'
    )
]
# Number of datasets found; shown as the cell output.
a = len(datasets_names)
a

In [None]:
# Test the APPNP method on a slice of our dataset

# For every available dataset: tune the hyper-parameters with Optuna, retrain
# with the best ones and record the final test score.
result_columns = ['label assort','feature assort','cluster','average shortest paths','average degree','conv','test accuracy']
result_rows = []   # one list per finished (dataset, conv) pair
finished = set()   # (conv, l, f, cl, asp, ad) keys that are already recorded

# The device does not change between datasets, so it is selected once.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

for (l,f,cl,asp,ad) in datasets_names:
    name = "".join(str(value) for value in [l,f,cl,asp,ad])
    if not os.path.exists('../dataset/graph_'+str(name)+'_labels.npy'):
        continue
    for conv in ['APPNP']:
        key = (conv, l, f, cl, asp, ad)
        if key in finished:
            continue
        finished.add(key)

        MO = MainOptuna(name,conv, device)
        best_trial = MO.run(number_of_trials=250)

        # NOTE(review): Main(...) calls data_load again, which draws a new
        # random split, so the final model is not evaluated on the split that
        # was used during tuning.
        Model = Main(name,conv, device)
        test_acc = Model.run(hidden_layer=best_trial['hidden_layer'],
                             out_layer=best_trial['out_layer'],
                             dropout=best_trial['dropout'],
                             size=best_trial['size of network, number of convs'],
                             learning_rate=best_trial['lr'],
                             # Forward the tuned alpha; it is only present for APPNP.
                             alpha=best_trial.get('alpha', 0))
        print(test_acc)
        to_append = [l,f,cl,asp,ad,conv,test_acc]
        print(to_append)
        result_rows.append(to_append)

# Build the frame once at the end (DataFrame.append was removed in pandas 2.0).
df_results = pd.DataFrame(result_rows, columns = result_columns)
df_results