In [None]:
# Bibliotecas necessárias

import os
import json
import time
import torch
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from torch.nn import Linear
from itertools import chain
from torch_geometric.nn import GATv2Conv
from torch_geometric.data import DataLoader
from torch_geometric.nn import global_mean_pool

# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")
# Ask PyTorch to release cached GPU memory that is currently unused.
torch.cuda.empty_cache()

In [None]:
# Set the seed value all over the place to make this reproducible.

# Export a fixed hash seed through the environment.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so assigning it
# here presumably only reaches child processes, not this running kernel — confirm.
os.environ['PYTHONHASHSEED'] = str(1)
# Seed numpy's global generator (random numbers, shuffling operations, ...).
np.random.seed(1)

# Seed PyTorch's default generator.
torch.manual_seed(0)
# Seed the current CUDA device.
torch.cuda.manual_seed(0)
# Seed every CUDA device, for multi-GPU runs.
torch.cuda.manual_seed_all(0)
# Disable the cuDNN auto-tuner that picks the fastest algorithm for the hardware.
torch.backends.cudnn.benchmark = False
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Load the pre-built graphs of the training and validation splits
# (the test split is loaded further down, right before inference).
# NOTE(review): torch.load unpickles Python objects; only load files from a trusted source.

train_data = torch.load("train_graphs.pt")
val_data = torch.load("val_graphs.pt")

In [None]:
%%time

# Treats the most frequent answer of each graph as its class
def redefine_target(base):
    """Replace each graph's ``y`` with its single most frequent answer.

    ``base`` is turned into a DataFrame; column 2 of every row is expected to
    be a pair whose second item is the list of answers for that graph.
    The graphs are modified in place and the same ``base`` object is returned.
    """
    frame = pd.DataFrame(base)

    # Keep only the second item of the column-2 pair, i.e. the answer list.
    frame[2] = frame[2].apply(lambda pair: pair[1])

    # value_counts sorts by descending frequency, so the first index entry is the mode.
    frame["y"] = frame[2].apply(
        lambda answers: pd.DataFrame(answers).value_counts().index[0][0]
    )

    for position, graph in enumerate(base):
        graph.y = frame["y"][position]

    return base

In [None]:
# Ranks every answer of the training set by frequency and returns the top-k.
# (With k = 3000 the notebook author reports 96% coverage of training and 90% of validation.)

def get_most_common_answers(train_data, neurons_final_layer):
    """Count all training answers and return the most frequent ones.

    Args:
        train_data: sequence of graphs. When turned into a DataFrame, column 2 of
            every row must be a pair whose second item is that graph's answer list.
        neurons_final_layer: number of top answers to keep (size of the output layer).

    Returns:
        A tuple ``(all_answers_train, top_k)``:
        ``all_answers_train`` is a DataFrame with an ``"answer"`` column followed by
        the frequency column produced by ``value_counts().reset_index()`` (its name
        depends on the installed pandas version), sorted by descending frequency;
        ``top_k`` is the list of the first ``neurons_final_layer`` answers.
    """
    # Second item of the column-2 pair is the list of answers of each graph.
    answers_per_graph = pd.DataFrame(train_data)[2].apply(lambda pair: pair[1])

    # One row per individual answer; no helper columns or in-place renames needed.
    answer_rows = answers_per_graph.explode().rename("answer").to_frame()

    all_answers_train = answer_rows.value_counts().reset_index()
    top_k = all_answers_train.head(neurons_final_layer)["answer"].tolist()

    return all_answers_train, top_k

In [None]:
# Derive the answer classes from the training data
neurons_final_layer = 3000
all_answers_train, classes_train = get_most_common_answers(train_data, neurons_final_layer)

# Label <-> index mappings used during training/validation:
# encoder_label maps an answer to its class index, decoder_label maps the index back.
encoder_label = {label: index for index, label in enumerate(classes_train)}
decoder_label = dict(enumerate(classes_train))

In [None]:
# Report the configured number of answer classes (the size of the output layer).
print("Quantidade de classes usadas: ", neurons_final_layer)

In [None]:
# Keeps only the graphs whose answer is known,
# i.e. at least one of its answers belongs to the selected top classes

def filter_base_based_classes_train(base_grafo, classes_train):
  """Return the graphs that have at least one answer among ``classes_train``.

  Args:
      base_grafo: iterable of graphs; each graph exposes its answers as ``y``.
      classes_train: collection of the answers accepted as classes.

  Returns:
      A new list with the matching graphs, in their original order.
  """
  # Built once: the set of known classes does not change between graphs.
  known_answers = set(classes_train)

  return [graph for graph in base_grafo if not known_answers.isdisjoint(graph.y)]

In [None]:
# Drop the graphs that have no answer among the selected classes.
# Both names are rebound to the filtered lists.
train_data = filter_base_based_classes_train(train_data, classes_train)
val_data = filter_base_based_classes_train(val_data, classes_train)

In [None]:
# Report how many graphs remain in each split after filtering.
print("Tamanho da base de treino: ", len(train_data))
print("Tamanho da base de validação: ", len(val_data))

In [None]:
# Load the data sets into dataloaders.
# Training: shuffled batches of 64 graphs, dropping the last incomplete batch.
# Validation: batches of 32 graphs in their stored order.
train_loader = DataLoader(train_data, batch_size=64, shuffle=True, drop_last = True)
valid_loader = DataLoader(val_data, batch_size=32, shuffle=False)

In [None]:
def define_class_weights(all_answers_train, neurons_final_layer):
  """Compute one weight per class as its share of the top-class answer counts.

  Args:
      all_answers_train: DataFrame of answers sorted by descending frequency, with
          the frequency stored in a column named ``0`` (pandas < 2), ``"count"``
          (pandas >= 2) or, failing both, in the last column.
      neurons_final_layer: number of top rows (classes) to consider.

  Returns:
      List of floats, ``count / sum(counts)`` for each of the top rows.
      The input frame is not modified.
  """
  top_answers = all_answers_train.head(neurons_final_layer)

  # value_counts().reset_index() names the frequency column differently across
  # pandas versions, so look it up instead of hard-coding ``0``.
  count_column = next(
      (name for name in (0, "count") if name in top_answers.columns),
      top_answers.columns[-1],
  )
  counts = top_answers[count_column]

  # NOTE(review): frequent classes receive the larger weights — confirm this is
  # intended rather than an inverse-frequency weighting.
  return (counts / counts.sum()).values.tolist()

In [None]:
# Per-class weights, later passed to the loss function as its `weight` argument.
classes_weights = define_class_weights(all_answers_train, neurons_final_layer)

In [None]:
# Defines the model architecture: Graph Attention Network

class GAT(torch.nn.Module):
    """Graph classifier: two GATv2Conv layers, mean pooling per graph, dropout and a linear layer.

    Attribute names are part of the saved ``state_dict`` keys, so renaming them
    invalidates existing checkpoints.
    """

    def __init__(self, dim_in, dim_out):
        """Build the layers.

        Args:
            dim_in: number of input features per node.
            dim_out: number of output units (answer classes).
        """
        super().__init__()
        # Reseeds PyTorch's global generator on every instantiation
        # (this replaces any seed set earlier in the notebook).
        torch.manual_seed(12345)
        # 8 attention heads of 300 channels; the next layer takes 300*8 inputs.
        self.gat1 = GATv2Conv(dim_in, 300, heads=8, add_self_loops = False)
        self.dropout3 = nn.Dropout(0.2)
        # 4 attention heads of 300 channels; the classifier takes 300*4 inputs.
        self.gat2 = GATv2Conv(300*8, 300, heads=4, add_self_loops = False)
        self.dropout4 = nn.Dropout(0.2)
        self.layer_3 = torch.nn.Linear(300*4, dim_out)

    def forward(self, x, edge_index, batch):
        """Run the network on a batch of graphs.

        Args:
            x: node feature matrix.
            edge_index: graph connectivity passed to both attention layers.
            batch: assignment of each node to its graph, used by the mean pooling.

        Returns:
            Tuple ``(x, att_weight, features)``: the ReLU-activated output of the
            linear layer for each graph, the attention information returned by the
            second GATv2Conv layer, and that layer's node embeddings.
        """

        # 1. Obtain node embeddings
        x = self.gat1(x, edge_index)
        #features, att_weight = self.gat1(x, edge_index, return_attention_weights = True)
        x = self.dropout3(x)
        #x = self.dropout3(features)
        features, att_weight = self.gat2(x, edge_index, return_attention_weights = True)
        # 2. Readout layer
        x = global_mean_pool(features, batch)  # [batch_size, hidden_channels]
        x = self.dropout4(x)
        x = self.layer_3(x)
        # NOTE(review): the notebook feeds this output to nn.CrossEntropyLoss, which
        # expects raw logits; the ReLU clips negative scores to zero — confirm intended.
        x = F.relu(x)

        return x, att_weight, features

In [None]:
# Computes the model accuracy
def accuracy(pred, y_10):
    """Mean soft accuracy of the predictions against the annotated answers.

    Each sample scores ``min(1, matches / 3)``, where ``matches`` is how many
    times the predicted answer appears in that sample's answer list.

    Args:
        pred: sequence of predicted answers.
        y_10: sequence of answer lists, aligned with ``pred``.

    Returns:
        The mean of the per-sample scores.
    """
    per_sample = [
        min(1.0, y_10[position].count(answer) / 3.0)
        for position, answer in enumerate(pred)
    ]

    return np.mean(per_sample)

In [None]:
# Appends the metrics of one epoch to the training log
def insert_log(e, loss_train_mean, acc_train_mean, loss_val_mean, acc_val_mean):
  """Append one line with the epoch's training/validation metrics to ``logs.txt``.

  Args:
      e: epoch number.
      loss_train_mean: mean training loss of the epoch.
      acc_train_mean: training accuracy of the epoch.
      loss_val_mean: mean validation loss of the epoch.
      acc_val_mean: validation accuracy of the epoch.
  """
  log_epoch = f'Epoch {e} \t Training Loss: {loss_train_mean} \t Training Acc: {acc_train_mean} \t Validation Loss: {loss_val_mean} \t Validation Acc: {acc_val_mean}'+"\n"

  # The context manager closes the file even if the write fails.
  with open('logs.txt', 'a') as logs:
    logs.write(log_epoch)

def insert_time_execution(start, end):
  """Append the elapsed time between two timestamps to ``logs.txt`` as HH:MM:SS.ss.

  Args:
      start: start timestamp in seconds (e.g. from ``time.time()``).
      end: end timestamp in seconds.
  """
  hours, rem = divmod(end-start, 3600)
  minutes, seconds = divmod(rem, 60)

  log_time = 'Time execution:'+"{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds)+"\n"

  # The context manager closes the file even if the write fails.
  with open('logs.txt', 'a') as logs:
    logs.write(log_time)

In [None]:
# Saves the model whenever it achieves the best result
def checkpoint(model, e, path, optimizer=None):
    """Save the epoch number, model weights and optimizer state to ``path``.

    Args:
        model: module whose ``state_dict()`` is stored.
        e: epoch number stored under ``'epoch'``.
        path: destination file.
        optimizer: optimizer whose ``state_dict()`` is stored. When omitted, the
            module-level variable named ``optimizer`` is used, which keeps
            existing three-argument calls working.

    Raises:
        NameError: if no optimizer is passed and no module-level ``optimizer`` exists.
    """
    if optimizer is None:
        # Backward compatibility: older callers rely on the notebook-level variable.
        try:
            optimizer = globals()["optimizer"]
        except KeyError:
            raise NameError("name 'optimizer' is not defined") from None

    torch.save({
            'epoch': e,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
            }, path)

In [None]:
# Builds the target vector expected at the model output
def get_output_representation(encoder_label, y_10, classes_train, neurons_final_layer):
  """Turn one sample's answers into a soft-target vector over the classes.

  Args:
      encoder_label: mapping from answer to class index.
      y_10: list of answers given for the sample.
      classes_train: collection of answers accepted as classes.
      neurons_final_layer: length of the returned vector.

  Returns:
      List of ``neurons_final_layer`` values: for every answer that is a known
      class, its position holds ``occurrences / 10``; all other positions are 0.
  """
  from collections import Counter

  targets_elem = np.zeros(neurons_final_layer)

  # Plain counting avoids building a DataFrame per sample and does not depend on
  # the column name pandas gives to value_counts() results.
  for answer, freq in Counter(y_10).items():
    if answer in encoder_label and answer in classes_train:
      # The divisor is the fixed constant 10, not len(y_10).
      targets_elem[encoder_label[answer]] = freq / 10

  return list(targets_elem)

In [None]:
# Select the device used for training: the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Dispositivo sendo usado: ", device)

In [None]:
# Defines the pipeline that runs one epoch of training or evaluation
def pipeline_train(model, loader, device, optimizer, e, decoder_label, encoder_label, classes_train,
                   targets_base, base, neurons_final_layer, classes_weights, criterion):
    """Run one pass over ``loader``, training the model when ``base == "train"``.

    Args:
        model: network called as ``model(x, edge_index, batch)``; its first output is scored.
        loader: data loader yielding batches with ``x``, ``edge_index``, ``batch`` and ``y``.
        device: device the batches and targets are moved to.
        optimizer: optimizer stepped after each batch in training mode.
        e: epoch number (kept for interface compatibility; no longer drives the cache).
        decoder_label: mapping from class index to answer.
        encoder_label: mapping from answer to class index.
        classes_train: collection of answers accepted as classes.
        targets_base: list used as a per-batch cache of target tensors.
        base: ``"train"`` to update the weights; any other value only evaluates.
        neurons_final_layer: size of the target vectors.
        classes_weights: returned unchanged.
        criterion: loss function called as ``criterion(output, targets)``.

    Returns:
        Tuple ``(acc, base_loss, targets_base, model, optimizer, classes_weights)``
        where ``base_loss`` is the sum of the per-batch losses.
    """
    is_training = base == "train"

    # Cached targets are looked up by batch position. That only identifies the same
    # graphs when the loader iterates in a fixed order; a shuffling loader would pair
    # each batch with the targets of other graphs, so the cache is bypassed for it.
    # (Loaders driven by a custom batch_sampler are not detected here.)
    can_cache = isinstance(getattr(loader, "sampler", None),
                           torch.utils.data.SequentialSampler)

    predictions = []
    targets_multi = []
    base_loss = 0.0

    for idx, data in enumerate(loader):

        # Transfer data to the GPU if available
        data = data.to(device)

        if is_training:
            # Clear the gradients
            optimizer.zero_grad()
            # Forward pass
            x, _, _ = model(data.x, data.edge_index, data.batch)
        else:
            with torch.no_grad():
                # Forward pass
                x, _, _ = model(data.x, data.edge_index, data.batch)

        # Predicted class index per graph, decoded back to the answer
        pred = x.argmax(dim=1).cpu().numpy()
        predictions.extend(decoder_label[i] for i in pred)
        targets_multi.extend(data.y)

        # Targets of this batch: reuse the cached tensor when it is safe, else build it
        if can_cache and idx < len(targets_base):
            targets = targets_base[idx]
        else:
            resps = [get_output_representation(encoder_label, i, classes_train, neurons_final_layer)
                     for i in data.y]
            targets = torch.tensor(resps, dtype=torch.float)
            if can_cache:
                targets_base.append(targets)

        loss = criterion(x, targets.to(device))

        if is_training:
            # Calculate gradients
            loss.backward()
            # Update weights
            optimizer.step()

        # Accumulate the loss
        base_loss += loss.item()

    # Compute accuracy metric
    acc = accuracy(predictions, targets_multi)

    return acc, base_loss, targets_base, model, optimizer, classes_weights

In [None]:
# Instantiate the model: the input size is the node-feature count of the first
# training graph, the output size is the number of answer classes.
model = GAT(train_data[0].num_features, neurons_final_layer)
# Move the parameters to the selected device
model.to(device)

In [None]:
# Trains the model

# Loss and optimizer (the commented-out lines are alternatives left by the author)
#optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(weight = torch.tensor(classes_weights).to(device))
optimizer = torch.optim.SGD(model.parameters(), lr=1e-5, momentum=0.9)


#criterion = nn.CrossEntropyLoss(weight=classes_weights, reduction='none')

# Best validation accuracy seen so far and the epoch where it happened
best_accuracy = -1
best_epoch = -1

# Training with Validation
epochs = 10

# NOTE(review): with the patience equal to `epochs`, the condition
# `e - best_epoch > early_stop_thresh` cannot become true inside the loop,
# so early stopping never triggers — confirm the intended patience.
early_stop_thresh = epochs

# Per-batch target caches handed to and returned by pipeline_train
targets_train = []
targets_val = []

# Overwritten below by the value pipeline_train returns (it passes classes_weights through)
classes_weights_train = []
classes_weights_val = []

start = time.time()
for e in tqdm(range(epochs)):

    # Training pass
    model.train()
    acc_train, train_loss, targets_train, model, optimizer, classes_weights_train = pipeline_train(model, train_loader, device, optimizer, e, decoder_label,
                                                              encoder_label, classes_train,targets_train,
                                                              "train", neurons_final_layer, classes_weights, criterion)

    # Mean loss per batch, rounded to two decimals
    loss_train_mean = round((train_loss / len(train_loader)), 2)

    # Validation pass; eval mode matters here because the model contains Dropout layers
    model.eval()
    acc_val, val_loss, targets_val, model, optimizer, classes_weights_val = pipeline_train(model, valid_loader, device, optimizer, e, decoder_label,
                                                          encoder_label, classes_train, targets_val,
                                                          "val", neurons_final_layer, classes_weights, criterion)

    loss_val_mean = round((val_loss / len(valid_loader)), 2)

    # Save a checkpoint whenever validation accuracy matches or beats the best so far
    if  acc_val >= best_accuracy:
        best_accuracy = acc_val
        best_epoch = e
        checkpoint(model, e, "best_model.pth")
    elif e - best_epoch > early_stop_thresh:
        print("Early stopped training at epoch %d" % e)
        break  # terminate the training loop

    print(f'Epoch {e} \t Training Loss: {loss_train_mean} \t Training Acc: {acc_train} \t Validation Loss: {loss_val_mean} \t Validation Acc: {acc_val}')
    insert_log(e, loss_train_mean, acc_train, loss_val_mean, acc_val)

# Record the total training time in the log file
end = time.time()
insert_time_execution(start, end)

In [None]:
# Load the pre-built graphs of the test split.
# NOTE(review): torch.load unpickles Python objects; only load files from a trusted source.
test_data = torch.load("test_graphs.pt")

In [None]:
# Number of graphs in the test split
len(test_data)

In [None]:
# Defines the evaluation over the test set
def inference_model(model, test_data, decoder_label, device):
  """Predict an answer for every test graph using the weights in ``best_model.pth``.

  Args:
      model: network whose weights are replaced by the saved ``model_state_dict``.
      test_data: dataset of test graphs; the first element of each graph's ``y``
          is used as the image name.
      decoder_label: mapping from class index to answer.
      device: device the checkpoint tensors and the batches are placed on.

  Returns:
      List of ``(image_name, predicted_answer)`` tuples, one per test graph.
  """
  # map_location lets a checkpoint saved on one device be restored on another.
  # NOTE(review): torch.load unpickles the file; only load checkpoints you produced.
  saved_state = torch.load("best_model.pth", map_location=device)
  model.load_state_dict(saved_state['model_state_dict'])
  # Evaluation mode is set once, before the loop
  model.eval()

  test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

  results = []

  # No gradients are needed for inference
  with torch.no_grad():
    for data in tqdm(test_loader):

      data = data.to(device)
      name_img = data.y[0]

      # Forward pass
      x, _, _ = model(data.x, data.edge_index, data.batch)

      # Batches hold a single graph, so the first prediction is the answer
      pred = x.argmax(dim=1).cpu().numpy()
      results.append((name_img, decoder_label[pred[0]]))

  return results

In [None]:
# Run inference over the test set
results = inference_model(model, test_data, decoder_label, device)

In [None]:
# Formats and saves the test-set results
def format_save_results(results):
  """Write the predictions to ``results_test.json``.

  Args:
      results: sequence of items whose first element is the image name and
          whose second element is the predicted answer.

  The file holds a JSON list of ``{"image": ..., "answer": ...}`` objects.
  """
  final_results = [
      {"image": entry[0], "answer": entry[1]}
      for entry in results
  ]

  with open("results_test.json", "w") as outfile:
    json.dump(final_results, outfile)

In [None]:
# Save the results
format_save_results(results)