In [1]:
import sys, os
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

sys.path.insert(0, os.path.join(os.pardir,os.pardir))
from src.Preprocessing import preprocessing_dataframe
from src.Tokenizer import MyTokenizer
from src.DataLoader import DataLoaderBert
from src.Callback import EarlyStopping

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_LABELS = 5    # number of output logits of the classifier
RANDOM_SEED = 23  # seed shared by the data splits and by PyTorch

# Seed PyTorch's global RNG so that weight initialisation, dropout masks and
# DataLoader shuffling are repeatable after a kernel restart.
torch.manual_seed(RANDOM_SEED)

# Input files, located two directory levels above the working directory.
DATA_PATH = os.path.join(os.pardir, os.pardir, "data", "datos.xlsx")
VOCAB_PATH = os.path.join(os.pardir, os.pardir, "vocab_file.txt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the raw spreadsheet. The review-status flag is read from the last
# column; the first two columns are the ones kept for modelling.
df = pd.read_excel(DATA_PATH)
all_columns = df.columns.to_list()
column_name_rev = all_columns[-1]
columns_to_keep = all_columns[:2]

# Keep only the rows flagged as 'Revisado', then run the project preprocessing.
is_reviewed = df[column_name_rev] == 'Revisado'
df_revisado = df[is_reviewed][columns_to_keep]
df_revisado_eq = preprocessing_dataframe(df_revisado, False)

# Shapes before and after preprocessing.
df_revisado.shape, df_revisado_eq.shape

In [None]:
# 70 % of the rows go to training; the remaining 30 % is split in half
# into validation and test (15 % each).
df_train, df_holdout = train_test_split(df_revisado_eq, test_size=0.3, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)

# Sizes of the three partitions.
len(df_train), len(df_val), len(df_test)

In [2]:
tokenizer = MyTokenizer(VOCAB_PATH)
BATCH_SIZE = 1
# Passed to the dataset and later reused as the model's input_size.
# NOTE(review): presumably the padded/truncated token length - confirm in src.DataLoader.
MAX_LEN = 512

# Training data: reshuffled on every pass.
dataset_train_torch = DataLoaderBert(df_train['Review'].to_list(), df_train['Score_G'].to_list(), tokenizer, MAX_LEN)
train_dataloader = DataLoader(dataset_train_torch, batch_size=BATCH_SIZE, shuffle=True)

# Validation data: iterated in a fixed order so that evaluation is deterministic
# and the per-example results returned by the evaluation loop keep a stable order
# from one epoch to the next.
dataset_val_torch = DataLoaderBert(df_val['Review'].to_list(), df_val['Score_G'].to_list(), tokenizer, MAX_LEN)
val_dataloader = DataLoader(dataset_val_torch, batch_size=BATCH_SIZE, shuffle=False)

In [3]:
class RNN_GRU(nn.Module):
    """GRU classifier: GRU -> dropout -> linear -> dropout -> linear.

    Each input row is treated as a sequence of length 1 whose single time step
    carries ``input_size`` features.

    Args:
        input_size: number of features per time step.
        hidden_size: number of GRU hidden units per direction.
        num_layers: number of stacked GRU layers.
        num_labels: number of output logits.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability used in the classification head and,
            when ``num_layers > 1``, between the stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_labels, bidirectional=True, dropout=0.1):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # A bidirectional GRU concatenates the forward and backward features,
        # so the output width is D * hidden_size.
        self.D = 2 if bidirectional else 1

        # nn.GRU applies dropout only between stacked layers and emits a warning
        # when a non-zero value is passed together with a single layer.
        gru_dropout = dropout if num_layers > 1 else 0.0
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, dropout=gru_dropout, bidirectional=bidirectional)

        self.dropout = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        n = self.D * self.hidden_size
        n2 = n // 2
        # NOTE(review): there is no non-linearity between these two layers, so
        # together they amount to a single affine map (plus dropout in training).
        self.linear = nn.Linear(n, n2)
        self.last_layer = nn.Linear(n2, num_labels)

    def forward(self, input):
        """Return logits of shape [batch_size, num_labels] for input of shape [batch_size, input_size]."""
        # [batch_size, input_size] -> [batch_size, seq_len, input_size] with seq_len = 1
        x = input.unsqueeze(1)

        # No explicit initial state: nn.GRU starts from zeros created with the
        # same dtype and device as x.
        output, _ = self.gru(x)  # [batch_size, seq_len, D*hidden_size]

        # Features of the last (here: only) time step.
        output = output[:, -1, :]
        output = self.dropout(output)
        output = self.linear(output)
        output = self.dropout2(output)
        output = self.last_layer(output)
        return output


In [4]:
def model_train(model, data, loss_fn, optimizer, n_examples, device):
    """Run one optimisation pass over ``data``.

    Args:
        model: module to optimise; switched to training mode.
        data: sized iterable of batches, each a mapping with "input_ids" and "targets".
        loss_fn: callable taking (model output, targets) and returning a scalar loss.
        optimizer: optimizer stepped once per batch.
        n_examples: denominator used for the accuracy.
        device: device the batch tensors are moved to.

    Returns:
        Tuple of (mean of the per-batch losses as a float, count of correct
        argmax predictions divided by ``n_examples`` as a tensor).
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0

    for batch in data:
        # The ids are cast to float because they are fed directly to the model as features.
        features = batch["input_ids"].float().to(device)
        labels = batch["targets"].to(device)

        optimizer.zero_grad()
        logits = model(features)
        batch_loss = loss_fn(logits, labels)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()

        # Accuracy is measured on the logits computed before this step's update.
        hits = logits.argmax(dim=1) == labels
        n_correct += hits.sum().cpu()

    mean_loss = loss_sum / len(data)
    accuracy = n_correct / n_examples
    return mean_loss, accuracy

In [5]:
def model_eval(model, data, loss_fn, n_examples, device):
    """Evaluate ``model`` on ``data`` without computing gradients.

    Args:
        model: module to evaluate; switched to evaluation mode.
        data: sized iterable of batches, each a mapping with "input_ids" and "targets".
        loss_fn: callable taking (model output, targets) and returning a scalar loss.
        n_examples: denominator used for the accuracy.
        device: device the batch tensors are moved to.

    Returns:
        Tuple of (mean of the per-batch losses as a float, count of correct
        argmax predictions divided by ``n_examples`` as a tensor, list with one
        ``(|target - prediction|, |p(target) - p(prediction)|)`` pair per example).
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    per_example = []

    with torch.no_grad():
        for batch in data:
            features = batch["input_ids"].float().to(device)
            labels = batch["targets"].to(device)

            logits = model(features)
            loss_sum += loss_fn(logits, labels).item()

            predicted = logits.argmax(dim=1)
            n_correct += (predicted == labels).sum().cpu()

            # Distance between the true and the predicted class index.
            label_gap = (labels - predicted).abs().tolist()

            # Gap between the probability given to the true class and the
            # probability given to the predicted class.
            probs = F.softmax(logits, dim=1)
            rows = torch.arange(labels.size(0))
            p_true = probs[rows, labels]
            p_pred = probs[rows, predicted]
            prob_gap = (p_true - p_pred).abs().tolist()

            per_example.extend(zip(label_gap, prob_gap))

    return loss_sum / len(data), n_correct / n_examples, per_example

In [6]:
# Model hyper-parameters.
HIDDEN_SIZE = 128
NUM_LAYERS = 2

model = RNN_GRU(
    input_size=MAX_LEN,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    num_labels=NUM_LABELS,
).to(device)

loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Per-epoch metric histories.
train_loss, train_acc, val_loss, val_acc = [], [], [], []
# Highest validation accuracy seen so far and the per-example list from that epoch.
best_acc = 0
best_hist = []
early_stopping = EarlyStopping()

# Upper bound on the number of epochs; the loop exits earlier as soon as
# early_stopping returns a truthy value.
num_epochs = 1000
for epoch in range(num_epochs):
    train_loss_epoch, train_acc_epoch = model_train(
        model, train_dataloader, loss_fn, optimizer, len(df_train), device
    )
    train_loss.append(train_loss_epoch)
    train_acc.append(train_acc_epoch)

    val_loss_epoch, val_acc_epoch, hist_acc = model_eval(
        model, val_dataloader, loss_fn, len(df_val), device
    )
    val_loss.append(val_loss_epoch)
    val_acc.append(val_acc_epoch)

    print(f"Num epoch: {epoch+1}, Train_loss: {train_loss_epoch}")

    if val_acc_epoch > best_acc:
        best_acc, best_hist = val_acc_epoch, hist_acc

    if early_stopping(val_loss_epoch, model):
        break

print("-" * 30)
# Best value of each metric over all epochs; the four values may come from different epochs.
res_model = {
    'Train_loss': min(train_loss),
    'Train_acc': max(train_acc).item(),
    'Val_loss': min(val_loss),
    'Val_acc': max(val_acc).item(),
}
res_model

Num epoch: 1, Train_loss: 1.6017548978026654
Num epoch: 2, Train_loss: 1.566065762712209
Num epoch: 3, Train_loss: 1.5523774811310005
Num epoch: 4, Train_loss: 1.5449700911135018
Num epoch: 5, Train_loss: 1.5322025632192087
Num epoch: 6, Train_loss: 1.5138796118511777
Num epoch: 7, Train_loss: 1.485172179484056
Num epoch: 8, Train_loss: 1.4624442958433523
Num epoch: 9, Train_loss: 1.4285466046766577
Num epoch: 10, Train_loss: 1.3830302465281106
Num epoch: 11, Train_loss: 1.3598115794363548
------------------------------


{'Train_loss': 1.3598115794363548,
 'Train_acc': 0.40596845746040344,
 'Val_loss': 1.5214033169789358,
 'Val_acc': 0.31081080436706543}