In [1]:
import os, time, sys
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

sys.path.insert(0, os.pardir)
from VocabModel import MyVocabModel
from src.Tokenizer import MyTokenizer
from src.Preprocessing import preprocessing_dataframe
from src.DataLoader import DataLoaderBert
from src.Model import MyBert
from src.Callback import EarlyStopping
from src.Training import model_train, model_eval

# Use the GPU when torch can see one, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

RANDOM_SEED = 23        # random_state used for the train/val/test splits
NUM_LABELS = 5          # number of labels passed to the model
REM_STOP_WORDS = False  # flag forwarded to preprocessing_dataframe

# Columns of the results workbook (one row per trained model).
COLUMNS_XLSX = [
    'Vocab_size',
    'Time',
    'Train_loss',
    'Train_acc',
    'Val_loss',
    'Val_acc',
]

# Input workbooks are read from ../data; the remaining paths are relative to
# the current working directory.
DATA_PATH = os.path.join(os.pardir, "data", "datos.xlsx")
DATA_SIN = os.path.join(os.pardir, "data", "datos_sinonimos.xlsx")
NORMAL_PATH = "vocab_file.txt"
SAVE_PATH_R = "resultados_vocab.xlsx"

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Set CLEAN_XLSX to True to discard previously stored results and start over.
CLEAN_XLSX = False

# (Re)create the results workbook with only its header row. This also happens
# when the file does not exist yet, so that the later
# pd.read_excel(SAVE_PATH_R) does not fail with FileNotFoundError on a fresh
# checkout. An existing workbook is left untouched unless CLEAN_XLSX is True.
if CLEAN_XLSX or not os.path.exists(SAVE_PATH_R):
    data_frame = pd.DataFrame(columns=COLUMNS_XLSX)
    data_frame.to_excel(SAVE_PATH_R, index=False)

In [3]:
# Load the main workbook.
df = pd.read_excel(DATA_PATH)

# The last column is the one compared against 'Revisado' below; the first two
# columns are the ones carried forward.
column_name_rev = df.columns[-1]
columns_to_keep = list(df.columns[:2])

# Keep only the rows marked as 'Revisado', restricted to the kept columns.
is_reviewed = df[column_name_rev] == 'Revisado'
df_revisado = df.loc[is_reviewed, columns_to_keep]
df_revisado_eq = preprocessing_dataframe(df_revisado, REM_STOP_WORDS)

# Shapes before and after preprocessing.
df_revisado.shape, df_revisado_eq.shape

((6052, 2), (2220, 2))

In [4]:
# Second workbook (presumably synonym-augmented reviews, judging by the file
# name), restricted to the same columns as the reviewed data.
df_ext = pd.read_excel(DATA_SIN).loc[:, columns_to_keep]
# NOTE(review): the second argument is hard-coded to False here while the
# reviewed data is preprocessed with REM_STOP_WORDS - confirm this is intended.
df_ext_eq = preprocessing_dataframe(df_ext, False)

# Stack both preprocessed frames row-wise.
df_eq = pd.concat([df_revisado_eq, df_ext_eq])

df_ext_eq.shape, df_eq.shape

((33015, 2), (35235, 2))

In [5]:
# 70 % of the rows go to training; the remaining 30 % is split in half into
# validation and test. Both splits use the same random_state.
df_train, df_holdout = train_test_split(df_eq, test_size=0.3, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)
len(df_train), len(df_val), len(df_test)

(24664, 5285, 5286)

In [6]:
# Stack the reviewed rows and the rows of the second workbook, then build the
# vocabulary model from their 'Review' column.
df_rev = pd.concat([df_revisado, df_ext])
vocab_transform = MyVocabModel(df_rev.loc[:, 'Review'])

def max_len_corpus(reviews=None, transform=None):
    """Return the largest transformed length over a collection of reviews.

    Args:
        reviews: Iterable of review texts. When omitted, the notebook-level
            ``df_revisado['Review']`` column is used.
        transform: Callable that maps one text to a sized sequence. When
            omitted, the notebook-level ``vocab_transform.transform_txt`` is
            used.

    Returns:
        The maximum of ``len(transform(review))`` over ``reviews``, or 0 when
        ``reviews`` is empty.
    """
    # The notebook globals are only looked up when no explicit argument is
    # given, so the function can also be applied to other corpora/tokenisers.
    if reviews is None:
        reviews = df_revisado['Review']
    if transform is None:
        transform = vocab_transform.transform_txt

    return max((len(transform(rev)) for rev in reviews), default=0)

frase = "no me gustaria puntuar tan bajo"
frase_split = vocab_transform.transform_txt(frase)

# len(frase) counts characters, whitespace included; len(frase_split) counts
# the items returned by transform_txt.
print(
    f"Longitud de la frase: {len(frase)}, Longitud frase por palabras: {len(frase_split)}"
)

# With a maximum length of 512 - in the best case, where every token is a
# whole word - every review could be represented with all of its information.
print(
    f"Longitud máxima de frase en el corpus: {max_len_corpus()}"
)

Longitud de la frase: 31, Longitud frase por palabras: 6
Longitud máxima de frase en el corpus: 464


In [7]:
MAX_LEN = 512
BATCH_SIZE = 4
DROPOUT = 0.1
# embed_dim, the number of attention heads and d_ff are fixed here because
# this experiment only looks for the best vocabulary size. The remaining
# hyperparameters are tuned in a separate section.
POSSIBLE_VALUES = [(512, 8, 2048), (768, 12, 3072)]
EMBED_DIM, ATT_HEADS, D_FF = POSSIBLE_VALUES[0]

# Upper bound on epochs; training stops earlier as soon as early_stopping(...)
# returns True.
num_epochs = 1000

print("VOCAB SIZE")
# Results of previous runs. Sizes already present are skipped below, so the
# experiment can be resumed after an interruption.
result_vocab = pd.read_excel(SAVE_PATH_R)
# vocab_size = 118 is the smallest vocabulary
VOCAB_SIZE = [118, 1000, 2500, 5000, 7500, 10000, 12500, 15000, 17500, 20000, 25000, 30000, 35000]
for vocab_size in VOCAB_SIZE:
    torch.cuda.empty_cache()

    if vocab_size in result_vocab['Vocab_size'].values:
        continue

    # Re-seed torch before every run so that torch-driven randomness (e.g. the
    # DataLoader shuffling) starts from the same state for each vocabulary
    # size, independently of how many sizes were skipped above.
    torch.manual_seed(RANDOM_SEED)

    # Build the vocabulary and write it one entry per line for the tokenizer.
    # NOTE(review): the file is written with the platform default encoding;
    # confirm it matches the encoding MyTokenizer uses to read it.
    vocabM = MyVocabModel(df_rev['Review'])
    vocab = vocabM.create_vocab(vocab_size)
    with open(NORMAL_PATH, "w") as archivo:
        for word in vocab:
            archivo.write(word + "\n")

    tokenizer = MyTokenizer(NORMAL_PATH)

    dataset_train_torch = DataLoaderBert(df_train['Review'].to_list(), df_train['Score_G'].to_list(), tokenizer, MAX_LEN)
    train_dataloader = DataLoader(dataset_train_torch, batch_size=BATCH_SIZE, shuffle=True)
    dataset_val_torch = DataLoaderBert(df_val['Review'].to_list(), df_val['Score_G'].to_list(), tokenizer, MAX_LEN)
    # Validation batches do not need shuffling (presumably model_eval
    # aggregates over the whole set - TODO confirm).
    val_dataloader = DataLoader(dataset_val_torch, batch_size=BATCH_SIZE, shuffle=False)

    # NOTE(review): the model is sized with the requested vocab_size, not with
    # len(vocab); confirm create_vocab always yields exactly that many entries.
    model = MyBert(vocab_size, MAX_LEN, EMBED_DIM, ATT_HEADS, D_FF, DROPOUT, NUM_LABELS, device, N=1).to(device)

    loss_fn = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train_loss, train_acc, val_loss, val_acc = [], [], [], []
    early_stopping = EarlyStopping()

    inicio = time.perf_counter()
    for epoch in range(num_epochs):
        train_loss_epoch, train_acc_epoch = model_train(model, train_dataloader, loss_fn, optimizer, len(df_train), device)
        train_loss.append(train_loss_epoch)
        train_acc.append(train_acc_epoch)

        val_loss_epoch, val_acc_epoch, _ = model_eval(model, val_dataloader, loss_fn, len(df_val), device)
        val_loss.append(val_loss_epoch)
        val_acc.append(val_acc_epoch)

        print(f"Num epoch: {epoch+1}, Train_loss: {train_loss_epoch}")

        if early_stopping(val_loss_epoch, model):
            break

    final = time.perf_counter()

    print("-"*30)
    print(f"Modelo {len(result_vocab)} done")
    print("-"*30)

    # Best value of each metric over all epochs of this run (the four values
    # may come from different epochs). 'Time' is the elapsed time in whole
    # minutes.
    res = {
        'Vocab_size': vocab_size,
        'Time': int(round(final - inicio,2)/60),
        'Train_loss': min(train_loss),
        'Train_acc': max(train_acc).item(),
        'Val_loss': min(val_loss),
        'Val_acc': max(val_acc).item()
    }

    # Append the row and persist immediately so a crash loses at most one run.
    result_vocab.loc[len(result_vocab)] = res
    result_vocab.to_excel(SAVE_PATH_R, index=False)


len(result_vocab)

VOCAB SIZE
Num epoch: 1, Train_loss: 1.5801771372856352
Num epoch: 2, Train_loss: 1.5578734594005446
Num epoch: 3, Train_loss: 1.5526873993285863
Num epoch: 4, Train_loss: 1.5504395907338162
Num epoch: 5, Train_loss: 1.5508423595992227
Num epoch: 6, Train_loss: 1.5480998216648207
Num epoch: 7, Train_loss: 1.547372654868224
Num epoch: 8, Train_loss: 1.545501442725806
Num epoch: 9, Train_loss: 1.5452126778451956
Num epoch: 10, Train_loss: 1.5428284219805388
Num epoch: 11, Train_loss: 1.5426647584719817
Num epoch: 12, Train_loss: 1.541228585010452
Num epoch: 13, Train_loss: 1.5378589642790281
Num epoch: 14, Train_loss: 1.538886215444745
Num epoch: 15, Train_loss: 1.5389981310049745
Num epoch: 16, Train_loss: 1.5362272997432973
Num epoch: 17, Train_loss: 1.5350179427094635
Num epoch: 18, Train_loss: 1.5313107954810048
Num epoch: 19, Train_loss: 1.5336907877962271
Num epoch: 20, Train_loss: 1.5376474498697121
Num epoch: 21, Train_loss: 1.5347779531719388
Num epoch: 22, Train_loss: 1.5281564

13

In [1]:
# This cell relies on names from earlier cells (os, MyVocabModel,
# df_revisado); the recorded NameError apparently comes from running it on a
# fresh kernel before those cells.
SAVE_PATH_V = os.path.join(os.pardir, "vocab_file.txt")
BEST_VOCAB_SIZE = 20000

# NOTE(review): the vocabulary is built from df_revisado only, whereas the
# vocabulary-size experiment builds it from df_rev - confirm this is intended.
vocabM = MyVocabModel(df_revisado['Review'])
vocab = vocabM.create_vocab(BEST_VOCAB_SIZE)

# Write the vocabulary one entry per line.
with open(SAVE_PATH_V, "w") as archivo:
    archivo.writelines(word + "\n" for word in vocab)

print("Vocab done")

NameError: name 'os' is not defined