In [1]:
!nvidia-smi

Tue Jan  7 21:59:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.77.01              Driver Version: 566.36         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 Ti     On  |   00000000:01:00.0  On |                  N/A |
|  0%   32C    P8             14W /  240W |    3659MiB /   8192MiB |     19%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
import torch, os, time
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from transformers import BertTokenizer

from src.Preprocessing import preprocessing_dataframe
from src.Tokenizer import MyTokenizer
from src.DataLoader import DataLoaderBert
from src.Model import MyBert, Bert_FineTuned
from src.Callback import EarlyStopping
from src.Training import model_train, model_eval

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_LABELS = 5
RANDOM_SEED = 23
VOCAB_PATH = "vocab_file.txt"


DATA_PATH = os.path.join("data", "datos.xlsx")
DATA_EXT_PATH = os.path.join("data", "reviews_ext.xlsx") 
DATA_SIN = os.path.join("data", "datos_sinonimos.xlsx")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_excel(DATA_PATH)
column_name_rev = df.columns.to_list()[len(df.columns.to_list())-1]
columns_to_keep = df.columns.to_list()[:2]

df_revisado = df[df[column_name_rev] == 'Revisado'][columns_to_keep]
df_revisado_eq = preprocessing_dataframe(df_revisado, False)
df_revisado.shape, df_revisado_eq.shape

((6052, 2), (2220, 2))

In [3]:
df_ext = pd.read_excel(DATA_SIN)[columns_to_keep]
df_ext_eq = preprocessing_dataframe(df_ext, False)

df_eq = pd.concat([df_revisado_eq, df_ext_eq], axis=0)

df_ext_eq.shape, df_eq.shape

((33015, 2), (35235, 2))

In [4]:
df_train , df_test = train_test_split(df_eq, test_size=0.3, random_state = RANDOM_SEED)
df_val , df_test = train_test_split(df_test, test_size=0.5, random_state = RANDOM_SEED)
len(df_train), len(df_val), len(df_test)

(24664, 5285, 5286)

In [5]:
tokenizer = MyTokenizer(VOCAB_PATH)
MAX_LEN = 512
BATCH_SIZE = 4
VOCAB_SIZE = 20000
DROPOUT = 0.1
EMBED_DIM , ATT_HEADS, D_FF = 768, 12, 3072

model = MyBert(VOCAB_SIZE, MAX_LEN, EMBED_DIM, ATT_HEADS, D_FF, DROPOUT, NUM_LABELS, device, N=1).to(device)
model.load_state_dict(torch.load(os.path.join('model', 'bestMyBert.bin'), weights_only=True))

<All keys matched successfully>

In [6]:
def get_predictions(model, data_loader, device):
    model.eval()
    review_texts = []
    values, predictions, values_prob, predictions_prob = [], [], [], []

    with torch.no_grad():
        for d in data_loader:
            texts = d["review_text"]
            review_texts.append(' '.join(texts))

            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            values.extend(targets.tolist())     

            output_model = model(input_ids, attention_mask)
            preds = torch.argmax(output_model, dim = 1)
            predictions.extend(preds.tolist())      
            
            output_model = F.softmax(output_model,dim=1)
            out_targ = output_model[torch.arange(targets.size(0)), targets]
            values_prob.extend(out_targ.tolist())
            out_preds = output_model[torch.arange(preds.size(0)), preds]
            predictions_prob.extend(out_preds.tolist())

    values_prob = [round(num, 3) for num in values_prob]
    predictions_prob = [round(num, 3) for num in predictions_prob]
    return review_texts, values, predictions, values_prob, predictions_prob

In [8]:
dataset_eval_torch = DataLoaderBert(df_test['Review'].to_list(), df_test['Score_G'].to_list(),tokenizer, MAX_LEN, include_raw_text=True)
eval_dataloader = DataLoader(dataset_eval_torch, batch_size=1, shuffle=True)

In [9]:
reviews, values, predictions, values_prob, predictions_prob = get_predictions(model, eval_dataloader, device)
print(reviews[0])
values[0], predictions[0], values_prob[0], predictions_prob[0]

pedí unos caracoles y la verdad es que se pasaron de coccion , bastante secos y duros , la salsa brava y las patatas muy mejorables , por el resto el váter correcto y los precios un poco elevados , local amplio , tipico para hacer cenas de grupos


(2, 2, 0.371, 0.371)

In [13]:
name_model = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = BertTokenizer.from_pretrained(name_model)
MAX_LEN = 512
DROPOUT = 0.1

model_bert = Bert_FineTuned(name_model, True, NUM_LABELS, DROPOUT).to(device)
model_bert.load_state_dict(torch.load(os.path.join('model', 'bestBert.bin'), weights_only=True))

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [7]:
def get_prediction_bert(model, review):
    #tokenizer = MyTokenizer(VOCAB_PATH)
    tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
    encoding = tokenizer.encode_plus(review,
                                          add_special_tokens=True, #Añade los tokens especiales que necesita BERT, [CLS] + [SEP]
                                          padding='max_length',  #Añade el pad para que todas las secuencias tengan la misma longitud
                                          truncation=True, #Trunca las secuancias más largas de 512, el BERT solo permite 512 tokens
                                          max_length=512,
                                          return_token_type_ids=False,  #Devuelve si un token pertenece a la primera o a la segunda secuencia
                                          return_attention_mask=True,
                                          return_tensors='pt')

    inputs_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    output_model = model(inputs_ids, attention_mask)
    preds = torch.argmax(output_model, dim = 1).tolist()
    return preds[0]

In [27]:
df = pd.DataFrame(columns=['Review', 'Punt_real', 'Punt_pred', 'Prob_real', 'Prob_pred', 'Pred_BERT'])

for i, rev in enumerate(reviews):
    if abs(values[i] - predictions[i]) >= 3:
        res = {
            'Review': rev,
            'Punt_real': values[i],
            'Punt_pred': predictions[i],
            'Prob_real': values_prob[i],
            'Prob_pred': predictions_prob[i],
            'Pred_BERT': get_prediction_bert(model_bert, rev)
        }
        df.loc[len(df)] = res

        
df.to_excel('predicciones_modelos.xlsx', index=False)
len(df)

311

In [28]:
print(f"Porcentaje de datos con una diferencia de 3 o más: {round(len(df)/len(reviews),3)}")

Porcentaje de datos con una diferencia de 3 o más: 0.059
