## **Análisis de sentimientos**

In [1]:
# Importamos librerias necesarias de transformers para analizar sentimientos
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from textwrap import wrap

# BertModel carga el modelo BERT preentrenado, un codificador que produce representaciones contextuales del texto (no genera texto).
# BertTokenizer se utiliza para tokenizar el texto y convertirlo en tokens.
# AdamW es un optimizador que se utiliza para actualizar los pesos de la red neuronal.
# get_linear_schedule_with_warmup se utiliza para ajustar el learning rate de la red neuronal.

# sklearn es una librería de machine learning para Python (aquí se usa para dividir los datos en train y test).
# torch (PyTorch) es una librería de deep learning para Python.
# numpy es una librería de cálculo numérico con arrays para Python.
# textwrap es un módulo de la librería estándar de Python para ajustar texto a un ancho de línea.

In [2]:
# Global parameters used throughout the notebook
RANDOM_SEED = 42        # Fixed seed so that repeated runs behave the same way
MAX_LEN = 200           # Passed to the tokenizer as max_length
TRAIN_BATCH_SIZE = 16   # Batch size handed to the data loaders
DATASET_PATH = "IMDB_Dataset.csv"   # CSV file read with pandas
NCLASSES = 2            # Number of output classes of the classifier

# Seed numpy's global random generator
np.random.seed(RANDOM_SEED)

# Seed torch's random generator
torch.manual_seed(RANDOM_SEED)

# Select the compute device: the first CUDA GPU when available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(device)

cpu


In [13]:
# Load only the first 10,000 reviews. Reading with nrows avoids parsing the
# whole CSV and yields a frame that owns its data (not a slice of a larger
# frame), so adding columns to it later does not raise SettingWithCopyWarning.
df = pd.read_csv(DATASET_PATH, nrows=10000)

# Show the first row
print(df.head(1))

# Show the dataset size as (rows, columns)
print(df.shape)

# Print one example review, wrapped for readability
print('\n'.join(wrap(df.review[200])))

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
(10000, 2)
Interesting and short television movie describes some of the
machinations surrounding Jay Leno's replacing Carson as host of the
Tonight Show. Film is currently very topical given the public drama
surrounding Conan O'Brien and Jay Leno.<br /><br />The film does a
good job of sparking viewers' interest in the events and showing some
of the concerns of the stakeholders, particularly of the NBC
executives. The portrayal of Ovitz was particularly compelling and
interesting, I thought.<br /><br />Still, many of the characters were
only very briefly limned or touched upon, and some of the acting
seemed perfunctory. Nevertheless, an interesting story.


In [14]:
# Replace the text 'sentiment' column with a numeric 'label' column
# (1 for 'positive', 0 for anything else). assign/drop build a new frame
# instead of mutating in place, which avoids writing into a slice of the
# original data and the SettingWithCopyWarning that comes with it.
df = (
    df
    .assign(label=(df['sentiment'] == 'positive').astype(int))
    .drop(columns='sentiment')
)
df.head()


Unnamed: 0,review,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [15]:
# Name of the pretrained checkpoint shared by the tokenizer and the model
PRETRAINED_MODEL_NAME = 'bert-base-uncased'

# Load the tokenizer that belongs to that checkpoint (no text is tokenized yet)
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [18]:
# Tokenization example
sample_txt = 'I really like the movie, it was great!'
print(sample_txt)

# Split the text into tokens
tokens = tokenizer.tokenize(sample_txt)
print(tokens)

# Map each token to its vocabulary id
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens_ids)


I really like the movie, it was great!
['i', 'really', 'like', 'the', 'movie', ',', 'it', 'was', 'great', '!']
[1045, 2428, 2066, 1996, 3185, 1010, 2009, 2001, 2307, 999]


In [19]:
# Encode the sample in the format BERT expects: special tokens added,
# truncated/padded to exactly max_length tokens, returned as PyTorch tensors.
# padding='max_length' replaces the deprecated pad_to_max_length=True flag.
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length=10,
    truncation=True,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='pt',
)

print(encoding)



{'input_ids': tensor([[ 101, 1045, 2428, 2066, 1996, 3185, 1010, 2009, 2001,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}




In [23]:
# Look up the fields contained in the encoding
# (the result is not displayed because it is not the cell's last expression)
encoding.keys()

# Map the ids back to tokens, then show the raw id and attention-mask tensors
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0]))
print(encoding['input_ids'][0])
print(encoding['attention_mask'][0])


['[CLS]', 'i', 'really', 'like', 'the', 'movie', ',', 'it', 'was', '[SEP]']
tensor([ 101, 1045, 2428, 2066, 1996, 3185, 1010, 2009, 2001,  102])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


In [25]:
# Crear el dataset

class IMDBDataset(Dataset):
    """Dataset that tokenizes one review at a time for a BERT classifier.

    Args:
        reviews: Indexable sequence of review texts.
        labels: Indexable sequence of integer labels, parallel to ``reviews``.
        tokenizer: Tokenizer exposing ``encode_plus`` (a ``BertTokenizer`` in
            this notebook).
        max_len: Value passed to the tokenizer as ``max_length``; sequences
            are truncated and padded to this length.
    """

    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the encoded sample at position ``item``.

        Returns:
            dict with keys ``'review_text'`` (str), ``'input_ids'`` and
            ``'attention_mask'`` (flattened 1-D tensors) and ``'label'``
            (scalar ``torch.long`` tensor).
        """
        review = str(self.reviews[item])
        label = self.labels[item]

        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        encoding = self.tokenizer.encode_plus(
            review,
            max_length=self.max_len,
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # The tokenizer returns tensors with a leading batch dimension of 1;
            # flatten so the DataLoader can stack samples into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [27]:
# Introducir los datos al dataset

def data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=0):
    """Wrap a reviews DataFrame in an ``IMDBDataset`` and return a ``DataLoader``.

    Args:
        df: DataFrame with a ``'review'`` column and a ``'label'`` column.
        tokenizer: Tokenizer forwarded to ``IMDBDataset``.
        max_len: Maximum sequence length forwarded to ``IMDBDataset``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples every epoch.
            Defaults to ``False``, matching the previous behavior.
        num_workers: Number of loader worker processes. Defaults to 0
            (load in the main process): worker processes cannot import a
            dataset class defined inside a notebook on platforms that spawn
            processes (e.g. Windows), which makes iteration fail or hang.

    Returns:
        A ``torch.utils.data.DataLoader`` over the encoded reviews.
    """
    dataset = IMDBDataset(
        reviews=df['review'].to_numpy(),
        labels=df['label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        dataset=dataset,
        # Honor the caller's batch size instead of the global TRAIN_BATCH_SIZE.
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )


In [29]:
# Split the dataset: 20% of the rows go to the test set, the rest to training.
# random_state fixes the split so it is the same on every run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

# Build the data loaders. No shuffling is requested here, so the training
# batches are visited in the same order on every epoch.
train_loader = data_loader(train_df, tokenizer, MAX_LEN, TRAIN_BATCH_SIZE)
test_loader = data_loader(test_df, tokenizer, MAX_LEN, TRAIN_BATCH_SIZE)


In [30]:
# Crear el modelo
class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of output classes (size of the logits vector).
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRETRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalized class scores (logits) for a batch.

        Args:
            input_ids: Token id tensor produced by the tokenizer.
            attention_mask: Mask tensor produced by the tokenizer.

        Returns:
            Output of the linear head, one row of ``n_classes`` logits per sample.
        """
        # The encoder returns a ModelOutput object; tuple-unpacking it iterates
        # over its keys (strings) rather than tensors, so read the pooled
        # representation by name instead.
        bert_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        cls_output = bert_output.pooler_output
        drop_output = self.drop(cls_output)
        linear_output = self.linear(drop_output)
        return linear_output
    

In [31]:
# Instantiate the classifier and move its parameters to the selected device
model = SentimentClassifier(n_classes=NCLASSES)
model = model.to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [32]:
# Show the module structure of the classifier
print(model)


SentimentClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, eleme

In [36]:
# Training setup

# Number of passes over the training set
N_EPOCHS = 3

# Optimizer over all model parameters, and the total number of optimizer
# steps (one per batch, for every epoch)
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * N_EPOCHS

# Learning-rate scheduler with no warmup steps, spanning total_steps steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Loss function
loss_fn = nn.CrossEntropyLoss().to(device)

In [37]:
# Crear la función de entrenamiento
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning one row of class scores per sample.
        data_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Number of samples used as the accuracy denominator.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a float64 scalar tensor,
        mean_loss is the mean of the per-batch loss values.
    """
    model = model.train()
    losses = []
    # Kept as a tensor so the count works on any device and supports .double().
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Predicted class = index of the highest score in each row.
        _, preds = torch.max(outputs, dim=1)

        # Labels must stay a tensor: the loss function does not accept numpy arrays.
        loss = loss_fn(outputs, labels)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        # Backpropagate, clip the gradient norm, then update weights and learning rate.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

In [38]:
# Definir la función de evaluación
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning one row of class scores per sample.
        data_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.
        n_examples: Number of samples used as the accuracy denominator.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a float64 scalar tensor,
        mean_loss is the mean of the per-batch loss values.
    """
    model = model.eval()
    losses = []
    # Kept as a tensor so the count works on any device and supports .double().
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Predicted class = index of the highest score in each row.
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, labels)
            # Count matches with torch, not numpy: both operands are tensors.
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)


In [None]:
# Train the model: one training pass and one evaluation pass per epoch

for epoch in range(N_EPOCHS):
    print(f'Epoch {epoch + 1} / {N_EPOCHS}')
    print('-' * 10)
    
    # The dataframe lengths are used as the accuracy denominators
    train_acc, train_loss = train_model(
        model,
        train_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(train_df)
    )
    test_acc, test_loss = eval_model(
        model,
        test_loader,
        loss_fn,
        device,
        len(test_df)
    )
    print(f'Entrenamiento: Train loss {train_loss} accuracy {train_acc}')
    print(f'Evaluación: Test loss {test_loss} accuracy {test_acc}')
    print('-' * 10)

# Save only the learned parameters (state_dict), not the whole module object
torch.save(model.state_dict(), 'sentiment_model.pth')



Epoch 1 / 3
----------


In [None]:
# Funcion para predecir
def predict_sentiment(review_text):
    """Print a review followed by the sentiment predicted by the global ``model``.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and ``MAX_LEN``.
    The model is switched to evaluation mode and stays in that mode afterwards.

    Args:
        review_text: Text of the review to classify.

    Returns:
        None. The wrapped review and the predicted sentiment are printed.
    """
    # padding='max_length' replaces the deprecated pad_to_max_length=True.
    encoding_review = tokenizer.encode_plus(
        review_text,
        max_length=MAX_LEN,
        truncation=True,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding_review['input_ids'].to(device)
    attention_mask = encoding_review['attention_mask'].to(device)

    # Inference: disable dropout (eval mode) and gradient tracking so the
    # prediction is deterministic and no autograd graph is built.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
    _, preds = torch.max(outputs, dim=1)

    print('\n'.join(wrap(review_text)))
    # Class index 1 corresponds to the 'positive' label, 0 to 'negative'.
    if preds.item() == 1:
        print('Sentimiento predicho: Positivo')
    else:
        print('Sentimiento predicho: Negativo')

# Predict the sentiment of one review taken from the dataset (index label 0)
review_text = df.review[0]
predict_sentiment(review_text)
        
