In [1]:
pip install transformers



In [2]:
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from textwrap import wrap

In [3]:
# Global configuration and reproducibility setup
RANDOM_SEED = 42    # shared seed for NumPy and PyTorch
MAX_LEN = 200       # max_length handed to the tokenizer
BATCH_SIZE = 16     # samples per DataLoader batch
NCLASSES = 3        # number of outputs of the classification head
DATASET_PATH = '/content/drive/MyDrive/Datos.csv'

# Seed both random number generators used by the notebook.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Prefer the first CUDA device when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cuda:0


In [4]:
# Mount Google Drive and read the dataset
from google.colab import drive
drive.mount('/content/drive')

# Keep at most the first 10000 rows of the CSV.
df = pd.read_csv(DATASET_PATH).iloc[:10000]

Mounted at /content/drive


In [5]:
# Quick look at the data: first rows, dimensions, and the first review wrapped for display.
print(df.head())
print(df.shape)
first_review = df['review'][0]
print(*wrap(first_review), sep="\n")

                                  review sentiment
0          Me gusta mucho este producto.  Positivo
1   No estoy seguro de si me gusta o no.    Neutro
2  El servicio al cliente fue excelente.  Positivo
3   No me gustó la calidad del producto.  Negativo
4     La entrega fue rápida y eficiente.  Positivo
(107, 2)
Me gusta mucho este producto.


In [6]:
# Re-encode the dataset: replace the textual sentiment with an integer class id
# (0 = Negativo, 1 = Neutro, 2 = Positivo) suitable for CrossEntropyLoss.
LABEL_BY_SENTIMENT = {'Negativo': 0, 'Neutro': 1, 'Positivo': 2}

label_column = df['sentiment'].map(LABEL_BY_SENTIMENT)

# Series.map yields NaN for values missing from the mapping; fail here with a
# clear message instead of letting float/NaN labels reach the training loop.
unmapped_sentiments = df.loc[label_column.isna(), 'sentiment'].unique().tolist()
if unmapped_sentiments:
  raise ValueError(
      'Unexpected values in the sentiment column: {}'.format(unmapped_sentiments)
  )

df['label'] = label_column.astype('int64')
df = df.drop(columns='sentiment')
df.head()

Unnamed: 0,review,label
0,Me gusta mucho este producto.,2
1,No estoy seguro de si me gusta o no.,1
2,El servicio al cliente fue excelente.,2
3,No me gustó la calidad del producto.,0
4,La entrega fue rápida y eficiente.,2


In [7]:
# TOKENIZATION
# Checkpoint identifier shared by the tokenizer and the BERT encoder.
PRE_TRAINED_MODEL_NAME = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=PRE_TRAINED_MODEL_NAME
)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [8]:
# Tokenization example
sample_txt = 'Yo realmente amo esta película'
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Show the sentence, its tokens and their numeric ids.
for caption, value in (
    ('Frase: ', sample_txt),
    ('Tokens: ', tokens),
    ('Tokens numéricos: ', token_ids),
):
  print(caption, value)


Frase:  Yo realmente amo esta película
Tokens:  ['Yo', 'realmente', 'amo', 'esta', 'película']
Tokens numéricos:  [30665, 38365, 20142, 11504, 14970]


In [9]:
# Encode the sample sentence into the tensors that are fed to BERT
encoding_options = dict(
    add_special_tokens=True,
    max_length=10,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='pt',
)
encoding = tokenizer.encode_plus(sample_txt, **encoding_options)

In [10]:
# Inspect which fields the tokenizer returned for the sample sentence.
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [11]:
# Decode the ids back to tokens, then show the raw ids and the attention mask.
sample_input_ids = encoding['input_ids'][0]
print(tokenizer.convert_ids_to_tokens(sample_input_ids))
print(sample_input_ids)
print(encoding['attention_mask'][0])

['[CLS]', 'Yo', 'realmente', 'amo', 'esta', 'película', '[SEP]', '[PAD]', '[PAD]', '[PAD]']
tensor([  101, 30665, 38365, 20142, 11504, 14970,   102,     0,     0,     0])
tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0])


In [12]:
# DATASET CREATION

class DatosDataset(Dataset):
  """Map-style dataset that tokenizes one review per item.

  Args:
    reviews: indexable collection of review texts.
    labels: indexable collection of integer class ids, aligned with ``reviews``.
    tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
    max_len: length every encoded review is padded or truncated to.
  """

  def __init__(self, reviews, labels, tokenizer, max_len):
    self.reviews = reviews
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.reviews)

  def __getitem__(self, item):
    """Return the raw text, encoded tensors and label of the review at ``item``."""
    review = str(self.reviews[item])
    label = self.labels[item]
    # Use the tokenizer injected at construction time rather than a notebook
    # global, so the dataset honors whatever tokenizer the caller passed in.
    encoding = self.tokenizer.encode_plus(
        review,
        max_length = self.max_len,
        truncation = True,
        add_special_tokens = True,
        return_token_type_ids = False,
        padding='max_length',
        return_attention_mask = True,
        return_tensors = 'pt'
        )

    return {
        'review': review,
        # return_tensors='pt' adds a leading batch axis; flatten it away so the
        # DataLoader can stack items into a batch.
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'label': torch.tensor(label, dtype=torch.long)
    }

In [13]:
# Data loader:

def data_loader(df, tokenizer, max_len, batch_size):
  """Wrap a reviews DataFrame in a ``DataLoader`` of tokenized batches.

  Args:
    df: DataFrame with a ``review`` column and a ``label`` column.
    tokenizer: tokenizer forwarded to ``DatosDataset``.
    max_len: length every encoded review is padded or truncated to.
    batch_size: number of samples per batch.

  Returns:
    A ``DataLoader`` over a ``DatosDataset``, iterated in dataset order by
    two worker processes.
  """
  dataset = DatosDataset(
      reviews = df.review.to_numpy(),
      labels = df.label.to_numpy(),
      tokenizer = tokenizer,
      # Honor the caller's arguments instead of the notebook-level constants.
      max_len = max_len
  )

  return DataLoader(dataset, batch_size = batch_size, num_workers = 2)

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(
    df,
    random_state=RANDOM_SEED,
    test_size=0.2,
)

# Build one loader per split with the same tokenizer, length and batch size.
train_data_loader, test_data_loader = (
    data_loader(split, tokenizer, MAX_LEN, BATCH_SIZE)
    for split in (df_train, df_test)
)


In [15]:
# THE MODEL

class BERTSentimentClassifier(nn.Module):
  """Pre-trained BERT encoder followed by dropout and a linear classification head."""

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    # Dropout on the pooled representation to reduce overfitting.
    self.drop = nn.Dropout(p=0.3)
    # Linear head: one output per class, fed with BERT's hidden size.
    hidden_size = self.bert.config.hidden_size
    self.out = nn.Linear(hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the class scores produced from BERT's pooled output."""
    encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    regularized = self.drop(encoded.pooler_output)
    logits = self.out(regularized)
    return logits

In [16]:
# Build the classifier and move its parameters to the selected device (the GPU when available).
model = BERTSentimentClassifier(NCLASSES).to(device)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [17]:
# Show the module tree of the classifier (BERT encoder plus the added layers).
print(model)

BERTSentimentClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [18]:
  # TRAINING SETUP
EPOCHS = 5

# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
total_steps = EPOCHS * len(train_data_loader)

# NOTE(review): transformers' AdamW is deprecated upstream in favor of
# torch.optim.AdamW; kept here so the optimization behavior is unchanged.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# Linear decay of the learning rate over total_steps, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)




In [19]:
# Training iteration
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Run one optimization pass over ``data_loader``.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: iterable of batches with ``input_ids``, ``attention_mask``
      and ``label`` tensors.
    loss_fn: callable mapping ``(outputs, labels)`` to a scalar loss tensor.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: denominator used for the accuracy.

  Returns:
    ``(accuracy, mean_loss)``: accuracy is a 0-dim float64 tensor; mean_loss
    is the average of the per-batch losses, or NaN when there were no batches.
  """
  model = model.train()
  losses = []
  # Tensor accumulator: keeps the return type a tensor even with zero batches
  # (a plain int has no .double()).
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  # Drop gradients that an interrupted earlier run may have left behind, so
  # they cannot leak into the first optimizer step.
  optimizer.zero_grad()

  for batch in data_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, labels)
    correct_predictions += torch.sum(preds == labels)
    losses.append(loss.item())

    loss.backward()

    # Cap the global gradient norm at 1.0 before the update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  mean_loss = np.mean(losses) if losses else np.nan
  return correct_predictions.double()/n_examples, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Measure accuracy and mean loss over ``data_loader`` without updating the model.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: iterable of batches with ``input_ids``, ``attention_mask``
      and ``label`` tensors.
    loss_fn: callable mapping ``(outputs, labels)`` to a scalar loss tensor.
    device: device the batch tensors are moved to.
    n_examples: denominator used for the accuracy.

  Returns:
    ``(accuracy, mean_loss)``: accuracy is a 0-dim float64 tensor; mean_loss
    is the average of the per-batch losses, or NaN when there were no batches.
  """
  model = model.eval()
  losses = []
  # Tensor accumulator: keeps the return type a tensor even with zero batches
  # (a plain int has no .double()).
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)
  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)
      outputs = model(input_ids = input_ids, attention_mask = attention_mask)
      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, labels)
      correct_predictions += torch.sum(preds == labels)
      losses.append(loss.item())

  mean_loss = np.mean(losses) if losses else np.nan
  return correct_predictions.double()/n_examples, mean_loss


In [20]:
for batch in train_data_loader:
    # Sanity check on one batch: the three model inputs must be tensors.
    ids_batch = batch['input_ids']
    mask_batch = batch['attention_mask']
    label_batch = batch['label']

    print('Is input_ids a tensor?', torch.is_tensor(ids_batch))
    print('Is attention_mask a tensor?', torch.is_tensor(mask_batch))
    print('Is label a tensor?', torch.is_tensor(label_batch))

    print('Shape of input_ids:', ids_batch.shape)
    print('Shape of attention_mask:', mask_batch.shape)
    print('Shape of label:', label_batch.shape)

    # Same device transfer the training loop applies to the labels.
    labels = label_batch.to(device)
    print("Type of labels in train_model:", type(labels))
    print("Shape of labels in train_model:", labels.shape)

    # Only the first batch is inspected.
    break

Is input_ids a tensor? True
Is attention_mask a tensor? True
Is label a tensor? True
Shape of input_ids: torch.Size([16, 200])
Shape of attention_mask: torch.Size([16, 200])
Shape of label: torch.Size([16])
Type of labels in train_model: <class 'torch.Tensor'>
Shape of labels in train_model: torch.Size([16])


In [21]:
# Training: one optimization pass and one evaluation pass per epoch

for epoch in range(EPOCHS):
  header = 'Epoch {} de {}'.format(epoch+1, EPOCHS)
  print(header)
  print('------------------')

  train_acc, train_loss = train_epoch(
      model, train_data_loader, loss_fn, optimizer, device, scheduler,
      n_examples=len(df_train),
  )
  test_acc, test_loss = eval_model(
      model, test_data_loader, loss_fn, device,
      n_examples=len(df_test),
  )

  # Report the metrics of both splits, followed by a blank separator line.
  print('Entrenamiento: Loss: {}, accuracy: {}'.format(train_loss, train_acc))
  print('Validación: Loss: {}, accuracy: {}'.format(test_loss, test_acc))
  print('')

Epoch 1 de 5
------------------
Entrenamiento: Loss: 1.0649518569310505, accuracy: 0.38823529411764707
Validación: Loss: 0.9973630905151367, accuracy: 0.4545454545454546

Epoch 2 de 5
------------------
Entrenamiento: Loss: 0.8171472549438477, accuracy: 0.5764705882352941
Validación: Loss: 0.5917802900075912, accuracy: 0.6818181818181819

Epoch 3 de 5
------------------
Entrenamiento: Loss: 0.3873760203520457, accuracy: 0.8352941176470589
Validación: Loss: 0.44691014289855957, accuracy: 0.8181818181818182

Epoch 4 de 5
------------------
Entrenamiento: Loss: 0.16155137680470943, accuracy: 0.9647058823529412
Validación: Loss: 0.3730474263429642, accuracy: 0.8181818181818182

Epoch 5 de 5
------------------
Entrenamiento: Loss: 0.06679400367041428, accuracy: 0.9882352941176471
Validación: Loss: 0.2607809379696846, accuracy: 0.9090909090909092



In [22]:
def classifySentiment(review_text):
  """Print ``review_text`` followed by the sentiment the model predicts for it.

  Relies on the notebook-level ``tokenizer``, ``model``, ``device`` and
  ``MAX_LEN``. Switches ``model`` to evaluation mode as a side effect.

  Args:
    review_text: a single review as a string.

  Returns:
    None. The wrapped text and the predicted sentiment are printed.
  """
  encoding_review = tokenizer.encode_plus(
      review_text,
      max_length = MAX_LEN,
      truncation = True,
      add_special_tokens = True,
      return_token_type_ids = False,
      # Same padding option as the training dataset; replaces the deprecated
      # pad_to_max_length flag.
      padding = 'max_length',
      return_attention_mask = True,
      return_tensors = 'pt'
      )

  input_ids = encoding_review['input_ids'].to(device)
  attention_mask = encoding_review['attention_mask'].to(device)

  # Inference only: disable dropout explicitly (do not depend on whichever mode
  # the last executed cell left the model in) and skip gradient tracking.
  model.eval()
  with torch.no_grad():
    output = model(input_ids, attention_mask)
  _, prediction = torch.max(output, dim=1)
  predicted_class = prediction.item()

  print("\n".join(wrap(review_text)))

  # Class ids follow the label encoding used for training.
  sentiment_by_class = {0: 'negativo', 1: 'neutro', 2: 'positivo'}
  sentiment = sentiment_by_class.get(predicted_class)
  if sentiment is not None:
    print('Sentimiento predicho: ' + sentiment)

In [23]:
# Try the classifier on a sentence that is not part of the sample shown above.
review_text = "No estoy ni en contra ni a favor del congreso"

classifySentiment(review_text)

No estoy ni en contra ni a favor del congreso
Sentimiento predicho: neutro


