In [None]:
pip install transformers



In [1]:
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from textwrap import wrap

In [63]:
# Inicialización
RANDOM_SEED = 42
MAX_LEN = 200
BATCH_SIZE = 16 #paquetes
DATASET_PATH = '/content/drive/MyDrive/Datos.csv'
NCLASSES = 3

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [64]:
# Cargar dataset
from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv(DATASET_PATH)
df = df[0:10000]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [65]:
print(df.head())
print(df.shape)
print("\n".join(wrap(df['Comentario'][0])))

                              Comentario Sentimiento
0          Me gusta mucho este producto.    Positivo
1   No estoy seguro de si me gusta o no.      Neutro
2  El servicio al cliente fue excelente.    Positivo
3   No me gustó la calidad del producto.    Negativo
4     La entrega fue rápida y eficiente.    Positivo
(107, 2)
Me gusta mucho este producto.


In [66]:
# Reajustar dataset
df['label'] = df['Sentimiento'].map({'Positivo': 1, 'Neutro': 0, 'Negativo': -1})
df.drop('Sentimiento', axis=1, inplace=True)
df.head()


Unnamed: 0,Comentario,label
0,Me gusta mucho este producto.,1
1,No estoy seguro de si me gusta o no.,0
2,El servicio al cliente fue excelente.,1
3,No me gustó la calidad del producto.,-1
4,La entrega fue rápida y eficiente.,1


In [67]:
# TOKENIZACIÓN
PRE_TRAINED_MODEL_NAME = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [68]:
# Ejemplo tokenización
sample_txt = 'Yo realmente amo esta película'
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print('Frase: ', sample_txt)
print('Tokens: ', tokens)
print('Tokens numéricos: ', token_ids)


Frase:  Yo realmente amo esta película
Tokens:  ['Yo', 'realmente', 'amo', 'esta', 'película']
Tokens numéricos:  [30665, 38365, 20142, 11504, 14970]


In [69]:
# Codificación para introducir a BERT
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length = 10,
    truncation = True,
    add_special_tokens = True,
    return_token_type_ids = False,
    pad_to_max_length = True,
    return_attention_mask = True,
    return_tensors = 'pt'
)



In [70]:
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [71]:
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0]))
print(encoding['input_ids'][0])
print(encoding['attention_mask'][0])

['[CLS]', 'Yo', 'realmente', 'amo', 'esta', 'película', '[SEP]', '[PAD]', '[PAD]', '[PAD]']
tensor([  101, 30665, 38365, 20142, 11504, 14970,   102,     0,     0,     0])
tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0])


In [72]:
# CREACIÓN DATASET

class DatosDataset(Dataset):

  def __init__(self,reviews,labels,tokenizer,max_len):
    self.reviews = reviews
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
      return len(self.reviews)

  def __getitem__(self, item):
    review = str(self.reviews[item])
    label = self.labels[item]
    encoding = tokenizer.encode_plus(
        review,
        max_length = self.max_len,
        truncation = True,
        add_special_tokens = True,
        return_token_type_ids = False,
        pad_to_max_length = True,
        return_attention_mask = True,
        return_tensors = 'pt'
        )


    return {
          'review': review,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'label': torch.tensor(label, dtype=torch.long)
      }

In [73]:
# Data loader:

def data_loader(df, tokenizer, max_len, batch_size):
  dataset = DatosDataset(
      reviews = df.Comentario.to_numpy(),
      labels = df.label.to_numpy(),
      tokenizer = tokenizer,
      max_len = MAX_LEN
  )

  return DataLoader(dataset, batch_size = BATCH_SIZE, num_workers = 4)

In [76]:
df_train, df_test = train_test_split(df, test_size = 0.2, random_state=RANDOM_SEED)

train_data_loader = data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)


In [77]:
# EL MODELO

class BERTSentimentClassifier(nn.Module):

  def __init__(self, n_classes):
    super(BERTSentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3) #evita overfiting
    self.linear = nn.Linear(self.bert.config.hidden_size, n_classes) #Añade capa lineal , nuemro de neuronas

  def forward(self, input_ids, attention_mask):
    _, cls_output = self.bert(
        input_ids = input_ids,
        attention_mask = attention_mask
    )
    drop_output = self.drop(cls_output)
    output = self.linear(drop_output)
    return output

In [81]:
model = BERTSentimentClassifier(NCLASSES)
model = model.to(device) #Llevarlo a la GPU

In [82]:
#print(model)

BERTSentimentClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [83]:
  # ENTRENAMIENTO
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)


