In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install transformers

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): the notebook imports `F1` from torchmetrics; newer releases
# appear to expose it as `F1Score` instead - pin a compatible version if the
# import fails (exact version TODO confirm).
%pip install torchmetrics

In [None]:
import torch
from torch import nn, optim
from transformers import pipeline
from transformers import BertTokenizer, BertModel,  AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchmetrics import F1
import pandas as pd
import numpy as np

In [None]:
# Shell out to nvidia-smi to show the GPU status of this runtime.
!nvidia-smi

In [None]:
# Global configuration: RNG seed, maximum token length per review,
# batch size and number of target classes.
RANDOM_SEED = 42
MAX_LEN = 300
BATCH_SIZE = 16
NCLASSES = 3

# Seed both torch and numpy so runs are repeatable.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Google Drive sharing links for the train / validation / test CSV files.
# (url_train previously started with 'ttps://' - the scheme was missing its 'h'.)
url_train = 'https://drive.google.com/file/d/1OuZDURMv7uA692UxR2nfgzGUi0qcyw2Y/view?usp=sharing'

url_validation = 'https://drive.google.com/file/d/1kXG0kJl3_0NdnQmvBzYpvBcJEnXAFkcj/view?usp=sharing'

url_test = 'https://drive.google.com/file/d/1DQM2OX-WAqPDsuQutJjgl7ebOKsmn7xp/view?usp=sharing'

def path_download_csv(url):
    """Turn a Google Drive sharing link into a direct-download URL.

    The file id is taken as the second-to-last '/'-separated segment of
    ``url`` (e.g. ``.../file/d/<id>/view?usp=sharing``) and appended to the
    ``uc?export=download&id=`` endpoint.
    """
    segments = url.split('/')
    file_id = segments[-2]
    return 'https://drive.google.com/uc?export=download&id=' + file_id

In [None]:
# Download the three splits (train, validation, test) as DataFrames,
# in that order, from their direct-download URLs.
train, validation, test = [
    pd.read_csv(path_download_csv(link))
    for link in (url_train, url_validation, url_test)
]

In [None]:
# Preview only the first rows instead of displaying the whole DataFrame.
test.head()

In [None]:
# Load the pretrained tokenizer for 'dccuchile/bert-base-spanish-wwm-cased'
# (the same pretrained name the Beto model class loads).
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

In [None]:
# Tokenisation sanity check on one training review: print the raw text,
# its tokens and the corresponding vocabulary ids, one per line.
text = train['review'][2]
tokens = tokenizer.tokenize(text)
tokens_id = tokenizer.convert_tokens_to_ids(tokens)
print(text)
print(tokens)
print(tokens_id)

In [None]:
class AmazonDataset(Dataset):
    """Map-style dataset that tokenises one review per item.

    Args:
        reviews: indexable collection of review texts.
        labels: indexable collection of integer class labels, aligned with
            ``reviews``.
        tokenizer: object exposing ``encode_plus`` (here a ``BertTokenizer``).
        max_len: length every encoded review is truncated / padded to.
    """

    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the raw text, token ids, attention mask and label of one review."""
        review = str(self.reviews[item])
        label = self.labels[item]
        # Use the tokenizer handed to the constructor; the previous code
        # silently depended on a module-level `tokenizer` variable.
        encoding = self.tokenizer.encode_plus(
            review,
            max_length=self.max_len,
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt')
        return {
            'review': review,
            # flatten() turns the tensors returned by the tokenizer into 1-D
            # tensors before the DataLoader batches them.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)}

In [None]:
def data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
  """Wrap a DataFrame in an ``AmazonDataset`` and return a ``DataLoader`` over it.

  Args:
    df: DataFrame with ``review`` and ``label`` columns.
    tokenizer: tokenizer forwarded to ``AmazonDataset``.
    max_len: encoded length forwarded to ``AmazonDataset``.
    batch_size: number of examples per batch.
    shuffle: forwarded to ``DataLoader``; defaults to ``False``, which keeps
      the previous behaviour (no shuffling).
  """
  dataset = AmazonDataset(
    reviews=df.review.to_numpy(),
    labels=df.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  # Honour the `batch_size` argument; the previous code ignored it and read
  # the global BATCH_SIZE instead.
  return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=2)


In [None]:
# One DataLoader per split, all sharing the tokenizer, MAX_LEN and BATCH_SIZE.
train_data_loader = data_loader(
    df=train, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE
)
validation_data_loader = data_loader(
    df=validation, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE
)
test_data_loader = data_loader(
    df=test, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE
)

In [None]:
class Beto(nn.Module):
    """Sentence classifier: pretrained Spanish BERT encoder, dropout, linear head."""

    def __init__(self, n_class):
        """Load the pretrained encoder and build a head with ``n_class`` outputs."""
        super().__init__()
        self.bert = BertModel.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
        self.do = nn.Dropout(0.5)
        hidden_size = self.bert.config.hidden_size
        self.linear = nn.Linear(hidden_size, n_class)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's output (one score per class) for a batch."""
        bert_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        # With return_dict=False the encoder returns a tuple; the classifier
        # consumes its second element.
        cls_output = bert_outputs[1]
        return self.linear(self.do(cls_output))


In [None]:
# Build the classifier and move it to the selected device in one step.
model = Beto(NCLASSES).to(device)

In [None]:
EPOCHS = 5

# The training function steps the scheduler once per batch, so the schedule
# spans (batches per epoch) * EPOCHS steps.
total_steps = len(train_data_loader) * EPOCHS

optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False)

# Linear learning-rate schedule over total_steps with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)


In [None]:
# Cross-entropy loss applied to the model outputs and the integer labels.
loss_fn = nn.CrossEntropyLoss().to(device)
# Size the metric from NCLASSES (instead of a hard-coded 3) so it stays
# consistent with the classifier head.
f1 = F1(num_classes=NCLASSES).to(device)

In [None]:
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Train ``model`` for one pass over ``data_loader`` and return epoch metrics.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: sized iterable of batches with 'input_ids',
      'attention_mask' and 'label' entries.
    loss_fn: loss applied to ``(outputs, labels)``.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: number of examples, used as the accuracy denominator.

  Returns:
    Tuple ``(accuracy, mean_loss, mean_f1)``: accuracy is a tensor; the other
    two are the means of the per-batch loss and per-batch F1 values.

  Uses the module-level ``f1`` metric.
  """
  model = model.train()
  n_batches = len(data_loader)
  losses = []
  f1_score_global = []
  # Tensor accumulator so `.double()` below also works for an empty loader.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)
  for i, batch in enumerate(data_loader, start=1):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, labels)
    correct_predictions += torch.sum(preds == labels)
    f1_score_global.append(f1(preds, labels).cpu().detach().numpy())
    loss_value = loss.item()
    losses.append(loss_value)
    loss.backward()
    # Clip the global gradient norm to 1.0 before the optimizer step.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
    # Progress is reported as batch i of n_batches (taken from the loader, not
    # from a global batch size) and the loss as a plain float.
    print('Ejemplo {}/{} , Entrenamiento: Loss: {}, mean f1: {}'.format(i, n_batches, loss_value, np.mean(f1_score_global)))
  return correct_predictions.double()/n_examples, np.mean(losses), np.mean(f1_score_global)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples, modo):
  """Evaluate ``model`` over ``data_loader`` without gradients and return metrics.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: sized iterable of batches with 'input_ids',
      'attention_mask' and 'label' entries.
    loss_fn: loss applied to ``(outputs, labels)``.
    device: device the batch tensors are moved to.
    n_examples: number of examples, used as the accuracy denominator.
    modo: label included in the progress lines (e.g. 'Validación', 'Test').

  Returns:
    Tuple ``(accuracy, mean_loss, mean_f1)``: accuracy is a tensor; the other
    two are the means of the per-batch loss and per-batch F1 values.

  Uses the module-level ``f1`` metric.
  """
  model = model.eval()
  n_batches = len(data_loader)
  losses = []
  f1_score_global = []
  # Tensor accumulator so `.double()` below also works for an empty loader.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)
  with torch.no_grad():
    for i, batch in enumerate(data_loader, start=1):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)
      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      _, preds = torch.max(outputs, dim=1)
      f1_score_global.append(f1(preds, labels).cpu().detach().numpy())
      loss_value = loss_fn(outputs, labels).item()
      correct_predictions += torch.sum(preds == labels)
      losses.append(loss_value)
      # Progress is reported as batch i of n_batches (taken from the loader;
      # floor division by a global batch size under-counted the last batch).
      print('Ejemplo {}/{} , {}: Loss: {}, mean f1: {}'.format(i, n_batches, modo, loss_value, np.mean(f1_score_global)))
  return correct_predictions.double()/n_examples, np.mean(losses), np.mean(f1_score_global)

In [None]:
def count_parameters(model):
    """Return the total element count of ``model``'s parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Report the trainable-parameter count of the instantiated model.
print(f'El modelo tiene {count_parameters(model)} de parámetros')

In [None]:
# Train for EPOCHS epochs; after each epoch, evaluate on the validation split
# and write a checkpoint file named checkpoint_<epoch>.pth.
for epoch in range(EPOCHS):
  print('Epoch {} de {}'.format(epoch+1, EPOCHS))
  print('------------------')
  train_acc, train_loss, train_f1 = train_model(
    model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train)
  )
  validation_acc, validation_loss, validation_f1 = eval_model(
    model, validation_data_loader, loss_fn, device, len(validation), 'Validación'
  )
  # Store the scheduler state as well, so that a resumed run can continue the
  # learning-rate schedule instead of starting it over.
  checkpoint = {'epoch': epoch + 1, 'state_dict': model.state_dict(),
             'optimizer': optimizer.state_dict(),
             'scheduler': scheduler.state_dict()}
  torch.save(checkpoint, f'checkpoint_{epoch+1}.pth')
  print('Entrenamiento: Loss: {}, accuracy: {}, f1: {}'.format(train_loss, train_acc, train_f1))
  print('Validación: Loss: {}, accuracy: {}, f1: {}'.format(validation_loss, validation_acc, validation_f1))
  print('')

# Uso de Checkpoint

In [None]:
# Mount Google Drive at /content/drive/ (this cell requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Absolute path under the Drive mount point, so re-running this cell does not
# depend on the current working directory; explicit %cd avoids relying on automagic.
%cd /content/drive/MyDrive

In [None]:
# -p makes the cell safe to re-run when the folder already exists.
!mkdir -p /content/drive/MyDrive/checkpoint/

In [None]:
# Absolute path so the cell lands in the checkpoint folder no matter which
# directory is current when it is (re-)run.
%cd /content/drive/MyDrive/checkpoint

In [None]:
# List the contents of the current directory.
%ls

In [None]:
# Load a saved checkpoint from the current directory; map_location=device
# remaps the stored tensors onto the active device.
# NOTE(review): 'Copy of checkpoint_1.pth' looks like a manually copied file - confirm it exists in this folder.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
loaded_checkpoint = torch.load('Copy of checkpoint_1.pth', map_location=device)

In [None]:
# Resume bookkeeping: the epoch counter and optimizer state come from the checkpoint.
last_epoch = loaded_checkpoint['epoch']
optimizer.load_state_dict(loaded_checkpoint['optimizer'])
# Also restore the learning-rate schedule when the checkpoint carries it;
# checkpoints without a 'scheduler' entry are still accepted unchanged.
if 'scheduler' in loaded_checkpoint:
  scheduler.load_state_dict(loaded_checkpoint['scheduler'])

In [None]:
# Display the optimizer (after loading the checkpointed state) as the cell output.
optimizer

In [None]:
# Restore the model weights stored under the checkpoint's 'state_dict' key.
model.load_state_dict(loaded_checkpoint['state_dict'])

In [None]:
# Save only the model weights (no optimizer state) to Beto_weight<epoch>.pth in the current directory.
torch.save(loaded_checkpoint['state_dict'],f'Beto_weight{last_epoch}.pth')

In [None]:
# Continue training from the checkpointed epoch up to EPOCHS; after each epoch,
# evaluate on the validation split and write checkpoint_<epoch>.pth.
for epoch in range(last_epoch, EPOCHS):
  print('Epoch {} de {}'.format(epoch+1, EPOCHS))
  print('------------------')
  train_acc, train_loss, train_f1 = train_model(
    model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train)
  )
  validation_acc, validation_loss, validation_f1 = eval_model(
    model, validation_data_loader, loss_fn, device, len(validation),  'Validación'
  )
  # Store the scheduler state as well, so that a later resume can continue the
  # learning-rate schedule instead of starting it over.
  checkpoint = {'epoch': epoch + 1, 'state_dict': model.state_dict(),
             'optimizer': optimizer.state_dict(),
             'scheduler': scheduler.state_dict()}
  torch.save(checkpoint, f'checkpoint_{epoch+1}.pth')
  print('Entrenamiento: Loss: {}, accuracy: {}, f1: {}'.format(train_loss, train_acc, train_f1))
  print('Validación: Loss: {}, accuracy: {}, f1: {}'.format(validation_loss, validation_acc, validation_f1))
  print('')

# Test

In [None]:
# Final evaluation on the held-out test split.
print('------------------')
test_acc, test_loss, test_f1 = eval_model(
    model=model,
    data_loader=test_data_loader,
    loss_fn=loss_fn,
    device=device,
    n_examples=len(test),
    modo='Test',
)
test_summary = 'Test: Loss: {}, accuracy: {}, f1: {}'.format(test_loss, test_acc, test_f1)
print(test_summary)
print('')

In [None]:
def clasificacion_sentimiento(review):
  """Classify ``review`` with the module-level ``model`` and print its sentiment.

  The text is encoded with the module-level ``tokenizer`` (truncated / padded
  to ``MAX_LEN``), run through the model, and the highest-scoring class is
  printed as 'Sentimiento negativo' (0), 'Sentimiento neutro' (1) or
  'Sentimiento positivo' (2). Nothing is returned.

  Side effect: the model is switched to evaluation mode.
  """
  encoding_review = tokenizer.encode_plus(
      review,
      max_length=MAX_LEN,
      truncation=True,
      add_special_tokens=True,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt'
    )

  input_ids = encoding_review['input_ids'].to(device)
  attention_mask = encoding_review['attention_mask'].to(device)
  # Inference only: evaluation mode disables the classifier's dropout layer
  # and no_grad() skips gradient tracking. Previously the result depended on
  # whichever mode the model had been left in by an earlier cell.
  model.eval()
  with torch.no_grad():
    output = model(input_ids, attention_mask)
  _, prediction = torch.max(output, dim=1)
  predicted_class = prediction.item()
  if predicted_class == 2:
    print('Sentimiento positivo')
  elif predicted_class == 1:
    print('Sentimiento neutro')
  elif predicted_class == 0:
    print('Sentimiento negativo')

In [None]:
# Example: enthusiastic restaurant review.
clasificacion_sentimiento("Excelente experiencia Comida maravillosa con excelentes insumos y perfectas mezclas.") 

In [None]:
# Example: mixed review (praises presentation and service, complains about price).
clasificacion_sentimiento("Muy buena presentación y servicio sin embargo exageradamente costoso y no se informa ennla carta.") 

In [None]:
# Example: complaint about a restaurant visit.
clasificacion_sentimiento('Los 330 soles (US$100) peor invertidos. Fui con mi hija a Cabrera de Miraflores, al llegar la reserva los trabajadores no supieron que hacer con lo que solicitamos')

In [None]:
# Example: lukewarm product review (good overall, some flaws).
clasificacion_sentimiento("la licuadora en general es buena pero tiene algunos fallos aun asi esta bien creo")

In [None]:
# Example: short, hesitant "it is ok" product review.
clasificacion_sentimiento('el producto esta ok eso creo')

In [None]:
# NOTE(review): same input text as the preceding example cell - likely a leftover duplicate.
clasificacion_sentimiento('el producto esta ok eso creo')

In [None]:
# Example: review whose wording may be sarcastic ("great, this novel will help me sleep better").
clasificacion_sentimiento('genial, esta novela me servira para dormir mejor')