In [None]:
!pip install transformers
!pip install torchmetrics



In [None]:
import torch
from torch import nn, optim
from transformers import pipeline
from transformers import BertTokenizer, BertModel,  AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from torchmetrics import F1
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np


In [None]:
# Google Drive sharing links for the three dataset splits (train / validation / test).
url_train = 'https://drive.google.com/file/d/1OuZDURMv7uA692UxR2nfgzGUi0qcyw2Y/view?usp=sharing'

url_validation = 'https://drive.google.com/file/d/1kXG0kJl3_0NdnQmvBzYpvBcJEnXAFkcj/view?usp=sharing'

url_test = 'https://drive.google.com/file/d/1DQM2OX-WAqPDsuQutJjgl7ebOKsmn7xp/view?usp=sharing'

def path_download_csv(url):
    """Turn a Google Drive sharing link into a direct-download URL.

    Args:
        url: A Drive link such as ``.../file/d/<id>/view?usp=sharing``,
            ``.../file/d/<id>`` or ``.../open?id=<id>``.

    Returns:
        ``https://drive.google.com/uc?export=download&id=<id>``.

    Raises:
        IndexError: If no id can be located and the link has fewer than two
            ``/``-separated parts (same failure mode as the legacy rule).
    """
    import re  # local import: keeps this cell self-contained

    # Prefer an explicit id marker; the legacy "second-to-last path segment"
    # rule is only correct when the link ends in "/view...".
    match = re.search(r'/d/([^/?#]+)', url) or re.search(r'[?&]id=([^&#]+)', url)
    file_id = match.group(1) if match else url.split('/')[-2]
    return 'https://drive.google.com/uc?export=download&id=' + file_id

In [None]:
# Hyper-parameters shared by the data pipeline and the model.
RANDOM_SEED = 42   # seed passed to NumPy and PyTorch below
MAX_LEN = 200      # max_length handed to the tokenizer
BATCH_SIZE = 16    # examples per DataLoader batch
NCLASSES = 3       # number of output classes of the classifier

# Seed NumPy and PyTorch so repeated runs start from the same RNG state.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA device when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [None]:
# Read the three dataset splits from CSV files in the working directory.
train, validation, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'validation.csv', 'test.csv')
)

In [None]:
# Preview only the first rows; displaying the whole frame bloats the notebook output.
test.head()

In [None]:
# Load the tokenizer published with the 'bert-base-multilingual-cased' checkpoint
# (the same checkpoint name the model class loads).
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-multilingual-cased'
)

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
# Walk one training review through the tokenizer: raw text -> tokens -> vocabulary ids.
text = train['review'][3]
tokens = tokenizer.tokenize(text)
tokens_id = tokenizer.convert_tokens_to_ids(tokens)
# One item per line, exactly as print(..., sep='\n') would render them.
print('\n'.join(str(part) for part in (text, tokens, tokens_id)))

In [None]:
class AmazonDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        reviews: Indexable collection of review texts.
        labels: Indexable collection of integer class ids aligned with ``reviews``.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer in
            this notebook).
        max_len: Length every encoded review is padded / truncated to.
    """

    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Encode the review at position ``item``.

        Returns:
            dict with the raw ``review`` text, 1-D ``input_ids`` and
            ``attention_mask`` tensors, and the ``label`` as a ``torch.long``
            tensor.
        """
        review = str(self.reviews[item])
        label = self.labels[item]
        # Use the tokenizer given to the constructor rather than a notebook-level
        # global, so the dataset works with whatever tokenizer it was built with.
        encoding = self.tokenizer.encode_plus(
            review,
            max_length=self.max_len,
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'review': review,
            # The tokenizer output is flattened to 1-D so the DataLoader can
            # stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }

In [None]:
def data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
  """Wrap a DataFrame of reviews in a ``DataLoader`` of tokenized batches.

  Args:
    df: DataFrame with a ``review`` column (text) and a ``label`` column.
    tokenizer: Tokenizer forwarded to ``AmazonDataset``.
    max_len: Padded / truncated sequence length forwarded to ``AmazonDataset``.
    batch_size: Number of examples per batch.
    shuffle: Whether to reshuffle the data every epoch (default ``False``,
      which matches the previous behavior).
    num_workers: Worker processes used for loading (default ``2``, as before).

  Returns:
    A ``torch.utils.data.DataLoader`` over the encoded reviews.
  """
  dataset = AmazonDataset(
    reviews=df.review.to_numpy(),
    labels=df.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  # Honor the caller's batch_size instead of the notebook-level BATCH_SIZE.
  return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)


In [None]:
# Build one loader per split; all three share the tokenizer, sequence length and batch size.
train_data_loader, validation_data_loader, test_data_loader = (
    data_loader(split_df, tokenizer, MAX_LEN, BATCH_SIZE)
    for split_df in (train, validation, test)
)

In [None]:
class MultilingualBert(nn.Module):
    """Multilingual BERT encoder followed by dropout and a linear classification head.

    Args:
        n_class: Number of output classes (width of the returned logits).
    """

    def __init__(self, n_class):
        super().__init__()
        # The attribute names below end up as state_dict keys, so saved
        # checkpoints depend on them staying unchanged.
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        self.do = nn.Dropout(0.5)
        self.linear = nn.Linear(self.bert.config.hidden_size, n_class)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalized class scores for a batch of encoded reviews."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element feeds the classification head.
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        _unused_first, cls_output = encoder_outputs
        return self.linear(self.do(cls_output))


In [None]:
# Build the classifier and move its parameters to the selected device in one step.
model = MultilingualBert(NCLASSES).to(device)

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def count_parameters(model):
    """Return how many scalar weights of ``model`` are trainable.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

In [None]:
# Report how many trainable parameters the classifier has.
count_parameters(model)

177855747

In [None]:
EPOCHS = 5

# train_model steps the scheduler once per batch, so the schedule has to span
# every batch of every epoch.
total_steps = EPOCHS * len(train_data_loader)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# Linear schedule with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
  optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

In [None]:
# Cross-entropy loss applied to the raw scores returned by the classifier.
loss_fn = nn.CrossEntropyLoss().to(device)
# Size the metric from NCLASSES so it cannot drift from the classifier's output width.
f1 = F1(num_classes=NCLASSES).to(device)

In [None]:
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples, metric_fn=None):
  """Run one training epoch and report accuracy, mean loss and mean per-batch metric.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: Iterable of batches; each batch is a mapping with the keys
      ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
    loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
    optimizer: Optimizer stepped once per batch.
    device: Device the batch tensors are moved to.
    scheduler: LR scheduler stepped once per batch, right after the optimizer.
    n_examples: Number of examples in the epoch (accuracy denominator).
    metric_fn: Optional callable ``metric_fn(preds, labels)`` returning a
      tensor. Defaults to the notebook-level ``f1`` metric, so existing calls
      behave as before.

  Returns:
    Tuple ``(accuracy, mean_loss, mean_metric)`` where ``accuracy`` is a
    float64 tensor and the other two are NumPy means over the batches.
  """
  if metric_fn is None:
    metric_fn = f1
  # Show the real number of batches; fall back to '?' for loaders without a length.
  try:
    total_batches = len(data_loader)
  except TypeError:
    total_batches = '?'

  model.train()
  losses = []
  f1_score_global = []
  # Start from a tensor so the return type is the same even when no batch is seen.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)
  for i, batch in enumerate(data_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, labels)
    batch_loss = loss.item()

    correct_predictions += torch.sum(preds == labels)
    f1_score_global.append(metric_fn(preds, labels).cpu().detach().numpy())
    losses.append(batch_loss)

    loss.backward()
    # Clip the global gradient norm to 1.0 before the update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
    print('Ejemplo {}/{} , Entrenamiento: Loss: {}, mean f1: {}'.format(i, total_batches, batch_loss, np.mean(f1_score_global)))
  return correct_predictions.double()/n_examples, np.mean(losses), np.mean(f1_score_global)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples, modo, metric_fn=None):
  """Evaluate the model on one pass over ``data_loader`` without updating weights.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: Iterable of batches; each batch is a mapping with the keys
      ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
    loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
    device: Device the batch tensors are moved to.
    n_examples: Number of examples in the split (accuracy denominator).
    modo: Label printed in the progress line (e.g. ``'Validación'``, ``'Test'``).
    metric_fn: Optional callable ``metric_fn(preds, labels)`` returning a
      tensor. Defaults to the notebook-level ``f1`` metric, so existing calls
      behave as before.

  Returns:
    Tuple ``(accuracy, mean_loss, mean_metric)`` where ``accuracy`` is a
    float64 tensor and the other two are NumPy means over the batches.
  """
  if metric_fn is None:
    metric_fn = f1
  # Show the real number of batches; fall back to '?' for loaders without a length.
  try:
    total_batches = len(data_loader)
  except TypeError:
    total_batches = '?'

  model.eval()
  losses = []
  f1_score_global = []
  # Start from a tensor so the return type is the same even when no batch is seen.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)
  with torch.no_grad():
    for i, batch in enumerate(data_loader):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)

      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      _, preds = torch.max(outputs, dim=1)
      f1_score_global.append(metric_fn(preds, labels).cpu().detach().numpy())
      batch_loss = loss_fn(outputs, labels).item()

      correct_predictions += torch.sum(preds == labels)
      losses.append(batch_loss)
      print('Ejemplo {}/{} , {}: Loss: {}, mean f1: {}'.format(i, total_batches, modo, batch_loss, np.mean(f1_score_global)))
  return correct_predictions.double()/n_examples, np.mean(losses), np.mean(f1_score_global)

Si van a entrenar de corrido (entrenamiento largo), usen el bucle `for` de aquí abajo; si Colab se cuelga, vayan a la siguiente sección, «Usar checkpoints», para reanudar desde el último checkpoint guardado.

In [None]:
# Full training run: one training pass and one validation pass per epoch,
# with a checkpoint written after every epoch.
for epoch in range(EPOCHS):
  print('Epoch {} de {}'.format(epoch+1, EPOCHS))
  print('------------------')
  train_acc, train_loss, train_f1 = train_model(
    model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train)
  )
  validation_acc, validation_loss, validation_f1 = eval_model(
    model, validation_data_loader, loss_fn, device, len(validation), 'Validación'
  )
  # Store the LR-scheduler state next to the model and optimizer so a resumed
  # run can continue the same learning-rate decay instead of restarting it.
  checkpoint = {'epoch': epoch + 1, 'state_dict': model.state_dict(),
             'optimizer': optimizer.state_dict(),
             'scheduler': scheduler.state_dict()}
  torch.save(checkpoint, f'checkpoint_{epoch+1}.pth')
  print('Entrenamiento: Loss: {}, accuracy: {}, f1: {}'.format(train_loss, train_acc, train_f1))
  print('Validación: Loss: {}, accuracy: {}, f1: {}'.format(validation_loss, validation_acc, validation_f1))
  print('')

[1;30;43mSe truncaron las últimas líneas 5000 del resultado de transmisión.[0m
Ejemplo 7224/7500.0 , Entrenamiento: Loss: 0.5844606757164001, mean f1: 0.5186764597892761
Ejemplo 7225/7500.0 , Entrenamiento: Loss: 0.7076435685157776, mean f1: 0.5186998248100281
Ejemplo 7226/7500.0 , Entrenamiento: Loss: 0.7351663708686829, mean f1: 0.5187145471572876
Ejemplo 7227/7500.0 , Entrenamiento: Loss: 0.7225513458251953, mean f1: 0.5187206268310547
Ejemplo 7228/7500.0 , Entrenamiento: Loss: 0.7022421956062317, mean f1: 0.5187352895736694
Ejemplo 7229/7500.0 , Entrenamiento: Loss: 0.7040638327598572, mean f1: 0.5187586545944214
Ejemplo 7230/7500.0 , Entrenamiento: Loss: 0.6068998575210571, mean f1: 0.5187992453575134
Ejemplo 7231/7500.0 , Entrenamiento: Loss: 0.6109268069267273, mean f1: 0.518813967704773
Ejemplo 7232/7500.0 , Entrenamiento: Loss: 0.7627193927764893, mean f1: 0.5188286304473877
Ejemplo 7233/7500.0 , Entrenamiento: Loss: 0.8848233819007874, mean f1: 0.5188173651695251
Ejemplo 72

**<h1>Usar checkpoints</h1>**

In [None]:
# Load the checkpoint saved after epoch 1, remapping its tensors onto `device`.
# Change the file name to resume from a later epoch.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
loaded_checkpoint = torch.load('checkpoint_1.pth', map_location=device)

In [None]:
# Number of epochs already completed when the checkpoint was written.
last_epoch = loaded_checkpoint['epoch']
optimizer.load_state_dict(loaded_checkpoint['optimizer'])
# Restore the LR-scheduler state when the checkpoint carries one; the key is
# optional, so checkpoints without it still load.
if 'scheduler' in loaded_checkpoint:
  scheduler.load_state_dict(loaded_checkpoint['scheduler'])

In [None]:
# Restore the model weights stored under 'state_dict' in the checkpoint.
model.load_state_dict(loaded_checkpoint['state_dict'])

<All keys matched successfully>

In [None]:
# Export only the model weights (no optimizer state), tagging the file name with the checkpoint's epoch.
torch.save(loaded_checkpoint['state_dict'],f'MBert_weight{last_epoch}.pth')

In [None]:
# Resume training from the epoch after the loaded checkpoint up to EPOCHS,
# writing a checkpoint after every epoch.
for epoch in range(last_epoch, EPOCHS):
  print('Epoch {} de {}'.format(epoch+1, EPOCHS))
  print('------------------')
  train_acc, train_loss, train_f1 = train_model(
    model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train)
  )
  validation_acc, validation_loss, validation_f1 = eval_model(
    model, validation_data_loader, loss_fn, device, len(validation), 'Validación'
  )
  # Store the LR-scheduler state next to the model and optimizer so a later
  # resume can continue the same learning-rate decay instead of restarting it.
  checkpoint = {'epoch': epoch + 1, 'state_dict': model.state_dict(),
             'optimizer': optimizer.state_dict(),
             'scheduler': scheduler.state_dict()}
  torch.save(checkpoint, f'checkpoint_{epoch+1}.pth')
  print('Entrenamiento: Loss: {}, accuracy: {}, f1: {}'.format(train_loss, train_acc, train_f1))
  print('Validación: Loss: {}, accuracy: {}, f1: {}'.format(validation_loss, validation_acc, validation_f1))
  print('')

**<h1>Test</h1>**

In [None]:
# Score the current model once on the test split and print the aggregate metrics.
print('------------------')
test_acc, test_loss, test_f1 = eval_model(
    model=model,
    data_loader=test_data_loader,
    loss_fn=loss_fn,
    device=device,
    n_examples=len(test),
    modo='Test',
)
print('Test: Loss: {}, accuracy: {}, f1: {}'.format(test_loss, test_acc, test_f1))
print('')

------------------
Ejemplo 0/164 , Test: Loss: 0.7878628969192505, mean f1: 0.6875
Ejemplo 1/164 , Test: Loss: 0.8221923112869263, mean f1: 0.59375
Ejemplo 2/164 , Test: Loss: 0.6611753702163696, mean f1: 0.625
Ejemplo 3/164 , Test: Loss: 0.9754965901374817, mean f1: 0.609375
Ejemplo 4/164 , Test: Loss: 0.6439922451972961, mean f1: 0.6499999761581421
Ejemplo 5/164 , Test: Loss: 0.6246194243431091, mean f1: 0.65625
Ejemplo 6/164 , Test: Loss: 0.6603860855102539, mean f1: 0.6696428656578064
Ejemplo 7/164 , Test: Loss: 0.9931719303131104, mean f1: 0.65625
Ejemplo 8/164 , Test: Loss: 1.2937101125717163, mean f1: 0.6458333134651184
Ejemplo 9/164 , Test: Loss: 0.6007692217826843, mean f1: 0.643750011920929
Ejemplo 10/164 , Test: Loss: 0.6174858808517456, mean f1: 0.6590909361839294
Ejemplo 11/164 , Test: Loss: 0.975482702255249, mean f1: 0.6354166865348816
Ejemplo 12/164 , Test: Loss: 0.691209077835083, mean f1: 0.6346153616905212
Ejemplo 13/164 , Test: Loss: 0.4907473623752594, mean f1: 0.6

In [None]:
def clasificacion_sentimiento(review):
  """Print the sentiment class the model predicts for a single review.

  The review is encoded with the notebook-level ``tokenizer`` (padded /
  truncated to ``MAX_LEN``) and scored by the notebook-level ``model`` on
  ``device``.

  Args:
    review: Text of the review to classify.

  Returns:
    None. Prints one line: 'Sentimiento positivo' for class 2,
    'Sentimiento neutro' for class 1, 'Sentimiento negativo' for class 0.
  """
  encoding_review = tokenizer.encode_plus(
      review,
      max_length=MAX_LEN,
      truncation=True,
      add_special_tokens=True,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt'
    )

  input_ids = encoding_review['input_ids'].to(device)
  attention_mask = encoding_review['attention_mask'].to(device)

  # Inference must run in eval mode (otherwise dropout makes the prediction
  # random) and without building an autograd graph. The previous mode is
  # restored afterwards so an ongoing training session is not disturbed.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      output = model(input_ids, attention_mask)
  finally:
    model.train(was_training)

  _, prediction = torch.max(output, dim=1)
  mensajes = {
      2: 'Sentimiento positivo',
      1: 'Sentimiento neutro',
      0: 'Sentimiento negativo',
  }
  # A single review was encoded, so the batch holds exactly one prediction.
  clase = prediction.item()
  if clase in mensajes:
    print(mensajes[clase])

In [None]:
# Try the classifier on a single Spanish-language restaurant review.
clasificacion_sentimiento("El servicio es rápido, servicio invisible y clasificados como categoría un difícil equilibrio perfecto pero como debería ser. Como de 2012 Nov el lugar normalmente está lleno, pero debido a su (surprinsingly) tamaño grande (por lo que sigue siendo acogedor en todas sus diferentes salas de comedor) una pareja o un grupo de 4 puede conseguir con en más de 10 minutos de espera en cualquier momento.")