<a href="https://colab.research.google.com/github/caanpaip/GenAI-MBA/blob/master/Notebooks/03.BERT_Sentiment_Analyzer__generated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
!pip install loguru
# !pip install cloud-tpu-client==0.10 torch==1.13.0



In [None]:
%load_ext autoreload
%autoreload 2
import platform
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup #  AdamW,
from torch.optim import AdamW
from loguru import logger

# 01. Defining the device

In [None]:
try:
  ## TPU
  import torch_xla
  import torch_xla.core.xla_model as xm
  # device = xm.xla_device()
except:
  ## GPU ou CPU
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device


device(type='cuda')

# 02. Load Data

In [None]:
if platform.system().lower()=="linux":

  ## obtendo do google Drive
  from google.colab import drive
  drive.mount('/content/drive')

  sample_size = 10

  sample_1 = pd.read_parquet(f"/content/drive/MyDrive/MBA/sample_{sample_size}_Meli.parquet")
  print(sample_1.shape)
else:
  sample_1 = pd.read_parquet("./../Dados/sample_1_Meli.parquet")
  print(sample_1.shape)

Mounted at /content/drive
(9799, 20)


In [None]:
df = sample_1[['content', 'sentiment']]

# 03. Load model

In [None]:
# Definir o tokenizer e o modelo pré-treinado
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
model = BertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', num_labels=3)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 04. Prepare data to model

In [None]:
# Dividir o dataset em treino e teste
train_df, val_df = train_test_split(df, test_size=0.1, stratify=df['sentiment'], random_state=42)

In [None]:
# Criar uma classe Dataset personalizada
class SentimentDataset(Dataset):
    """ Custom dataset class for sentiment analysis.

    args:
        dataframe: pd.DataFrame, dataframe with 'content' and 'sentiment' columns
        tokenizer: transformers tokenizer object
        max_len: int, maximum length of the input text

    return:
        dictionary with 'review_text', 'input_ids', 'attention_mask', 'labels'

    """


    def __init__(self, dataframe, tokenizer, max_len):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        review = str(self.dataframe.iloc[idx, 0])
        label = self.dataframe.iloc[idx, 1]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
## To compute the maximum length of the tokens in the reviews

len_tokens = []

for review in sample_1['content']:

    tokens = tokenizer.tokenize(review)

    len_tokens.append(len(tokens))

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# Parâmetros
BATCH_SIZE = 16
MAX_LEN = max(len_tokens)
EPOCHS = 6
LEARNING_RATE = 2e-5

# Criar DataLoaders
train_dataset = SentimentDataset(train_df, tokenizer, MAX_LEN)
val_dataset = SentimentDataset(val_df, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Definir otimizador e scheduler
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Definir dispositivo
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# 05. Define train and validate Functions

In [None]:
# Função de treinamento
def train_epoch(model, data_loader, optimizer, device, scheduler):
    model.train()
    losses = []
    correct_predictions = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), sum(losses) / len(data_loader)


In [None]:
# Função de avaliação
def eval_model(model, data_loader, device):
    model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), sum(losses) / len(data_loader)

# 06. Train model

In [None]:
# Treinamento e avaliação
## Usando GPU com amostra de 1% demora 1 minuto por epoc e na CPU para este mesmo demora 17 minutos

for epoch in range(EPOCHS):
    logger.info(f'Epoch {epoch + 1}/{EPOCHS}')


    train_acc, train_loss = train_epoch(model, train_loader, optimizer, device, scheduler)
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(model, val_loader, device)
    print(f'Validation loss {val_loss} accuracy {val_acc}')
    print('-' * 10 + "\n")

# 07. Evalute the model

In [None]:
label_dict = {'positivo': 2, 'neutro': 1, 'negativo': 0}

In [None]:
# Avaliação final
y_review_texts = []
y_pred = []
y_true = []

model.eval()
with torch.no_grad():
    for batch in val_loader:
        texts = batch['review_text']
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

        y_review_texts.extend(texts)
        y_pred.extend(preds)
        y_true.extend(labels)


if torch.cuda.is_available():
  y_true = [tensor_label.cpu().numpy() for tensor_label in y_true]
  y_pred = [tensor_label.cpu().numpy() for tensor_label in y_pred]

print('Classification Report:')
print(classification_report(y_true, y_pred, target_names=label_dict.keys()))

Classification Report:
              precision    recall  f1-score   support

    positivo       0.67      0.62      0.64       321
      neutro       0.49      0.49      0.49       302
    negativo       0.73      0.78      0.76       357

    accuracy                           0.64       980
   macro avg       0.63      0.63      0.63       980
weighted avg       0.64      0.64      0.64       980



# 08. Salvando

In [None]:
torch.save(model.state_dict(), f'/content/drive/MyDrive/MBA/BERT_sentimentos_classifier__sample{sample_size}.pth')


# 09. Load train model

In [None]:
# Crie uma instância do modelo com a mesma arquitetura
model = BertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', num_labels=3)

# Carregue o estado do modelo salvo
model.load_state_dict(torch.load(f'/content/drive/MyDrive/MBA/BERT_sentimentos_classifier__sample{sample_size}.pth'))

# Mova o modelo para o dispositivo (CPU, GPU ou TPU)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
label_dict = {'positivo': 2, 'neutro': 1, 'negativo': 0}

## Load model: Score Eval

In [None]:
# Avaliação final
y_review_texts = []
y_pred = []
y_true = []

model.eval()
with torch.no_grad():
    for batch in val_loader:
        texts = batch['review_text']
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

        y_review_texts.extend(texts)
        y_pred.extend(preds)
        y_true.extend(labels)


if torch.cuda.is_available():
  y_true = [tensor_label.cpu().numpy() for tensor_label in y_true]
  y_pred = [tensor_label.cpu().numpy() for tensor_label in y_pred]

print('Classification Report:')
print(classification_report(y_true, y_pred, target_names=label_dict.keys()))

Classification Report:
              precision    recall  f1-score   support

    positivo       0.67      0.62      0.64       321
      neutro       0.49      0.49      0.49       302
    negativo       0.73      0.78      0.76       357

    accuracy                           0.64       980
   macro avg       0.63      0.63      0.63       980
weighted avg       0.64      0.64      0.64       980



## Load model: Score Train

In [None]:
# Avaliação Train
y_review_texts = []
y_pred = []
y_true = []

model.eval()
with torch.no_grad():
    for batch in train_loader:
        texts = batch['review_text']
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

        y_review_texts.extend(texts)
        y_pred.extend(preds)
        y_true.extend(labels)


if torch.cuda.is_available():
  y_true = [tensor_label.cpu().numpy() for tensor_label in y_true]
  y_pred = [tensor_label.cpu().numpy() for tensor_label in y_pred]

print('Classification Report:')
print(classification_report(y_true, y_pred, target_names=label_dict.keys()))

Classification Report:
              precision    recall  f1-score   support

    positivo       0.99      0.97      0.98      2887
      neutro       0.97      0.92      0.94      2723
    negativo       0.93      0.98      0.95      3209

    accuracy                           0.96      8819
   macro avg       0.96      0.96      0.96      8819
weighted avg       0.96      0.96      0.96      8819

