In [None]:
from numpy.distutils.command.install_headers import install_headers
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
# !pip install transformers datasets
!pip install scikit-learn matplotlib seaborn

# 📚 2. Imports

In [None]:
import pandas as pd
import numpy as np
import torch

from datasets import Dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from sklearn.metrics import classification_report, accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns


# 3. Verificar GPU

In [None]:
# Report the installed PyTorch version and which device it can see.
print("Versão do PyTorch:", torch.__version__)

has_cuda = torch.cuda.is_available()
print("CUDA disponível:", has_cuda)

# Only query the device name when CUDA is actually usable; otherwise report CPU.
if has_cuda:
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = "CPU"
print("Dispositivo:", device_label)


# 📁 4. Dataset

In [None]:
# The code below reads two columns from the CSV: `tweet_text` (the text to classify)
# and `sentiment` (one of the keys of `label_map`).
df = pd.read_csv("../DATA/sentiment_train.csv")

# String sentiment -> integer class id.
label_map = {'POSITIVO': 2, 'NEUTRO': 1, 'NEGATIVO': 0}

# Keep only rows whose *sentiment* is a known class. (Filtering on `tweet_text`, as the
# previous version did, discards every row whose text is not literally a class name.)
# `.copy()` makes the filtered frame independent so adding `label` does not write into a slice.
df = df[df['sentiment'].isin(label_map.keys())].copy()
df['label'] = df['sentiment'].map(label_map)
df = df[['tweet_text', 'label']].dropna().reset_index(drop=True)

# Fail early with a clear message instead of crashing later inside tokenization/training.
assert not df.empty, "no rows left after filtering on `sentiment`; check the CSV columns and values"

# Data split: 80% train, then the remaining 20% is halved into validation and test.
train_df = df.sample(frac=0.8, random_state=42)
temp_df = df.drop(train_df.index)
val_df = temp_df.sample(frac=0.5, random_state=42)
test_df = temp_df.drop(val_df.index)


# 🔤 5. Tokenizer

In [None]:
# Hub checkpoint whose vocabulary is used to tokenize the tweets.
TOKENIZER_CHECKPOINT = "neuralmind/bert-base-portuguese-cased"
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def tokenize(batch):
    """Tokenize one batch of tweets with the module-level `tokenizer`.

    Args:
        batch: mapping of column name -> list of values, as supplied by
            `Dataset.map(..., batched=True)`. Must contain the "tweet_text" column.

    Returns:
        The tokenizer's output for the batch. Sequences are truncated but not padded
        here; padding is left to the `DataCollatorWithPadding` created in this cell.
    """
    # The frames turned into datasets only carry "tweet_text" and "label";
    # there is no "texto" column, so reading it raised KeyError inside `map`.
    return tokenizer(batch["tweet_text"], truncation=True)

# Wrap each pandas split in a Hugging Face Dataset, keyed by split name.
split_frames = {"train": train_df, "validation": val_df, "test": test_df}
dataset_dict = {
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in split_frames.items()
}

# Apply `tokenize` to every split, in batches.
tokenized = dict(
    (split_name, split_ds.map(tokenize, batched=True))
    for split_name, split_ds in dataset_dict.items()
)

# Collator built from the same tokenizer; it is handed to the Trainer later on.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# 🧠 6. Modelo Base

In [None]:
# Load the pretrained checkpoint configured for 3 output labels.
model = BertForSequenceClassification.from_pretrained(
    "neuralmind/bert-base-portuguese-cased",
    num_labels=3,
)

# Move the model to the GPU when one is available, otherwise keep it on the CPU.
target_device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(target_device)


# ⚙️ 7. Argumentos de Treinamento

In [None]:
from transformers import TrainingArguments

# Training hyperparameters gathered in one mapping and expanded into TrainingArguments.
# NOTE(review): no evaluation strategy is passed here (it was left commented out), so the
# library default applies -- confirm whether per-epoch evaluation is wanted.
training_config = dict(
    output_dir="./results",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
)
my_args = TrainingArguments(**training_config)


# 🧮 8. Métricas

In [None]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        pred: object exposing `predictions` (per-class scores; the predicted class is
            the argmax over axis 1) and `label_ids` (the reference class ids).

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_hat = np.argmax(pred.predictions, axis=1)

    scores = {}
    scores["accuracy"] = accuracy_score(y_true, y_hat)
    scores["f1"] = f1_score(y_true, y_hat, average="macro")
    return scores


# 🚀 9. Treinamento

In [None]:
# Everything the Trainer needs: model, hyperparameters, data splits, batching and metrics.
trainer_kwargs = {
    "model": model,
    "args": my_args,
    "train_dataset": tokenized["train"],
    "eval_dataset": tokenized["validation"],
    "tokenizer": tokenizer,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning; as the last expression, the training summary is displayed by the notebook.
trainer.train()


# ✅ 10. Avaliação Final

In [None]:
# Run the fine-tuned model on the held-out test split.
predictions = trainer.predict(tokenized["test"])
preds = np.argmax(predictions.predictions, axis=1)

# Class ids in the order of the display names below (0 = Negativo, 1 = Neutro, 2 = Positivo).
# Passing `labels` explicitly keeps `target_names` aligned even when a class is absent from
# the test split or never predicted; without it the report errors out or mislabels rows.
class_ids = [0, 1, 2]

# `predictions.label_ids` holds the reference labels in the same order as the predictions.
print(classification_report(
    predictions.label_ids,
    preds,
    labels=class_ids,
    target_names=['Negativo', 'Neutro', 'Positivo'],
    zero_division=0,  # report 0 instead of warning for classes with no samples/predictions
))


# 📊 11. Matriz de Confusão

In [None]:
# Confusion matrix over the three class ids (0 = Neg, 1 = Neu, 2 = Pos).
# It is kept as a DataFrame: torch.tensor() cannot be built from a DataFrame, and seaborn
# plots a DataFrame directly. Reindexing pins the matrix to 3x3 so the fixed tick labels
# stay aligned even if a class is missing from the true labels or from the predictions.
class_ids = [0, 1, 2]
cm = pd.crosstab(
    pd.Series(predictions.label_ids, name='Real'),
    pd.Series(preds, name='Predito'),
).reindex(index=class_ids, columns=class_ids, fill_value=0)

fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="YlGnBu",
    xticklabels=['Neg', 'Neu', 'Pos'],
    yticklabels=['Neg', 'Neu', 'Pos'],
    ax=ax,
)
ax.set_xlabel("Predito")
ax.set_ylabel("Real")
ax.set_title("Matriz de Confusão")
plt.show()


# 🧾 12. Conclusão
* Modelo BERTimbau ajustado com sucesso.

* Treinamento usando CUDA 12.8 e RTX 3060.

* Próximos passos: análise de erros, ajuste de hiperparâmetros e comparação com baseline.

