## 1 - Instalação e limpeza

In [1]:
# Desinstala bibliotecas relacionadas ao CUDA para evitar conflitos
!pip uninstall -y cudf-cu12 pylibcudf-cu12 dask-cudf-cu12 || true
# Limpa o cache do pip
!pip cache purge || true
# Remove diretórios de cache do pip e arquivos temporários
!rm -rf /root/.cache/pip /tmp/pip-install-* || true

# Instala as bibliotecas necessárias para o projeto
!pip install -q "transformers>=4.30" "datasets>=2.13" "evaluate>=0.4" "accelerate" "wandb" "sentencepiece" "huggingface_hub<1.0,>=0.34.0"

Found existing installation: cudf-cu12 25.6.0
Uninstalling cudf-cu12-25.6.0:
  Successfully uninstalled cudf-cu12-25.6.0
Found existing installation: pylibcudf-cu12 25.6.0
Uninstalling pylibcudf-cu12-25.6.0:
  Successfully uninstalled pylibcudf-cu12-25.6.0
Found existing installation: dask-cudf-cu12 25.6.0
Uninstalling dask-cudf-cu12-25.6.0:
  Successfully uninstalled dask-cudf-cu12-25.6.0
[0mFiles removed: 0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h

## 2 - Configurações globais

In [3]:
import os
import random
import numpy as np
import torch

# Configs
LOCAL_DIR = "/content/data_hate_speech_portuguese"
DATASET_DIVIDED_DIR = os.path.join(LOCAL_DIR, "dataset_dividido")
csv_url = "https://raw.githubusercontent.com/paulafortuna/Portuguese-Hate-Speech-Dataset/master/2019-05-28_portuguese_hate_speech_binary_classification.csv"
SEED = 42

# Nome do modelo pré-treinado a ser utilizado
MODEL_NAME = "adalbertojunior/distilbert-portuguese-cased"

# Hiperparâmetros (ajuste conforme o runtime e hardware disponíveis)
NUM_EPOCHS = 3 # Número de épocas de treinamento
PER_DEVICE_TRAIN_BATCH_SIZE = 8   # Tamanho do batch para treinamento por dispositivo (reduzir se houver erro de memória - 4 ou 2)
PER_DEVICE_EVAL_BATCH_SIZE = 32 # Tamanho do batch para avaliação por dispositivo
LEARNING_RATE = 2e-5 # Taxa de aprendizado
WEIGHT_DECAY = 0.01 # Decaimento de peso para regularização
FP16 = True if torch.cuda.is_available() else False # Usa mixed precision se GPU estiver disponível

# Configurações do Weights & Biases (W&B) para rastreamento de experimentos
USE_WANDB = True   # Define como False para desativar completamente o W&B
PROJECT_NAME = "bert-portuguese-hatespeech" # Nome do projeto no W&B
BASELINE_RUN_NAME = "baseline-distilbert-portuguese" # Nome da execução para o baseline no W&B
RUN_NAME = "finetune-distilbert-portuguese" # Nome da execução para o fine-tuning no W&B

# Define a semente para garantir a reprodutibilidade
from transformers import set_seed
set_seed(SEED)
# Verifica e imprime o dispositivo utilizado (GPU ou CPU)
print("Device:", "cuda" if torch.cuda.is_available() else "cpu")

Device: cpu


## 3 - Carregamento, normalizar label e divisão do Dataset

In [4]:
# Preparação simples e direta para o CSV que você descreveu
import os
from datasets import load_dataset, DatasetDict, ClassLabel



os.makedirs(LOCAL_DIR, exist_ok=True)
local_file = os.path.join(LOCAL_DIR, os.path.basename(csv_url))

# 1) Baixa CSV se necessário (se já tiver ds_full carregado, ignore essa parte)
if not os.path.exists(local_file):
    import requests
    print("Baixando CSV...")
    r = requests.get(csv_url, timeout=60)
    r.raise_for_status()
    open(local_file, "wb").write(r.content)
    print("Salvo em", local_file)
else:
    print("Arquivo já existe:", local_file)

# 2) Carrega dataset (ou use o ds_full se já estiver na memória)
ds_full = load_dataset("csv", data_files=local_file, split="train")
print("Colunas:", ds_full.column_names)
print("Exemplo:", ds_full[0])

# 3) Mapear 'hatespeech_comb' para strings 'no-hate' / 'hate'
# (assume que coluna existe e contém 0,1 ou 1.0)
def map_label(ex):
    v = ex["hatespeech_comb"]
    # tratar floats / strings também
    try:
        vi = int(float(v))
    except Exception:
        vi = 1 if (isinstance(v, str) and "1" in v) else 0
    ex["label"] = "hate" if vi == 1 else "no-hate"
    return ex

ds_full = ds_full.map(map_label)

# 4) Mantemos 'text' como coluna de texto (já existe no CSV)
# 5) Cast para ClassLabel (ordem: no-hate, hate)
ds_full = ds_full.cast_column("label", ClassLabel(names=["no-hate", "hate"]))

print("Label feature:", ds_full.features["label"])
print("Exemplo pós-mapeamento:", ds_full[0])

# 6) Splits estratificados 80/10/10
train_test = ds_full.train_test_split(test_size=0.2, seed=SEED, stratify_by_column="label")
train_val = train_test["train"].train_test_split(test_size=0.2, seed=SEED, stratify_by_column="label")
dataset_dividido = DatasetDict({
    "train": train_val["train"],
    "val": train_val["test"],
    "test": train_test["test"]
})
print("Tamanhos:", {k: len(dataset_dividido[k]) for k in dataset_dividido})

# 7) Salva em disco para reuso (tokenização etc.)
os.makedirs(DATASET_DIVIDED_DIR, exist_ok=True)
dataset_dividido.save_to_disk(DATASET_DIVIDED_DIR)
print("Dataset dividido salvo em:", DATASET_DIVIDED_DIR)


Baixando CSV...
Salvo em /content/data_hate_speech_portuguese/2019-05-28_portuguese_hate_speech_binary_classification.csv


Generating train split: 0 examples [00:00, ? examples/s]

Colunas: ['text', 'hatespeech_comb', 'hatespeech_G1', 'annotator_G1', 'hatespeech_G2', 'annotator_G2', 'hatespeech_G3', 'annotator_G3']
Exemplo: {'text': '@__andrea__b \nO cara vive em outro mundo\nNão no mundo real\nREFUGIADOS são os que vivem\nNas favelas vizinhas as suas fortalezas', 'hatespeech_comb': 1, 'hatespeech_G1': 1, 'annotator_G1': 'A', 'hatespeech_G2': 1.0, 'annotator_G2': 'V', 'hatespeech_G3': 0, 'annotator_G3': 'E'}


Map:   0%|          | 0/5670 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5670 [00:00<?, ? examples/s]

Label feature: ClassLabel(names=['no-hate', 'hate'])
Exemplo pós-mapeamento: {'text': '@__andrea__b \nO cara vive em outro mundo\nNão no mundo real\nREFUGIADOS são os que vivem\nNas favelas vizinhas as suas fortalezas', 'hatespeech_comb': 1, 'hatespeech_G1': 1, 'annotator_G1': 'A', 'hatespeech_G2': 1.0, 'annotator_G2': 'V', 'hatespeech_G3': 0, 'annotator_G3': 'E', 'label': 1}
Tamanhos: {'train': 3628, 'val': 908, 'test': 1134}


Saving the dataset (0/1 shards):   0%|          | 0/3628 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/908 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1134 [00:00<?, ? examples/s]

Dataset dividido salvo em: /content/data_hate_speech_portuguese/dataset_dividido


## 4 - Tokenização (train/val/test)

In [5]:
# Tokenização enxuta para train/validation/test (assume coluna "text" e coluna "label")
from datasets import load_from_disk
from transformers import AutoTokenizer

# Carrega dataset dividido
ds = load_from_disk(DATASET_DIVIDED_DIR)

# Normaliza chave "val" → "validation" (caso exista)
if "val" in ds and "validation" not in ds:
    ds["validation"] = ds["val"]

# Usa a coluna 'text' (já existe no seu CSV)
text_col = "text"

# Carrega tokenizador
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
print("Tokenizer:", tokenizer.__class__.__name__)

# Função de tokenização (sem padding — o DataCollator fará padding dinamicamente)
def preprocess(batch):
    return tokenizer(batch[text_col], truncation=True, padding=False, max_length=128)

# Tokeniza cada split e remove colunas originais
tokenized_train = ds["train"].map(preprocess, batched=True, remove_columns=ds["train"].column_names)
tokenized_val   = ds["validation"].map(preprocess, batched=True, remove_columns=ds["validation"].column_names)
tokenized_test  = ds["test"].map(preprocess, batched=True, remove_columns=ds["test"].column_names)

# Adiciona coluna 'labels' esperada pelo Trainer
tokenized_train = tokenized_train.add_column("labels", ds["train"]["label"])
tokenized_val   = tokenized_val.add_column("labels",   ds["validation"]["label"])
tokenized_test  = tokenized_test.add_column("labels",  ds["test"]["label"])

print("Tokenização concluída. Exemplos:")
print(tokenized_train[0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/513 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Tokenizer: BertTokenizerFast


Map:   0%|          | 0/3628 [00:00<?, ? examples/s]

Map:   0%|          | 0/908 [00:00<?, ? examples/s]

Map:   0%|          | 0/1134 [00:00<?, ? examples/s]

Tokenização concluída. Exemplos:
{'input_ids': [101, 257, 22321, 137, 4127, 2308, 3471, 22278, 168, 174, 22292, 22295, 131, 533, 6647, 470, 407, 1151, 319, 2924, 123, 4720, 125, 20467, 14093, 119, 787, 978, 230, 3138, 3002, 8497, 22278, 125, 10161, 170, 123, 13112, 119, 119, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': 1}


## 5: Baseline: predizer só val com o modelo indicado e calcular F1 (evaluate) — e log no W&B

In [6]:
# Baseline simples: predição em validation + F1 (macro, weighted); opcional W&B
import numpy as np, json, os, torch
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
import evaluate

device = "cuda" if torch.cuda.is_available() else "cpu"

# 1) Detectar num_labels de forma simples
num_labels = None
feat = tokenized_train.features.get("labels")
if getattr(feat, "num_classes", None) is not None:
    num_labels = feat.num_classes
else:
    try:
        num_labels = len(set(tokenized_train["labels"]))
    except Exception:
        num_labels = 2
print("num_labels =", num_labels)

# 2) Carregar modelo (cria cabeça nova se necessário) e mover p/ device
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
model.to(device)

# 3) Preparar Trainer apenas para predição
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
args = TrainingArguments(output_dir="tmp_pred", per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE, report_to=[])
trainer = Trainer(model=model, args=args, tokenizer=tokenizer, data_collator=data_collator)

# 4) Fazer predição na validation
print("Predicting on validation...")
pred_out = trainer.predict(tokenized_val)
logits = pred_out.predictions
if logits is None:
    raise RuntimeError("No logits returned by prediction.")

# robustez: suportar tuplas / várias formas
if isinstance(logits, tuple):
    logits = logits[0]
logits = np.asarray(logits)
if logits.ndim == 1:
    preds = logits.astype(int)
else:
    preds = np.argmax(logits, axis=-1)

labels = pred_out.label_ids

# 5) Calcular F1s com evaluate
f1 = evaluate.load("f1")
f1_macro = f1.compute(references=labels, predictions=preds, average="macro")["f1"]
f1_weighted = f1.compute(references=labels, predictions=preds, average="weighted")["f1"]
print(f"Baseline F1 macro: {f1_macro:.6f}  |  F1 weighted: {f1_weighted:.6f}")

# 6) Salvar resultado localmente
baseline = {"model": MODEL_NAME, "f1_macro": float(f1_macro), "f1_weighted": float(f1_weighted)}
with open("baseline_val_results.json", "w") as f:
    json.dump(baseline, f, indent=2)
print("Saved baseline_val_results.json")

# 7) Opcional: log no W&B (se habilitado e com API key)
if USE_WANDB:
    if not os.environ.get("WANDB_API_KEY"):
        key = input("Cole sua WANDB API KEY (ou Enter para pular): ").strip()
        if key:
            os.environ["WANDB_API_KEY"] = key

    if os.environ.get("WANDB_API_KEY"):
        import wandb, pandas as pd
        from sklearn.metrics import confusion_matrix
        import matplotlib.pyplot as plt

        run = wandb.init(project=PROJECT_NAME, name=BASELINE_RUN_NAME, reinit=True)
        run.log({"baseline/f1_macro": float(f1_macro), "baseline/f1_weighted": float(f1_weighted)})

        # amostra de predições para inspeção
        sample_n = min(200, len(tokenized_val))
        texts = [ds["validation"][i][text_col] for i in range(sample_n)]
        df = pd.DataFrame({"text": texts, "label": labels[:sample_n].tolist(), "pred": preds[:sample_n].tolist()})
        run.log({"baseline/sample_predictions": wandb.Table(dataframe=df)})

        # matriz de confusão (plot simples)
        try:
            cm = confusion_matrix(labels, preds)
            fig, ax = plt.subplots(figsize=(4,4))
            ax.imshow(cm)
            ax.set_xlabel("pred"); ax.set_ylabel("true")
            for (j,i),val in np.ndenumerate(cm):
                ax.text(i,j,val,ha='center',va='center')
            run.log({"baseline/confusion_matrix": wandb.Image(fig)})
            plt.close(fig)
        except Exception as e:
            print("Confusion matrix failed:", e)

        run.finish()
        print("Logged baseline to W&B.")
    else:
        print("W&B API key not set — skipped logging.")
else:
    print("W&B disabled.")


num_labels = 2


model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at adalbertojunior/distilbert-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args, tokenizer=tokenizer, data_collator=data_collator)


Predicting on validation...


Downloading builder script: 0.00B [00:00, ?B/s]

Baseline F1 macro: 0.371382  |  F1 weighted: 0.332698
Saved baseline_val_results.json
Cole sua WANDB API KEY (ou Enter para pular): dfad39380e440f12090faccab19a6dec310fd1cb


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Currently logged in as: [33mahmiura[0m ([33mahmiura-tutoria[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


0,1
baseline/f1_macro,▁
baseline/f1_weighted,▁

0,1
baseline/f1_macro,0.37138
baseline/f1_weighted,0.3327


Logged baseline to W&B.


## 6 - Fine-tuning com Trainer (W&B logging automático)

In [7]:
# Fine-tuning enxuto e com ajustes de desempenho
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
)
import numpy as np, os, torch
from sklearn.metrics import f1_score, accuracy_score

# opcional: grad accumulation (ajuste se quiser simular batch maior)
GRADIENT_ACCUMULATION_STEPS = globals().get("GRADIENT_ACCUMULATION_STEPS", 1)

# 1) detectar num_labels (simples e robusto)
feat = tokenized_train.features.get("labels")
if getattr(feat, "num_classes", None) is not None:
    num_labels = feat.num_classes
else:
    try:
        num_labels = len(set(tokenized_train["labels"]))
    except Exception:
        num_labels = 2
print("num_labels =", num_labels)

# 2) carregar modelo (já cria cabeça nova se necessário) e mover para device
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
model.to(device)

# 3) TrainingArguments — compactos e com opções de desempenho
training_args = TrainingArguments(
    output_dir="runs/fine_tune",
    eval_strategy="epoch",                     # avaliar por época
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    dataloader_num_workers=4,                  # melhora throughput na CPU/IO
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    learning_rate=LEARNING_RATE,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    fp16=FP16 and torch.cuda.is_available(),   # use mixed precision se disponível
    report_to=["wandb"] if (USE_WANDB and os.environ.get("WANDB_API_KEY")) else [],
    run_name=RUN_NAME,
    save_total_limit=3,
)

# compute_metrics usando Hugging Face evaluate (F1)
import evaluate, numpy as np
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)
    preds = np.argmax(logits, axis=-1) if logits.ndim > 1 else logits.astype(int)
    f1_macro = f1_metric.compute(references=labels, predictions=preds, average="macro")["f1"]
    f1_weighted = f1_metric.compute(references=labels, predictions=preds, average="weighted")["f1"]
    acc = (preds == labels).mean()
    return {"f1_macro": float(f1_macro), "f1_weighted": float(f1_weighted), "accuracy": float(acc)}


# 5) Trainer (data collator faz padding dinâmico)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# 6) opcional: iniciar W&B manualmente para incluir config (Trainer também loga)
if USE_WANDB and os.environ.get("WANDB_API_KEY"):
    import wandb
    wandb.init(project=PROJECT_NAME, name=RUN_NAME, config={
        "model_name": MODEL_NAME,
        "epochs": NUM_EPOCHS,
        "train_batch": PER_DEVICE_TRAIN_BATCH_SIZE,
        "eval_batch": PER_DEVICE_EVAL_BATCH_SIZE,
        "grad_accum": GRADIENT_ACCUMULATION_STEPS,
        "lr": LEARNING_RATE
    }, reinit=True)
    print("W&B run started.")

# 7) treinar
train_result = trainer.train()
print("Train finished. Metrics:", train_result.metrics)

# 8) salvar modelo/tokenizer
trainer.save_model("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")
print("Saved fine-tuned model to fine_tuned_model")

# 9) fechar wandb se usado
if USE_WANDB and os.environ.get("WANDB_API_KEY"):
    wandb.finish()
    print("W&B run finished.")


num_labels = 2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at adalbertojunior/distilbert-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


W&B run started.




Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted,Accuracy
1,0.5225,0.491002,0.729441,0.765387,0.764317
2,0.3477,0.572828,0.721508,0.761912,0.764317
3,0.3518,0.691707,0.711233,0.755599,0.761013




Train finished. Metrics: {'train_runtime': 4663.6971, 'train_samples_per_second': 2.334, 'train_steps_per_second': 0.292, 'total_flos': 163350930770784.0, 'train_loss': 0.41069862075839275, 'epoch': 3.0}
Saved fine-tuned model to fine_tuned_model


0,1
eval/accuracy,██▁
eval/f1_macro,█▅▁
eval/f1_weighted,█▆▁
eval/loss,▁▄█
eval/runtime,▂█▁
eval/samples_per_second,▇▁█
eval/steps_per_second,▇▁█
train/epoch,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇████
train/grad_norm,▁▁▃▃▂▂▂▃▂▂▃▄▂▃▄▅▂▄▃▃▃▂▃█▁▃▅

0,1
eval/accuracy,0.76101
eval/f1_macro,0.71123
eval/f1_weighted,0.7556
eval/loss,0.69171
eval/runtime,114.0794
eval/samples_per_second,7.959
eval/steps_per_second,0.254
total_flos,163350930770784.0
train/epoch,3
train/global_step,1362


W&B run finished.


## 7 - Avaliação final do fine-tuned model (val & test) + salvar e log no W&B como artifact

In [8]:
# Avaliação final (val + test) + salvar JSON + log & artifact no W&B (se habilitado)
import os, json, numpy as np, torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import evaluate

device = "cuda" if torch.cuda.is_available() else "cpu"

# 1) Carrega o modelo fine-tuned salvo
model_eval = AutoModelForSequenceClassification.from_pretrained("fine_tuned_model")
model_eval.to(device)

# 2) Trainer para avaliação (report_to=[] evita duplicar logs)
eval_args = TrainingArguments(
    output_dir="tmp_eval",
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    dataloader_num_workers=4,
    report_to=[],
)
trainer_eval = Trainer(
    model=model_eval,
    args=eval_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
)

# Função utilitária para extrair preds de forma robusta
def extract_preds(pred_out):
    logits = pred_out.predictions
    if logits is None:
        raise RuntimeError("No logits returned by prediction.")
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)
    if logits.ndim == 1:
        preds = logits.astype(int)
    else:
        preds = np.argmax(logits, axis=-1)
    labels = pred_out.label_ids
    return preds, labels

# 3) Prever/evaluar validation
print("Predicting validation...")
pred_out_val = trainer_eval.predict(tokenized_val)
preds_val, labels_val = extract_preds(pred_out_val)

# 4) Prever/evaluar test
print("Predicting test...")
pred_out_test = trainer_eval.predict(tokenized_test)
preds_test, labels_test = extract_preds(pred_out_test)

# 5) Calcular F1 com evaluate (conforme enunciado)
f1 = evaluate.load("f1")
f1_macro_val = f1.compute(references=labels_val, predictions=preds_val, average="macro")["f1"]
f1_weighted_val = f1.compute(references=labels_val, predictions=preds_val, average="weighted")["f1"]
f1_macro_test = f1.compute(references=labels_test, predictions=preds_test, average="macro")["f1"]
f1_weighted_test = f1.compute(references=labels_test, predictions=preds_test, average="weighted")["f1"]

results_ft = {
    "model": "fine_tuned_model",
    "f1_macro_val": float(f1_macro_val),
    "f1_weighted_val": float(f1_weighted_val),
    "f1_macro_test": float(f1_macro_test),
    "f1_weighted_test": float(f1_weighted_test),
}

print("Fine-tuned results:", results_ft)

# 6) Salva resultados localmente
with open("fine_tuned_results.json", "w") as f:
    json.dump(results_ft, f, indent=2)
print("Saved fine_tuned_results.json")

# 7) Log + artifact no W&B (opcional)
if USE_WANDB:
    if not os.environ.get("WANDB_API_KEY"):
        key = input("Cole sua WANDB API KEY (ou Enter para pular): ").strip()
        if key:
            os.environ["WANDB_API_KEY"] = key

    if os.environ.get("WANDB_API_KEY"):
        import wandb, pandas as pd
        from wandb import Artifact
        run = wandb.init(project=PROJECT_NAME, name=RUN_NAME + "-final", reinit=True)
        # log métricas
        run.log({
            "ft/f1_macro_val": results_ft["f1_macro_val"],
            "ft/f1_weighted_val": results_ft["f1_weighted_val"],
            "ft/f1_macro_test": results_ft["f1_macro_test"],
            "ft/f1_weighted_test": results_ft["f1_weighted_test"],
        })

        # sample table (test)
        sample_n = min(200, len(tokenized_test))
        texts = [ds["test"][i][text_col] for i in range(sample_n)]
        df = pd.DataFrame({"text": texts, "label": labels_test[:sample_n].tolist(), "pred": preds_test[:sample_n].tolist()})
        run.log({"ft/sample_predictions_test": wandb.Table(dataframe=df)})

        # artifact do modelo (diretório salvo)
        artifact = Artifact(name="distilbert-hatespeech-finetuned", type="model", metadata={"base_model": MODEL_NAME})
        artifact.add_dir("fine_tuned_model")
        run.log_artifact(artifact)

        run.finish()
        print("Fine-tuned metrics and artifact logged to W&B.")
    else:
        print("W&B key not set — skipped W&B logging.")
else:
    print("W&B disabled; results saved locally.")


Predicting validation...


  trainer_eval = Trainer(


Predicting test...


Fine-tuned results: {'model': 'fine_tuned_model', 'f1_macro_val': 0.7294413162013289, 'f1_weighted_val': 0.765386992408624, 'f1_macro_test': 0.7336612734659238, 'f1_weighted_test': 0.7661262997842555}
Saved fine_tuned_results.json


[34m[1mwandb[0m: Adding directory to artifact (fine_tuned_model)... Done. 7.2s


0,1
ft/f1_macro_test,▁
ft/f1_macro_val,▁
ft/f1_weighted_test,▁
ft/f1_weighted_val,▁

0,1
ft/f1_macro_test,0.73366
ft/f1_macro_val,0.72944
ft/f1_weighted_test,0.76613
ft/f1_weighted_val,0.76539


Fine-tuned metrics and artifact logged to W&B.


## 8 - Comparação final e resumo

In [9]:
# Compara os resultados da linha de base com os resultados do modelo fine-tuned usando o F1 macro no conjunto de validação.
import json, os

# Verifica se o arquivo de resultados da linha de base existe
if os.path.exists("baseline_val_results.json"):
    # Carrega os resultados da linha de base
    baseline = json.load(open("baseline_val_results.json"))
    # Carrega os resultados do modelo fine-tuned
    ft = json.load(open("fine_tuned_results.json"))
    # Imprime o F1 macro de validação para ambos os modelos
    print("Baseline val f1_macro:", baseline["f1_macro"])
    print("Fine-tuned val f1_macro:", ft["f1_macro_val"])
    # Determina qual modelo teve melhor desempenho no F1 macro de validação
    better = "fine-tuned" if ft["f1_macro_val"] > baseline["f1_macro"] else "baseline"
    print("Best on validation (F1 macro):", better)
    # Cria um dicionário de resumo da comparação
    summary = {"baseline": baseline, "fine_tuned": ft, "best": better}
    # Salva o resumo da comparação em um arquivo JSON
    with open("compare_summary.json","w") as f:
        json.dump(summary,f,indent=2)
    print("Compare summary saved to compare_summary.json")
else:
    # Mensagem se o arquivo de resultados da linha de base não for encontrado
    print("Baseline results not found. Run Célula 5 first.")

Baseline val f1_macro: 0.37138174020754355
Fine-tuned val f1_macro: 0.7294413162013289
Best on validation (F1 macro): fine-tuned
Compare summary saved to compare_summary.json


## 9 -Checagem final (arquivos gerados / labels)

In [10]:
# Realiza uma checagem final para verificar a existência dos arquivos de resultados gerados e exibe os nomes das classes (labels).
import os, json

# Lista os nomes dos arquivos de resultados esperados
for fname in ["baseline_val_results.json","fine_tuned_results.json","compare_summary.json"]:
    # Verifica a existência de cada arquivo e imprime o status
    print(fname, "->", "exists" if os.path.exists(fname) else "MISSING")
# Se o arquivo de resultados da linha de base existir, carrega e imprime seu conteúdo
if os.path.exists("baseline_val_results.json"):
    print("Baseline:", json.load(open("baseline_val_results.json")))
# Se o arquivo de resultados do modelo fine-tuned existir, carrega e imprime seu conteúdo
if os.path.exists("fine_tuned_results.json"):
    print("Fine-tuned:", json.load(open("fine_tuned_results.json")))
# Se o arquivo de resumo da comparação existir, carrega e imprime seu conteúdo
if os.path.exists("compare_summary.json"):
    print("Compare summary:", json.load(open("compare_summary.json")))

# Tenta imprimir os nomes das classes (labels) definidos no dataset tokenizado
try:
    print("Label names:", tokenized_train.features["labels"].feature.names)
except Exception:
    # Ignora se não for possível acessar os nomes das classes
    pass

baseline_val_results.json -> exists
fine_tuned_results.json -> exists
compare_summary.json -> exists
Baseline: {'model': 'adalbertojunior/distilbert-portuguese-cased', 'f1_macro': 0.37138174020754355, 'f1_weighted': 0.3326977272866276}
Fine-tuned: {'model': 'fine_tuned_model', 'f1_macro_val': 0.7294413162013289, 'f1_weighted_val': 0.765386992408624, 'f1_macro_test': 0.7336612734659238, 'f1_weighted_test': 0.7661262997842555}
Compare summary: {'baseline': {'model': 'adalbertojunior/distilbert-portuguese-cased', 'f1_macro': 0.37138174020754355, 'f1_weighted': 0.3326977272866276}, 'fine_tuned': {'model': 'fine_tuned_model', 'f1_macro_val': 0.7294413162013289, 'f1_weighted_val': 0.765386992408624, 'f1_macro_test': 0.7336612734659238, 'f1_weighted_test': 0.7661262997842555}, 'best': 'fine-tuned'}
