
# 5 — Transformer Fine-tuning (IMDB, clasificación de texto)
**Portfolio ML — Ejercicio 5 (Transformers — NLP)**  
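Fine-tuning de un modelo Transformer (DistilBERT por defecto) para **clasificación binaria** de sentimiento (negativo/positivo) en **IMDB**.  
**Requisitos:** el notebook usa PyTorch (`torch`) junto con Hugging Face `transformers` y `datasets`; si solo hay TensorFlow instalado, las celdas de modelo fallan al importar.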


In [1]:

# --- Standard library and third-party imports --------------------------------
import os
import pathlib
import sys
import time

import numpy as np
from sklearn.metrics import f1_score, accuracy_score

# --- Make the project modules importable BEFORE importing them ---------------
# The first candidate directory that contains utils.py is placed at the front
# of sys.path. Relative candidates are resolved against the current working
# directory, so the result depends on where the kernel was started.
CANDIDATE_DIRS = ["../src", ".", "/mnt/data"]
for candidate_dir in CANDIDATE_DIRS:
    candidate_path = pathlib.Path(candidate_dir).resolve()
    if (candidate_path / "utils.py").exists():
        sys.path.insert(0, str(candidate_path))
        print(f"✔ Using utils from: {candidate_path}")
        break

# Keep the original fallback entry so behaviour is unchanged when no candidate
# directory matched.
sys.path.append('../src')

# --- Project-local imports (need the sys.path entries set up above) ----------
# NOTE(review): a recorded run of this cell failed inside these imports because
# only TensorFlow/Keras 3 was installed. Later cells use PyTorch directly
# (`import torch`), so a PyTorch install is presumably required — confirm the
# environment before running.
from transformer_model import create_transformer
from utils import save_training_results, save_summary_csv, plot_training_history, plot_confusion_matrix
from transformer_data import load_imdb_splits, prepare_tokenized_datasets
from transformer_train import HfTrainConfig, train_and_eval

# --- Run configuration -------------------------------------------------------
# These two names are passed to the utils helpers in later cells.
NOTEBOOK_NAME = "5_transformer_finetune"
MODEL_NAME = "distilbert_imdb_finetune"
os.makedirs("results", exist_ok=True)


ValueError: Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

## 1) Datos — IMDB de Hugging Face Datasets

In [1]:

# %% [data]
# Load the dataset splits plus the class names; the validation fraction and
# the seed are passed straight to the project helper.
raw_datasets, class_names = load_imdb_splits(test_size_val=0.1, seed=42)

# Show the summary of each split, one per line.
print(
    raw_datasets["train"],
    raw_datasets["validation"],
    raw_datasets["test"],
    sep="\n",
)
print("Clases:", class_names)


NameError: name 'load_imdb_splits' is not defined

## 2) Tokenizador y Modelo — DistilBERT (o el que prefieras)

In [17]:
# Build the tokenizer and the model for the chosen checkpoint, requesting
# two output labels.
tokenizer, model = create_transformer(
    model_name="distilbert-base-uncased",
    num_labels=2,
)

# Tokenize the raw splits via the project helper (max_length=256 is forwarded).
tokenized = prepare_tokenized_datasets(
    raw_datasets,
    tokenizer,
    max_length=256,
)
print(tokenized["train"])


ImportError: 
AutoModelForSequenceClassification requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFAutoModelForSequenceClassification".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


## 3) Entrenamiento con Hugging Face Trainer

In [None]:

# %% [train]
# HfTrainConfig and train_and_eval are imported in the setup cell at the top
# of the notebook; they are not re-imported here so that all imports stay in
# one place.

# Hyper-parameters for the fine-tuning run.
cfg = HfTrainConfig(
    model_name="distilbert-base-uncased",
    output_dir="results/transformers_imdb",
    epochs=3,
    batch_size=16,
    lr=5e-5,
    weight_decay=0.01,
    max_length=256
)

# train_and_eval hands back six values that the evaluation and visualisation
# cells below consume (metrics dict, true/predicted labels, timing, ...).
trainer, history, test_metrics, y_true, y_pred, training_time = train_and_eval(model, tokenizer, tokenized, cfg)

# Learning curves (per the original note, `history` is "Keras-like").
plot_training_history(history, NOTEBOOK_NAME, MODEL_NAME)


## 4) Evaluación final y guardado en `results/summary.csv`

In [None]:

# %% [eval]
# Read the three headline test metrics; any key that is absent from
# test_metrics falls back to 0.0.
test_loss, test_acc, test_f1 = (
    float(test_metrics.get(metric_key, 0.0))
    for metric_key in ("eval_loss", "eval_accuracy", "eval_f1")
)

print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Loss:     {test_loss:.4f}")
print(f"Test F1:       {test_f1:.4f}")

# Parameter count: use the model's own num_parameters() when it has one,
# otherwise add up the element counts of model.parameters().
if hasattr(model, "num_parameters"):
    model_params = int(model.num_parameters())
else:
    model_params = sum(weight.numel() for weight in model.parameters())

# Bundle the run's results and hand them to the summary-CSV helper.
results = save_training_results(
    history=history,
    test_accuracy=test_acc,
    test_loss=test_loss,
    f1=test_f1,
    model_name=MODEL_NAME,
    notebook_name=NOTEBOOK_NAME,
    training_time=training_time,
    model_params=model_params,
)
save_summary_csv(results, NOTEBOOK_NAME)


## 5) Matriz de confusión y ejemplos "buenos/malos"

In [None]:

# %% [viz_examples]
# Confusion matrix for the test predictions returned by the training cell.
plot_confusion_matrix(
    y_true=y_true, y_pred=y_pred,
    class_names=["negativo", "positivo"],
    notebook_name=NOTEBOOK_NAME, model_name=MODEL_NAME,
)

# Ejemplos "buenos" (bien clasificados con alta confianza) y "malos" (mal clasificados)
import torch
from torch.nn.functional import softmax

def predict_proba(texts, batch_size=8, max_length=256):
    """Return softmax class probabilities for a sequence of raw texts.

    Uses the notebook-level ``model`` and ``tokenizer``. The model is switched
    to eval mode (and left in it) and inference runs without gradients.

    Args:
        texts: Sliceable sequence of strings to classify.
        batch_size: Number of texts tokenized and scored per forward pass.
        max_length: Truncation length forwarded to the tokenizer. The default
            is the value that was previously hard-coded here.

    Returns:
        A NumPy array with one row per text, holding the softmax of the
        model's logits over the last dimension.

    Raises:
        ValueError: If ``texts`` is empty (``np.vstack`` has nothing to stack).
    """
    model.eval()
    # The model may live on a GPU after training while the tokenizer always
    # produces CPU tensors; inputs must be moved to the model's device or the
    # forward pass fails with a device-mismatch error.
    device = next(model.parameters()).device
    probs_all = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = list(texts[start:start + batch_size])
            enc = tokenizer(
                batch,
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors="pt",
            )
            enc = {name: tensor.to(device) for name, tensor in enc.items()}
            outputs = model(**enc)
            # Bring the result back to the CPU before converting to NumPy.
            probs_all.append(softmax(outputs.logits, dim=-1).cpu().numpy())
    return np.vstack(probs_all)

# Score a slice of the test split (not all of it) to limit RAM and run time.
# NOTE(review): the IMDB test split on the Hub is stored grouped by label; if
# load_imdb_splits does not shuffle it, the first 2000 rows may contain a
# single class — confirm, and sample randomly if so.
test_texts = raw_datasets["test"]["text"]
probs = predict_proba(test_texts[:2000])
preds = probs.argmax(axis=1)   # predicted class per text
conf  = probs.max(axis=1)      # probability of the predicted class

# Read the label column once and compare as arrays, instead of fetching and
# slicing the column separately for each comparison.
test_labels = np.asarray(raw_datasets["test"]["label"][:len(preds)])
is_correct = preds == test_labels

indices = np.arange(len(preds))
correct = indices[is_correct]
wrong   = indices[~is_correct]

# Correct predictions: highest confidence first.
# Wrong predictions: lowest confidence first. Keep five of each.
good_idx = correct[np.argsort(-conf[correct])][:5]
bad_idx  = wrong[np.argsort(conf[wrong])][:5]

def show_examples(idx_list, title):
    """Print a titled list of test examples with true and predicted labels.

    Reads the notebook-level ``test_texts``, ``raw_datasets``, ``preds`` and
    ``conf``.

    Args:
        idx_list: Iterable of integer positions into the test split.
        title: Heading printed between two 80-character rules.

    Each example prints its index, true label, predicted label, confidence and
    the first 800 characters of the text (newlines replaced by spaces, with
    "..." appended when the text was cut).
    """
    rule = "=" * 80
    print("\n" + rule)
    print(title)
    print(rule)
    # Fetch the label column once rather than on every loop iteration.
    labels = raw_datasets["test"]["label"]
    for i in idx_list:
        text = test_texts[i]
        snippet = text[:800].replace("\n", " ")
        true_label = "positivo" if labels[i] == 1 else "negativo"
        pred_label = "positivo" if preds[i] == 1 else "negativo"
        print(f"[i={i}] true={true_label} | pred={pred_label} | conf={conf[i]:.3f}")
        print(snippet + ("..." if len(text) > 800 else ""))
        print("-" * 80)

# First the confident hits, then the least-confident misses.
show_examples(
    idx_list=good_idx,
    title="EJEMPLOS BUENOS (alta confianza & correctos)",
)
show_examples(
    idx_list=bad_idx,
    title="EJEMPLOS MALOS (baja confianza & incorrectos)",
)
