# Proyecto 5 — Transformers con IMDB




## 0) Preparación e instalación de librerías

In [1]:
# --- For THIS notebook only: force Transformers NOT to use TensorFlow ---
# Both variables are set before the first `transformers` import below, so they
# are already in place when the library is loaded.
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["USE_TF"] = "0"

# NOTE: the training cell uses `Trainer`, which with PyTorch requires
# `accelerate>=0.26.0` (it raises ImportError otherwise). Install it into the
# kernel environment beforehand, e.g. `%pip install "accelerate>=0.26.0"`.

# Optional check: actually report whether Transformers still sees TensorFlow
# (importing the helper alone, as before, verified nothing).
from transformers.utils import is_tf_available
print("TensorFlow disponible para Transformers:", is_tf_available())


## 1) Imports y carpetas de trabajo

In [2]:
import os, time, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

# Make sure the folder that receives every artifact of this notebook exists.
os.makedirs("../results/5_transformer", exist_ok=True)

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Usando dispositivo:", device)

Usando dispositivo: cpu


## 2) Dataset (IMDB) — subset chico

In [3]:
# Load IMDB; a small subset of it is used further down so the notebook runs
# quickly on any PC.
dataset = load_dataset("imdb")

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


def tokenize(batch):
    """Encode the ``text`` entries of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens.

    Args:
        batch: mapping that holds the raw texts under the ``"text"`` key.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 256}
    return tokenizer(batch["text"], **encoding_options)

# Subsets (adjust the sizes if your PC allows it). The rows are picked BEFORE
# tokenizing so that only the reviews that are actually used get encoded,
# instead of mapping the tokenizer over every split of the full dataset.
# Same seed and same split length as before, so the selected rows are unchanged.
train_raw = dataset["train"].shuffle(seed=42).select(range(5000))   # 5k train
test_raw  = dataset["test"].shuffle(seed=42).select(range(2000))    # 2k test


def _encode_split(split):
    """Tokenize one split, rename ``label`` to ``labels`` and return it torch-formatted.

    Only ``input_ids``, ``attention_mask`` and ``labels`` are exposed as tensors.
    """
    encoded = split.map(tokenize, batched=True)
    encoded = encoded.rename_column("label", "labels")
    return encoded.with_format("torch", columns=["input_ids", "attention_mask", "labels"])


train_ds = _encode_split(train_raw)
test_ds = _encode_split(test_raw)

len(train_ds), len(test_ds)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

(5000, 2000)

## 3) Modelo (BERT base)

In [4]:
# Pretrained BERT encoder with a 2-label classification head, moved to the
# device selected earlier in the notebook.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model = model.to(device)

# Parameter count is kept in a variable because it is stored with the metrics later.
params_count = model.num_parameters()
print("Parámetros del modelo:", params_count)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parámetros del modelo: 109483778


## 4) Métricas

In [5]:
def compute_metrics(eval_pred):
    """Compute accuracy and F1 from an evaluation result.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each row
            is the arg-max of the logits along axis 1.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1": f1_score(references, predicted),
    }

## 5) Entrenamiento

In [6]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Hyperparameters (parameters compatible with older library versions).
hyperparams = {
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 2,
    "weight_decay": 0.01,
    "logging_steps": 50,
}

training_args = TrainingArguments(
    output_dir="../results/5_transformer/checkpoints",
    logging_dir="../results/5_transformer/logs",
    **hyperparams,
)

# Collator that pads each batch using the notebook's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Wall-clock timing of the whole training run; the rounded value is reused
# when the metrics are saved.
t0 = time.time()
trainer.train()
t1 = time.time()

elapsed = t1 - t0
train_time_sec = round(elapsed, 1)
print("Tiempo total de entrenamiento (s):", train_time_sec)


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

## 6) Evaluación y matriz de confusión

In [None]:
# Aggregate metrics on the test subset.
metrics = trainer.evaluate(test_ds)
print("Métricas de evaluación:", metrics)

# Per-example predictions: predicted class = arg-max over the class axis.
pred_out = trainer.predict(test_ds)
y_pred = np.argmax(pred_out.predictions, axis=1)
y_true = np.array(test_ds["labels"])

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
fig = disp.figure_
fig.suptitle("Matriz de confusión — IMDB (BERT base)")

# Save the figure
cm_path = "../results/5_transformer/confusion_matrix.png"
fig.savefig(cm_path, bbox_inches="tight")
print("Guardado:", cm_path)

## 7) Curvas de aprendizaje (loss vs epoch)

In [None]:
# Pull the training history out of the Trainer.
history = trainer.state.log_history

# Keep only the entries that carry a training loss / an evaluation loss,
# as (epoch, loss) pairs.
train_losses = [(h['epoch'], h['loss']) for h in history if 'loss' in h and 'epoch' in h]
eval_losses  = [(h['epoch'], h['eval_loss']) for h in history if 'eval_loss' in h and 'epoch' in h]


def _plot_loss_curve(points, title, out_path, empty_message):
    """Plot one loss-vs-epoch curve, save it to ``out_path`` and show it.

    Args:
        points: list of ``(epoch, loss)`` pairs.
        title: figure title.
        out_path: where the PNG is written.
        empty_message: text printed instead of plotting when ``points`` is empty.

    The figure is created only when there is something to draw, so an empty
    history no longer leaves a blank figure open.
    """
    if not points:
        print(empty_message)
        return
    epochs, losses = zip(*points)
    fig, ax = plt.subplots()
    ax.plot(epochs, losses, marker='o')
    ax.set(title=title, xlabel="Epoch", ylabel="Loss")
    fig.savefig(out_path, bbox_inches="tight")
    print("Guardado:", out_path)
    plt.show()


# One figure per curve, no specific styles or colors.
tr_path = "../results/5_transformer/training_loss.png"
ev_path = "../results/5_transformer/eval_loss.png"

# Training
_plot_loss_curve(train_losses, "Training loss vs epoch", tr_path,
                 "No hay logs de training loss en el historial.")

# Evaluation
_plot_loss_curve(eval_losses, "Eval loss vs epoch", ev_path,
                 "No hay logs de eval loss en el historial.")

## 8) Ejemplos correctos e incorrectos

In [None]:
# Take the review texts from the SAME dataset object that produced
# y_true / y_pred. `test_ds` is a shuffled selection of the test split, so
# slicing the first rows of a freshly loaded, unshuffled split (as before)
# paired every label and prediction with the wrong review.
# `with_format(None)` drops the torch formatting so the raw text column is returned.
review_texts = list(test_ds.with_format(None)["text"])
assert len(review_texts) == len(y_pred), "texts and predictions are misaligned"

df_preds = pd.DataFrame({
    "text": review_texts,
    "true": y_true,
    "pred": y_pred
})

# Boolean mask: True where the prediction matches the true label.
is_hit = df_preds["true"] == df_preds["pred"]
correctos = df_preds[is_hit].head(5)
incorrectos = df_preds[~is_hit].head(5)

print("Ejemplos correctos (primeros 5):")
display(correctos[["text","true","pred"]])

print("\nEjemplos incorrectos (primeros 5):")
display(incorrectos[["text","true","pred"]])

# Save to CSV
ex_path = "../results/5_transformer/examples.csv"
df_preds.to_csv(ex_path, index=False, encoding="utf-8")
print("Guardado:", ex_path)

## 9) Guardar métricas y actualizar summary.csv

In [None]:
# Guardar métricas detalladas
metrics_full = {
    "accuracy": float(metrics.get("eval_accuracy", 0.0)),
    "f1": float(metrics.get("eval_f1", 0.0)),
    "loss": float(metrics.get("eval_loss", 0.0)),
    "epochs": 2,
    "params": int(params_count),
    "train_time_sec": float(train_time_sec)
}

metrics_path = "../results/5_transformer/metrics.json"
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump(metrics_full, f, ensure_ascii=False, indent=2)
print("Guardado:", metrics_path)

# Agregar/actualizar summary.csv
summary_path = "../results/summary.csv"
row = {
    "task": "Transformer-IMDB",
    "model_name": "bert-base-uncased",
    "accuracy": metrics_full["accuracy"],
    "f1": metrics_full["f1"],
    "loss": metrics_full["loss"],
    "epochs": metrics_full["epochs"],
    "params": metrics_full["params"],
    "train_time_sec": metrics_full["train_time_sec"],
    "notes": "Fine-tune en subset IMDB (5000 train / 2000 test)"
}

if os.path.exists(summary_path):
    df = pd.read_csv(summary_path)
    df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
else:
    df = pd.DataFrame([row])

df.to_csv(summary_path, index=False, encoding="utf-8")
print("Actualizado:", summary_path)
df.tail(5)