In [1]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer




2025-06-03 23:25:37.711008: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-03 23:25:37.711223: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-03 23:25:37.838945: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-03 23:25:38.101448: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# ============================================================
# finetune_emotion_es.py
# Fine-tuning de “daveni/twitter-xlm-roberta-emotion-es”
# sobre tu dataset de tweets en español con etiquetas emocionales
# ============================================================

# 1) ──────────────── LIBRERÍAS ──────────────────────────────
from datasets   import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd, numpy as np, re, torch, pathlib, csv, os

# 2) ──────────────── CARGA + LIMPIEZA DEL CSV ───────────────
def detect_delimiter(path):
    """Detecta automáticamente ',', ';' o tab."""
    with open(path, "r", encoding="utf-8") as f:
        sample = "".join([f.readline() for _ in range(5)])
    return csv.Sniffer().sniff(sample, delimiters=[",", ";", "\t"]).delimiter

CSV_PATH = "dataset.csv"
delimiter = detect_delimiter(CSV_PATH)

def clean_tweet(text: str) -> str:
    """• Quita URLs, @menciones, hashtags y tokens HASHTAG/USER
       • Colapsa espacios múltiples."""
    text = re.sub(r"https?://\S+", " ", text)            # URLs
    text = re.sub(r"@\w+", " ", text)                    # @menciones reales
    text = re.sub(r"#\w+", " ", text)                    # hashtags reales (#libros)
    text = re.sub(r"\b(HASHTAG|USER)\b", " ", text, flags=re.I)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

df = pd.read_csv(CSV_PATH, delimiter=delimiter)
df = df.dropna(subset=["tweet", "label"])
df["tweet"] = df["tweet"].astype(str).apply(clean_tweet)

# 3) ──────────────── ENCODE DE ETIQUETAS ───────────────────
labels     = sorted(df["label"].unique())           # p.e. ['anger','joy',...]
label2id   = {lab: idx for idx, lab in enumerate(labels)}
id2label   = {idx: lab for lab, idx in label2id.items()}
df["label"] = df["label"].map(label2id)            

# 4) ──────────────── SPLIT TRAIN / VALID ───────────────────
train_df = df.sample(frac=0.8, random_state=42)
val_df   = df.drop(train_df.index)

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds   = Dataset.from_pandas(val_df.reset_index(drop=True))

# 5) ──────────────── TOKENIZADOR + MODELO ──────────────────
MODEL_NAME = "daveni/twitter-xlm-roberta-emotion-es"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model     = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# 6) ──────────────── TOKENIZACIÓN ──────────────────────────
def tokenize(batch):
    return tokenizer(
        batch["tweet"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

train_ds = train_ds.map(tokenize, batched=True)
val_ds   = val_ds.map(tokenize,   batched=True)

cols = ["input_ids", "attention_mask", "label"]
train_ds.set_format(type="torch", columns=cols)
val_ds.set_format(  type="torch", columns=cols)

# 7) ──────────────── ARGUMENTOS DE ENTRENAMIENTO ───────────
training_args = TrainingArguments(
    output_dir            = "./emotion_model",
    evaluation_strategy   = "epoch",
    save_strategy         = "epoch",
    logging_strategy      = "epoch",
    learning_rate         = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size  = 16,
    num_train_epochs      = 3,
    weight_decay          = 0.01,
    load_best_model_at_end= True,
    metric_for_best_model = "accuracy",
)

# 8) ──────────────── MÉTRICAS PERSONALIZADAS ───────────────
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1":       f1_score(labels, preds, average="weighted"),
    }

# 9) ──────────────── TRAINER Y FINE-TUNING ────────────────
trainer = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = train_ds,
    eval_dataset    = val_ds,
    tokenizer       = tokenizer,
    compute_metrics = compute_metrics,
)

trainer.train()

# 🔽 Guarda la versión fine-tuneada
trainer.save_model("./emotion_model")
tokenizer.save_pretrained("./emotion_model")

print("\n✅ Fine-tuning completado y modelo guardado en ./emotion_model")


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Map:   0%|          | 0/12122 [00:00<?, ? examples/s]

Map:   0%|          | 0/3030 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.736,0.55608,0.821452,0.807654
2,0.3999,0.468874,0.852145,0.842183
3,0.2457,0.470041,0.874917,0.865042



✅ Fine-tuning completado y modelo guardado en ./emotion_model


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given