In [None]:
# Instala/actualiza y reinicia el kernel antes de pasar a la siguiente celda:
%pip install --upgrade transformers datasets torch torchvision



INFO: pip is looking at multiple versions of torchvision to determine which version is compatible with other requirements. This could take a while.
Collecting torchvision
  Downloading torchvision-0.22.0-cp312-cp312-win_amd64.whl.metadata (6.3 kB)
Downloading torchvision-0.22.0-cp312-cp312-win_amd64.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------------------ --------------------- 0.8/1.7 MB 3.7 MB/s eta 0:00:01
   ------------------------------------ --- 1.6/1.7 MB 4.2 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 3.7 MB/s eta 0:00:00
Installing collected packages: torchvision
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.22.0+cu118
    Uninstalling torchvision-0.22.0+cu118:
      Successfully uninstalled torchvision-0.22.0+cu118
Successfully installed torchvision-0.22.0


In [None]:
# Sanity check after the kernel restart: report the installed transformers
# version, then confirm that the classes used in this notebook can be imported.
import transformers
print(transformers.__version__)
from transformers import (
    PreTrainedModel,
    BertTokenizerFast,
    BertForSequenceClassification,
)
print("Imports OK")


4.52.3
Imports OK


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# 1. Load the preprocessed dataset (cleaned 'text' column plus 'Label')
import os

# Default CSV location; set the TFM_DATA_PATH environment variable to run the
# notebook on another machine without editing this cell.
DATA_PATH = os.environ.get(
    'TFM_DATA_PATH',
    r'C:/Users/carlo/OneDrive/Escritorio/Master/TFM/preprocessed.csv',
)
df = pd.read_csv(DATA_PATH)

# Normalise the text column name if it comes as 'Text'
if 'Text' in df.columns:
    df = df.rename(columns={'Text': 'text'})
if 'text' not in df.columns:
    raise KeyError("El CSV debe contener una columna 'text' (o 'Text').")

# Drop rows without a label or without text. A text that is empty after
# preprocessing is read back from CSV as NaN, which the tokenizer cannot handle.
df = df.dropna(subset=['Label', 'text']).copy()
df['Label'] = df['Label'].astype(str)
df['text'] = df['text'].astype(str)

# Map the string labels to integer ids (sorted order, so the mapping is
# deterministic) and store them in the 'labels' column.
label_list = sorted(df['Label'].unique())
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}
df['labels'] = df['Label'].map(label2id)

# Show the label distribution
print("Distribución de etiquetas:")
print(df['labels'].value_counts())


Distribución de etiquetas:
labels
0    200
1    200
2    200
3    200
Name: count, dtype: int64


In [None]:
# Show how many examples each original (string) label has, to read next to the
# integer-id distribution printed by the loading cell.
label_counts = df['Label'].value_counts()
print("Distribución de etiquetas (original):")
print(label_counts)

Distribución de etiquetas (original):
Label
drug and alcohol     200
early life           200
personality          200
trauma and stress    200
Name: count, dtype: int64


In [None]:
# 2. Train/test split (80/20), stratified so both splits keep the class balance
df_train, df_test = train_test_split(
    df[['text', 'labels']],
    test_size=0.2,
    stratify=df['labels'],
    random_state=42
)
# preserve_index=False: do not carry the shuffled pandas index into the
# datasets as an extra '__index_level_0__' column.
train_ds = Dataset.from_pandas(df_train, preserve_index=False)
test_ds = Dataset.from_pandas(df_test, preserve_index=False)

In [None]:
# 3. Tokenization with BERT
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def tokenize(batch):
    """Tokenize the 'text' field of a batch of examples.

    Texts are truncated to at most 256 tokens. No padding is applied here:
    padding inside a batched ``map`` would be stored in the dataset, whereas
    the ``DataCollatorWithPadding`` given to the Trainer pads each training
    batch to its own longest sequence.

    Args:
        batch: mapping with a 'text' entry holding the texts of the batch.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch['text'], truncation=True, max_length=256)


train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

Map:   0%|          | 0/640 [00:00<?, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

In [None]:
# 4. Build the BERT sequence-classification model.
# One output class per distinct label; both label mappings are handed to the
# model together with the class count.
num_classes = len(label_list)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_classes,
    label2id=label2id,
    id2label=id2label,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 5. Data collator and evaluation metrics
data_collator = DataCollatorWithPadding(tokenizer)


def compute_metrics(p):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        p: object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (true class ids).

    Returns:
        dict with the keys 'accuracy' and 'f1_macro'.
    """
    y_true = p.label_ids
    # The predicted class is the one with the highest score in each row.
    y_pred = np.argmax(p.predictions, axis=1)
    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return {'accuracy': accuracy, 'f1_macro': macro_f1}

In [None]:
# 6. Training configuration
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    do_train=True,
    do_eval=True,
    logging_steps=50
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    # `tokenizer=` is deprecated in recent transformers releases (it triggers
    # a FutureWarning); `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
# 7. Fine-tune the model, then score it on the evaluation split given to the
# Trainer and print the resulting metrics dictionary.
train_result = trainer.train()
metrics = trainer.evaluate()
print(metrics)



Step,Training Loss
50,1.385
100,1.0912
150,0.7327
200,0.4854




{'eval_loss': 0.888877272605896, 'eval_accuracy': 0.66875, 'eval_f1_macro': 0.6645444445818909, 'eval_runtime': 50.0156, 'eval_samples_per_second': 3.199, 'eval_steps_per_second': 0.4, 'epoch': 3.0}


In [None]:
# Run inference on the held-out test split. The split is named `test_ds`;
# the previous `test_dataset` was never defined and raised a NameError.
output = trainer.predict(test_ds)
# Predicted class id = index of the highest score per example.
preds = np.argmax(output.predictions, axis=1)
labels = output.label_ids

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Pin the matrix to the full set of class ids so its shape and row/column
# order always match `label_list`, even if a class is absent from the test
# labels and predictions.
class_ids = list(range(len(label_list)))
cm = confusion_matrix(labels, preds, labels=class_ids)

fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap='Blues')
fig.colorbar(im, ax=ax)
ax.set_title("Matriz de confusión")
ax.set_xlabel("Predicho")
ax.set_ylabel("Real")

# Label the axes with the class names instead of bare integer ids.
ax.set_xticks(class_ids)
ax.set_xticklabels(label_list, rotation=45, ha='right')
ax.set_yticks(class_ids)
ax.set_yticklabels(label_list)

# Write the count inside every cell; use white text on dark cells.
threshold = cm.max() / 2
for i in class_ids:
    for j in class_ids:
        ax.text(j, i, cm[i, j], ha='center', va='center',
                color='white' if cm[i, j] > threshold else 'black')

fig.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc
from scipy.special import softmax
import matplotlib.pyplot as plt

# Turn the per-class scores into probabilities (each row sums to 1).
probs = softmax(output.predictions, axis=1)

# The task is multi-class, so draw one one-vs-rest ROC curve per class.
# (The previous single-curve version used an undefined `positive_label`.)
fig, ax = plt.subplots()
roc_auc = {}
for class_id, class_name in id2label.items():
    fpr, tpr, _ = roc_curve(labels == class_id, probs[:, class_id])
    roc_auc[class_name] = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=f"{class_name} (AUC = {roc_auc[class_name]:.2f})")

# Diagonal = chance-level classifier.
ax.plot([0, 1], [0, 1], "--")
ax.set_title("Curva ROC")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
plt.show()
