In [1]:
import os, json, time
import numpy as np
import torch
from typing import List, Dict

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, classification_report
from datasets import load_dataset, DatasetDict, Sequence
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments
)


In [2]:
# Reference label vocabularies for the two GoEmotions configs handled here.
# None of the cells shown in this notebook read these constants; label names
# are taken from the loaded dataset's own `labels` feature instead.
RAW_LABELS = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]

# NOTE(review): this list has 21 entries versus 28 in RAW_LABELS -- confirm it
# matches the label feature of the "simplified" config before relying on it.
SIMPLIFIED_LABELS = [
    "admiration", "amusement", "anger", "approval",
    "caring", "confusion", "curiosity", "disappointment",
    "disapproval", "embarrassment", "excitement", "fear",
    "gratitude", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "sadness",
    "neutral",
]


In [3]:
def detect_label_schema(ds, config: str):
    """
    Return the label type (single vs multi) and label names for a given config.

    Args:
        ds: Mapping of split name to dataset; only ``ds["train"].features["labels"]``
            is read.
        config: Dataset config name, either ``"raw"`` or ``"simplified"``.

    Returns:
        A ``(label_type, label_names)`` tuple where ``label_type`` is
        ``"multi-label"`` for ``"raw"`` and ``"single-label"`` for ``"simplified"``.

    Raises:
        ValueError: If ``config`` is neither ``"raw"`` nor ``"simplified"``.
    """
    label_types = {"raw": "multi-label", "simplified": "single-label"}
    if config not in label_types:
        raise ValueError(f"Unsupported config: {config}")

    label_feature = ds["train"].features["labels"]
    # A sequence-of-class-labels feature exposes the class names on `.feature`,
    # while a plain class-label feature exposes them directly. Accept both, the
    # same way the loading cell resolves `label_names`.
    class_label = getattr(label_feature, "feature", label_feature)
    return label_types[config], class_label.names


In [4]:
def ensure_validation(ds: DatasetDict, seed: int, val_frac: float) -> DatasetDict:
    """
    Guarantee that the returned dataset dict has a "validation" split.

    If ``ds`` already contains "validation" it is returned unchanged. Otherwise
    a fraction ``val_frac`` of "train" is held out (using ``seed``) and a new
    dict with exactly "train", "validation" and "test" is built. When ``ds``
    has no "test" split, the held-out slice is used for both "validation" and
    "test". Any other splits in ``ds`` are not carried over.
    """
    if "validation" not in ds:
        parts = ds["train"].train_test_split(test_size=val_frac, seed=seed)
        held_out = parts["test"]
        test_split = ds["test"] if "test" in ds else held_out
        ds = DatasetDict(
            train=parts["train"],
            validation=held_out,
            test=test_split,
        )
    return ds


In [5]:
def preprocess_multilabel(batch, tokenizer, label_names):
    """
    Tokenize a batch and turn each example's label-id list into a multi-hot row.

    Args:
        batch: Mapping with a "text" list and a "labels" list of id lists.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True, max_length=128)``.
        label_names: Label vocabulary; only its length is used (row width).

    Returns:
        The tokenizer output with "labels" set to a list of float rows, one per
        text. Ids outside ``0..len(label_names)-1`` are silently ignored.
    """
    encoded = tokenizer(batch["text"], truncation=True, max_length=128)

    num_classes = len(label_names)
    multi_hot = np.zeros((len(batch["text"]), num_classes), dtype=np.float32)
    for row_idx, label_ids in enumerate(batch["labels"]):
        for class_idx in label_ids:
            if class_idx < 0 or class_idx >= num_classes:
                continue  # id not in the label vocabulary: skip it
            multi_hot[row_idx, class_idx] = 1.0

    encoded["labels"] = multi_hot.tolist()
    return encoded


def preprocess_singlelabel(batch, tokenizer):
    """
    Tokenize a batch and reduce every label entry to a single integer.

    A label given as a list contributes only its first element (any further
    elements are dropped; an empty list raises ``IndexError``). Anything else
    is passed through ``int()``.
    """
    encoded = tokenizer(batch["text"], truncation=True, max_length=128)

    flat_labels = []
    for raw_label in batch["labels"]:
        if isinstance(raw_label, list):
            raw_label = raw_label[0]
        flat_labels.append(int(raw_label))
    encoded["labels"] = flat_labels

    return encoded


In [6]:
# --- CONFIG --- (you can replace these with args)
dataset_name = "go_emotions"
dataset_config = "simplified"  # "raw" or "simplified"
# Alternative checkpoints:
#   "distilbert-base-uncased", "albert-base-v2", "google/mobilebert-uncased"
model_name = "nreimers/MiniLM-L6-H384-uncased"

val_frac = 0.1
seed = 42

# --- Load + preprocess ---
ds = load_dataset(dataset_name, dataset_config)
ds = ensure_validation(ds, seed=seed, val_frac=val_frac)

# Decide label type from config, but get class names/count from the dataset itself
label_type = "multi-label" if dataset_config == "raw" else "single-label"

# The labels feature is either a sequence wrapping a class-label feature
# (names live on `.feature`) or a class-label feature itself.
label_feature = ds["train"].features["labels"]
label_names = getattr(label_feature, "feature", label_feature).names
num_labels = len(label_names)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the preprocessing helpers defined in the earlier cell rather than
# redefining same-named closures here (which would silently shadow them with
# a different signature). Their extra arguments are supplied via `fn_kwargs`.
if label_type == "multi-label":
    remove_cols = [c for c in ds["train"].column_names if c != "text"]
    encoded = ds.map(
        preprocess_multilabel,
        batched=True,
        remove_columns=remove_cols,
        fn_kwargs={"tokenizer": tokenizer, "label_names": label_names},
    )
else:
    # NOTE: in single-label mode a list-valued label keeps only its first id.
    remove_cols = [c for c in ds["train"].column_names if c not in ["text", "labels"]]
    encoded = ds.map(
        preprocess_singlelabel,
        batched=True,
        remove_columns=remove_cols,
        fn_kwargs={"tokenizer": tokenizer},
    )

# Sanity checks
tr_labels = encoded["train"]["labels"]
if label_type == "single-label":
    assert all(isinstance(x, int) for x in tr_labels), "Labels must be ints for single-label."
    assert max(tr_labels) < num_labels and min(tr_labels) >= 0, "Found label outside 0..num_labels-1."
else:
    arr = np.asarray(tr_labels, dtype=np.float32)
    assert arr.ndim == 2 and arr.shape[1] == num_labels, "Multi-hot shape mismatch."


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [7]:
# Select the classification-head behaviour that matches the label encoding.
if label_type == "multi-label":
    problem_type = "multi_label_classification"
else:
    problem_type = "single_label_classification"

# Index <-> name mappings stored in the model config.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type=problem_type,
    id2label=id2label,
    label2id=label2id,
)


pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nreimers/MiniLM-L6-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# One output folder per (model, dataset config); "/" in hub ids is not path-safe.
safe_model_name = model_name.replace("/", "_")
output_dir = f"./outputs_{safe_model_name}_{dataset_config}"
os.makedirs(output_dir, exist_ok=True)

training_args = TrainingArguments(
    output_dir=output_dir,
    seed=seed,
    # batching
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # optimisation
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    # logging
    logging_steps=100,
    report_to="none",
)


In [9]:
def compute_metrics(eval_pred):
    """
    Compute micro/macro/weighted F1 and dump a per-class report to disk.

    Reads the module-level ``label_type``, ``label_names`` and ``output_dir``.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Dict with keys ``"f1_micro"``, ``"f1_macro"`` and ``"f1_weighted"``.

    Side effects:
        Writes ``classification_report.json`` into ``output_dir``. The file is
        opened in write mode on every call, so it always reflects only the most
        recent evaluation.
    """
    logits, labels = eval_pred
    if label_type == "multi-label":
        # Independent sigmoid per class, thresholded at 0.5.
        probs = 1 / (1 + np.exp(-logits))
        preds = (probs >= 0.5).astype(int)
    else:
        preds = np.argmax(logits, axis=1)

    scores = {
        f"f1_{average}": f1_score(labels, preds, average=average, zero_division=0)
        for average in ("micro", "macro", "weighted")
    }

    # Pass the full label index range explicitly: without `labels`, the report
    # infers classes from the data and raises ValueError when a class is absent
    # from both references and predictions (its count no longer matches
    # `target_names`).
    rep = classification_report(
        labels,
        preds,
        labels=list(range(len(label_names))),
        target_names=label_names,
        output_dict=True,
        zero_division=0,
    )
    with open(os.path.join(output_dir, "classification_report.json"), "w") as f:
        json.dump(rep, f, indent=2)
    return scores


In [10]:
# Pad each batch dynamically; round padded length up to a multiple of 8 only when CUDA is available.
pad_multiple = 8 if torch.cuda.is_available() else None
collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=pad_multiple,
)


In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    # data
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    data_collator=collator,
    # passing the tokenizer as processing_class quiets the warning
    processing_class=tokenizer,
    # evaluation
    compute_metrics=compute_metrics,
)


In [12]:
# Fine-tune, then score the Trainer's default eval dataset.
train_metrics = trainer.train()
val_metrics = trainer.evaluate()

# Score the test split as well when one is available.
if "test" in encoded:
    test_metrics = trainer.evaluate(encoded["test"])
else:
    test_metrics = {}

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Step,Training Loss
100,2.8873
200,2.5946
300,2.4028
400,2.259
500,2.176
600,2.0891
700,2.0935
800,1.9916
900,1.9193
1000,1.8919


In [13]:
def count_trainable_parameters(model: torch.nn.Module) -> int:
    """Return the total number of elements across parameters with ``requires_grad`` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def measure_latency(model, tokenizer, device: str, max_length: int = 128, batch_size: int = 32, iters: int = 30):
    """
    Measure average tokenize-plus-forward latency for one fixed batch.

    The model is switched to eval mode and run under ``torch.no_grad()``. Five
    untimed warm-up passes are followed by ``iters`` timed passes. Tokenization
    is part of each timed pass. The model is not moved: it is expected to
    already live on ``device``.

    Args:
        model: Callable module invoked as ``model(**encoded_batch)``.
        tokenizer: Callable returning an object with ``.to(device)``.
        device: Target device for the inputs, e.g. ``"cpu"`` or ``"cuda"``.
        max_length: Truncation length passed to the tokenizer.
        batch_size: Number of identical sentences in the batch.
        iters: Number of timed passes; must be positive.

    Returns:
        Mean latency per batch in milliseconds.

    Raises:
        ValueError: If ``iters`` is not positive.
    """
    if iters <= 0:
        raise ValueError("iters must be a positive integer")

    target = torch.device(device)
    on_cuda = target.type == "cuda"

    def _sync():
        # CUDA kernels launch asynchronously; wait for them so the timer
        # measures finished work rather than just launch overhead.
        if on_cuda:
            torch.cuda.synchronize(target)

    model.eval()
    sents = ["This is a sample sentence about feelings."] * batch_size

    def _run_once():
        batch = tokenizer(
            sents, return_tensors="pt", padding=True, truncation=True, max_length=max_length
        ).to(device)
        return model(**batch)

    with torch.no_grad():
        for _ in range(5):  # warmup
            _run_once()
        _sync()
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        start = time.perf_counter()
        for _ in range(iters):
            _run_once()
        _sync()
        end = time.perf_counter()
    return (end - start) * 1000.0 / iters  # ms

# Settings recorded next to the metrics. Optimisation hyperparameters are read
# back from `training_args` so the saved record cannot drift from what the
# Trainer actually used.
run_args = {
    "model_name": model_name,
    "dataset_name": dataset_name,
    "dataset_config": dataset_config,
    "max_length": 128,  # tokenizer truncation length used during preprocessing
    "batch_size": training_args.per_device_train_batch_size,
    "epochs": training_args.num_train_epochs,
    "lr": training_args.learning_rate,
    "weight_decay": training_args.weight_decay,
    "output_dir": output_dir,
    "seed": seed,
    "eval_threshold": 0.5,  # sigmoid cut-off applied in multi-label evaluation
    "report_to": "none",
    "val_frac": val_frac,
}

with open(os.path.join(output_dir, "metrics.json"), "w") as f:
    json.dump({
        "train": train_metrics.metrics,
        "val": val_metrics,
        "test": test_metrics,
        "label_names": label_names,
        "args": run_args,
    }, f, indent=2)

# Efficiency snapshot: model size and average latency for a batch of 32.
params = count_trainable_parameters(model)
latency = measure_latency(model, tokenizer, device="cuda" if torch.cuda.is_available() else "cpu")

with open(os.path.join(output_dir, "efficiency_snapshot.json"), "w") as f:
    json.dump({
        "trainable_params": int(params),
        "avg_latency_ms_per_batch32": float(latency)
    }, f, indent=2)

print("✅ All done. Results saved to:", output_dir)


✅ All done. Results saved to: ./outputs_nreimers_MiniLM-L6-H384-uncased_simplified
