In [1]:
import os, json, time
import numpy as np
import torch
from typing import List, Dict

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, classification_report
from datasets import load_dataset, DatasetDict, Sequence
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments
)


In [2]:
# All supported GoEmotions schemas
RAW_LABELS = [
    "admiration","amusement","anger","annoyance","approval","caring","confusion",
    "curiosity","desire","disappointment","disapproval","disgust","embarrassment",
    "excitement","fear","gratitude","grief","joy","love","nervousness","optimism",
    "pride","realization","relief","remorse","sadness","surprise","neutral"
]

SIMPLIFIED_LABELS = [
    "admiration","amusement","anger","approval","caring","confusion",
    "curiosity","disappointment","disapproval","embarrassment","excitement","fear",
    "gratitude","joy","love","nervousness","optimism","pride","realization","sadness","neutral"
]


In [3]:
def detect_label_schema(ds, config: str):
    """
    Return the label type (single vs multi) and label names for a given config.
    """
    if config == "raw":
        return "multi-label", ds["train"].features["labels"].feature.names #RAW_LABELS
    elif config == "simplified":
        return "single-label", ds["train"].features["labels"].feature.names #SIMPLIFIED_LABELS
    else:
        raise ValueError(f"Unsupported config: {config}")


In [4]:
def ensure_validation(ds: DatasetDict, seed: int, val_frac: float) -> DatasetDict:
    if "validation" in ds:
        return ds
    split = ds["train"].train_test_split(test_size=val_frac, seed=seed)
    return DatasetDict(
        train=split["train"],
        validation=split["test"],
        test=ds["test"] if "test" in ds else split["test"]
    )


In [5]:
dataset_name = "go_emotions"
dataset_config = "simplified"  # "raw" or "simplified"
model_name = "distilbert-base-uncased"
#model_name = "albert-base-v2"
#model_name = "google/mobilebert-uncased"
#model_name = "nreimers/MiniLM-L6-H384-uncased"

val_frac = 0.1
seed = 42

# --- Load + preprocess ---
ds = load_dataset(dataset_name, dataset_config)
ds = ensure_validation(ds, seed=seed, val_frac=val_frac)

# -------------------- RAW vs SIMPLIFIED handling --------------------

train_cols = ds["train"].column_names
raw_has_per_class_cols = all(c in train_cols for c in RAW_LABELS)

tokenizer = AutoTokenizer.from_pretrained(model_name)

if dataset_config == "raw" and raw_has_per_class_cols:
    print("[INFO] Detected RAW schema with per-class columns.")
    label_type  = "multi-label"
    label_names = RAW_LABELS
    num_labels  = len(label_names)

    def preprocess_raw_multilabel(batch):
        enc = tokenizer(batch["text"], truncation=True, max_length=128)
        n = len(batch["text"])
        y = np.zeros((n, num_labels), dtype=np.float32)

        for j, cls in enumerate(label_names):
            col = batch.get(cls)
            if col is None:
                continue
            for i, v in enumerate(col):
                if int(v) != 0:
                    y[i, j] = 1.0
        enc["labels"] = y.tolist()
        return enc

    remove_cols = [c for c in train_cols if c != "text"]
    encoded = ds.map(preprocess_raw_multilabel, batched=True, remove_columns=remove_cols)

else:
    # SIMPLIFIED schema (single label column)
    print("[INFO] Using single-label path (SIMPLIFIED-like schema).")
    possible = [c for c in train_cols if c in ("labels", "label", "label_ids", "label_id")]
    if possible:
        label_col = possible[0]
    else:
        label_col = None
        for n, f in ds["train"].features.items():
            if hasattr(f, "names"):  # ClassLabel
                label_col = n
                break
    if label_col is None:
        raise KeyError("Could not find a single label column for simplified schema.")

    feat = ds["train"].features[label_col]
    if hasattr(feat, "names"):
        label_names = list(feat.names)
    else:
        label_names = SIMPLIFIED_LABELS[:]  # fallback

    def _flatten_ids(values):
        for v in values:
            if isinstance(v, list):
                if len(v) == 0:
                    continue
                yield int(v[0])
            else:
                yield int(v)

    observed = []
    for split in ("train", "validation", "test"):
        if split in ds:
            observed.extend(list(_flatten_ids(ds[split][label_col])))

    if not observed:
        raise ValueError("No labels found in dataset for simplified schema.")

    min_id, max_id = min(observed), max(observed)
    num_labels = len(label_names)

    if min_id < 0:
        raise AssertionError(f"Found negative label id ({min_id}). Dataset is inconsistent.")

    if max_id >= num_labels:
        extra = (max_id + 1) - num_labels
        label_names += [f"class_{i}" for i in range(num_labels, max_id + 1)]
        num_labels = len(label_names)
        print(f"[WARN] Expanded num_labels to {num_labels} to cover max observed id={max_id}")

    print(f"[INFO] label_col='{label_col}', num_labels={num_labels}")
    label_type = "single-label"

    def preprocess_singlelabel(batch):
        enc = tokenizer(batch["text"], truncation=True, max_length=128)
        raw_labs = batch[label_col]
        labs = []
        for v in raw_labs:
            vv = int(v[0]) if isinstance(v, list) and len(v) > 0 else int(v)
            if vv < 0 or vv >= num_labels:
                vv = min(max(vv, 0), num_labels - 1)
            labs.append(vv)
        enc["labels"] = labs
        return enc

    remove_cols = [c for c in train_cols if c not in ("text", label_col)]
    encoded = ds.map(preprocess_singlelabel, batched=True, remove_columns=remove_cols)

# ---- Sanity checks (labels always exist) ----
tr_labels = encoded["train"]["labels"]
if label_type == "single-label":
    assert all(isinstance(x, int) for x in tr_labels), "Labels must be ints for single-label."
    assert max(tr_labels) < num_labels and min(tr_labels) >= 0, "Found label outside 0..num_labels-1."
else:
    arr = np.asarray(tr_labels, dtype=np.float32)
    assert arr.ndim == 2 and arr.shape[1] == num_labels, f"Multi-hot shape mismatch: {arr.shape}"

print("[INFO] Encoded columns:", encoded["train"].column_names)
print(f"[INFO] label_type={label_type}, num_labels={num_labels}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

simplified/train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

simplified/validation-00000-of-00001.par(…):   0%|          | 0.00/350k [00:00<?, ?B/s]

simplified/test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

[INFO] Using single-label path (SIMPLIFIED-like schema).
[WARN] Expanded num_labels to 28 to cover max observed id=27
[INFO] label_col='labels', num_labels=28


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

[INFO] Encoded columns: ['text', 'labels', 'input_ids', 'attention_mask']
[INFO] label_type=single-label, num_labels=28


In [6]:
problem_type = "multi_label_classification" if label_type == "multi-label" else "single_label_classification"

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type=problem_type,
    id2label={i: n for i, n in enumerate(label_names)},
    label2id={n: i for i, n in enumerate(label_names)},
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
output_dir = f"./outputs_{model_name.replace('/', '_')}_{dataset_config}"
os.makedirs(output_dir, exist_ok=True)

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=100,
    report_to="none",
    seed=seed,
)


In [8]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if label_type == "multi-label":
        probs = 1 / (1 + np.exp(-logits))
        preds = (probs >= 0.5).astype(int)
    else:
        preds = np.argmax(logits, axis=1)

    micro = f1_score(labels, preds, average="micro", zero_division=0)
    macro = f1_score(labels, preds, average="macro", zero_division=0)
    weighted = f1_score(labels, preds, average="weighted", zero_division=0)
    rep = classification_report(labels, preds, target_names=label_names, output_dict=True, zero_division=0)
    with open(os.path.join(output_dir, "classification_report.json"), "w") as f:
        json.dump(rep, f, indent=2)
    return {"f1_micro": micro, "f1_macro": macro, "f1_weighted": weighted}


In [14]:
# ==== PTQ helpers (define ONCE) ====

import contextlib, torch.nn as nn

# Stable CPU threading for repeatable latency
torch.set_num_threads(max(1, (os.cpu_count() or 4)//2))

def _is_quantized(m: torch.nn.Module) -> bool:
    # Dynamic quantized Linear modules are CPU-only
    return any("Quantized" in mod.__class__.__name__ for mod in m.modules())

def quantize_dynamic_linear(fp32_model: nn.Module) -> nn.Module:
    return torch.quantization.quantize_dynamic(fp32_model, {nn.Linear}, dtype=torch.qint8)

def _sigmoid(x): return 1.0 / (1.0 + np.exp(-x))
def _binarize(logits, thr=0.5): return (_sigmoid(logits) >= thr).astype(int)

def _to_multihot(item, num_labels: int) -> np.ndarray:
    """Convert one label item to a multi-hot vector of shape (num_labels,)."""
    if item is None:
        return np.zeros(num_labels, dtype=np.float32)

    # List/tuple/array
    if isinstance(item, (list, tuple, np.ndarray)):
        arr = np.array(item, dtype=np.float32).reshape(-1)
        # Case A: already a multi-hot vector of correct length
        if arr.size == num_labels and np.all((arr == 0) | (arr == 1)):
            return arr
        # Case B: treat as list of indices
        vec = np.zeros(num_labels, dtype=np.float32)
        for j in item:
            ji = int(j)
            if 0 <= ji < num_labels:
                vec[ji] = 1.0
        return vec

    # Scalar -> treat as single index
    ji = int(item)
    vec = np.zeros(num_labels, dtype=np.float32)
    if 0 <= ji < num_labels:
        vec[ji] = 1.0
    return vec


def evaluate_on_cpu(model, tokenizer, ds, split="test", label_type="single-label",
                    num_labels=None, label_names=None,
                    max_len=128, batch_size=32, threshold=0.5):
    """
    Pure-CPU eval used for FP32 vs INT8 PTQ comparisons.
    Accepts labels as ints, list of indices, or multi-hot vectors.
    """
    import time, torch
    from sklearn.metrics import f1_score, classification_report

    device = "cpu"
    model.eval().to(device)

    texts  = ds[split]["text"]
    labels = np.asarray(ds[split]["labels"], dtype=object)

    if label_type == "single-label":
        # ints only
        y_true = np.array([int(l[0]) if isinstance(l, list) else int(l) for l in labels], dtype=int)
    else:
        assert num_labels is not None, "Provide num_labels for multi-label."
        # accept multi-hot OR index-lists
        y_true = np.vstack([_to_multihot(l, num_labels) for l in labels]).astype(np.float32)

    # Warmup (not timed)
    with torch.no_grad():
        w = tokenizer(["warmup"], return_tensors="pt", padding=True, truncation=True, max_length=max_len)
        w = {k: v.to(device) for k, v in w.items()}
        _ = model(**w)

    # Timed forward passes
    logits_all, t0 = [], time.time()
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_len)
            enc = {k: v.to(device) for k, v in enc.items()}
            out = model(**enc)
            logits_all.append(out.logits.detach().cpu().numpy())
    latency_sec = time.time() - t0

    logits = np.vstack(logits_all)
    if label_type == "single-label":
        y_pred = np.argmax(logits, axis=1)
    else:
        # thresholding for multilabel
        y_pred = (1.0 / (1.0 + np.exp(-logits)) >= threshold).astype(int)

    rep = classification_report(
        y_true, y_pred, target_names=(label_names or None), zero_division=0, output_dict=True
    )
    return {
        "f1_micro": float(f1_score(y_true, y_pred, average="micro",    zero_division=0)),
        "f1_macro": float(f1_score(y_true, y_pred, average="macro",    zero_division=0)),
        "f1_weighted": float(f1_score(y_true, y_pred, average="weighted", zero_division=0)),
        "latency_sec": float(latency_sec),
        "report": rep,
        "n_examples": int(len(texts)),
    }

def measure_latency(
    model,
    tokenizer,
    *,
    device: str = "cpu",
    max_length: int = 128,
    batch_size: int = 32,
    iters: int = 30,
    warmup: int = 5,
    autocast: bool = False,
    amp_dtype: str = "float16",
) -> float:
    """
    Safe latency timer:
    - Moves model to the chosen device
    - Pre-encodes once (tokenization not timed)
    - Syncs CUDA for correct timings
    - Forces CPU if model is quantized
    """
    model.eval()

    if _is_quantized(model):
        device = "cpu"

    model = model.to(device)

    sents = ["This is a sample sentence about feelings."] * batch_size
    enc = tokenizer(sents, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    enc = {k: v.to(device) for k, v in enc.items()}

    if autocast and device == "cuda":
        cast_ctx = torch.autocast(device_type="cuda",
                                  dtype=(torch.float16 if amp_dtype=="float16" else torch.bfloat16))
    else:
        cast_ctx = contextlib.nullcontext()

    with torch.no_grad(), cast_ctx:
        if device == "cuda":
            torch.cuda.synchronize()
        for _ in range(warmup):
            _ = model(**enc)
        if device == "cuda":
            torch.cuda.synchronize()

        start = time.time()
        for _ in range(iters):
            _ = model(**enc)
        if device == "cuda":
            torch.cuda.synchronize()
        end = time.time()

    return (end - start) * 1000.0 / iters

In [9]:
collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8 if torch.cuda.is_available() else None)


In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    processing_class=tokenizer,   # quiets the warning
    data_collator=collator,
    compute_metrics=compute_metrics,
)


In [11]:
train_metrics = trainer.train()
val_metrics = trainer.evaluate()
test_metrics = trainer.evaluate(encoded["test"]) if "test" in encoded else {}

Step,Training Loss
100,2.5882
200,1.999
300,1.7772
400,1.653
500,1.6071
600,1.5605
700,1.5777
800,1.5533
900,1.4934
1000,1.4924


In [12]:
os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, "metrics.json"), "w") as f:
    json.dump({
        # HF returns a TrainOutput; the .metrics dict is what we want
        "train": getattr(train_metrics, "metrics", {}),
        "val":   val_metrics,            # dict from Trainer.evaluate()
        "test":  test_metrics,           # {} if no test split
        "label_names": list(label_names),
        "args": {
            "model_name": model_name,
            "dataset_name": dataset_name,
            "dataset_config": dataset_config,
            "max_length": 128,
            "batch_size": 32,
            "epochs": 3,
            "lr": 5e-5,
            "weight_decay": 0.01,
            "seed": seed,
        },
    }, f, indent=2)

print("Wrote", os.path.join(output_dir, "metrics.json"))

Wrote ./outputs_distilbert-base-uncased_simplified/metrics.json


In [15]:
# Quantization (PTQ): FP32 vs INT8 on CPU with identical settings
quant_dir = os.path.join(output_dir, "quant_results")
os.makedirs(quant_dir, exist_ok=True)

split_for_ptq = "test" if "test" in encoded else "validation"
thr = 0.5  # keep 0.5 unless you’ve tuned a better threshold on the validation set

# FP32
fp32_metrics = evaluate_on_cpu(
    model, tokenizer, encoded, split=split_for_ptq,
    label_type=label_type, num_labels=num_labels, label_names=label_names,
    max_len=128, batch_size=32, threshold=thr
)

# INT8 (dynamic) copy
qmodel = quantize_dynamic_linear(model)
int8_metrics = evaluate_on_cpu(
    qmodel, tokenizer, encoded, split=split_for_ptq,
    label_type=label_type, num_labels=num_labels, label_names=label_names,
    max_len=128, batch_size=32, threshold=thr
)

# Save side-by-side for reporting
ptq_payload = {
    "model_id_or_path": model_name,     # or use output_dir if you prefer
    "split": split_for_ptq,
    "threshold": thr,
    "max_length": 128,
    "batch_size": 32,
    "fp32": fp32_metrics,
    "int8_dynamic": int8_metrics
}
with open(os.path.join(quant_dir, "ptq_eval.json"), "w") as f:
    json.dump(ptq_payload, f, indent=2)

speedup = fp32_metrics["latency_sec"] / max(int8_metrics["latency_sec"], 1e-9)
print(f"Saved: {os.path.join(quant_dir, 'ptq_eval.json')}")
print(f"FP32 macro={fp32_metrics['f1_macro']:.3f}  INT8 macro={int8_metrics['f1_macro']:.3f}  Speedup×={speedup:.2f}")

For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  return torch.quantization.quantize_dynamic(fp32_model, {nn.Linear}, dtype=torch.qint8)


Saved: ./outputs_distilbert-base-uncased_simplified/quant_results/ptq_eval.json
FP32 macro=0.456  INT8 macro=0.425  Speedup×=0.97


In [16]:
def count_trainable_parameters(model: torch.nn.Module) -> int:
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# ---- Efficiency snapshot (safe) ----
params = count_trainable_parameters(model)

# Always measure CPU latency
latency_cpu_ms = measure_latency(model, tokenizer, device="cpu")

# Optionally measure GPU latency if available
latency_gpu_ms = None
if torch.cuda.is_available():
    # autocast=False for apples-to-apples unless you explicitly want AMP
    latency_gpu_ms = measure_latency(model, tokenizer, device="cuda", autocast=False)

efficiency = {
    "trainable_params": int(params),
    "avg_latency_ms_per_batch32_cpu": float(latency_cpu_ms),
}

if latency_gpu_ms is not None:
    efficiency["avg_latency_ms_per_batch32_gpu"] = float(latency_gpu_ms)

with open(os.path.join(output_dir, "efficiency_snapshot.json"), "w") as f:
    json.dump(efficiency, f, indent=2)

print("✅ All done. Results saved to:", output_dir)
print("Efficiency snapshot:", efficiency)

✅ All done. Results saved to: ./outputs_distilbert-base-uncased_simplified
Efficiency snapshot: {'trainable_params': 66975004, 'avg_latency_ms_per_batch32_cpu': 278.75830332438153, 'avg_latency_ms_per_batch32_gpu': 11.443217595418295}


In [17]:
# Save a concise manifest so the report knows exactly what you ran
manifest = {
    "model_id": model_name,
    "output_dir": output_dir,
    "dataset_name": dataset_name,
    "dataset_config": dataset_config,
    "label_type": label_type,             # "single-label" or "multi-label"
    "num_labels": int(num_labels),
    "label_names": list(label_names),
    "train_args": {
        "epochs": 3,
        "batch_size": 32,
        "lr": 5e-5,
        "weight_decay": 0.01,
        "seed": seed,
    },
    # what split you used for quant; this helps the report
    "ptq": {
        "baseline_dir": os.path.join(output_dir, "quant_baseline"),
        "trained_dir":  os.path.join(output_dir, "quant_results"),
        "split": "test" if "test" in encoded else "validation",
        "threshold": 0.5
    }
}
with open(os.path.join(output_dir, "run_manifest.json"), "w") as f:
    json.dump(manifest, f, indent=2)
print("Wrote run_manifest.json")


Wrote run_manifest.json


In [None]:
import json
ptq_path = os.path.join(output_dir, "quant_results", "ptq_eval.json")
if os.path.exists(ptq_path):
    d = json.load(open(ptq_path))
    pd.DataFrame([{
        "split": d["split"],
        "thr": d["threshold"],
        "FP32 micro": d["fp32"]["f1_micro"],
        "INT8 micro": d["int8_dynamic"]["f1_micro"],
        "FP32 macro": d["fp32"]["f1_macro"],
        "INT8 macro": d["int8_dynamic"]["f1_macro"],
        "FP32 lat (s)": d["fp32"]["latency_sec"],
        "INT8 lat (s)": d["int8_dynamic"]["latency_sec"],
        "Speedup×": d["fp32"]["latency_sec"]/max(d["int8_dynamic"]["latency_sec"], 1e-9),
    }])
else:
    print("No PTQ file found at:", ptq_path)


In [None]:
!pip -q install reportlab

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/2.0 MB[0m [31m59.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!python .\make_report_v2.py --roots .\simplified_models .\raw_models --pdf_path .\goemotions_raw_vs_simplified_report.pdf

Wrote PDF: ./final_goemotions_report.pdf
