In [1]:
# === Cell 0: Load your dataset ===
from datasets import load_dataset

raw_dataset = load_dataset("go_emotions", "simplified", cache_dir="./data")

label_names = raw_dataset["train"].features["labels"].feature.names if "feature" in str(type(raw_dataset["train"].features["labels"])) else []
num_labels = len(label_names) if label_names else (
    raw_dataset["train"].features["labels"].num_classes
    if hasattr(raw_dataset["train"].features["labels"], "num_classes")
    else len(set(sum(raw_dataset["train"]["labels"], [])))  # fallback for multi-label
)
label_type = "multi-label" if isinstance(raw_dataset["train"].features["labels"], list) or len(raw_dataset["train"]["labels"][0]) > 1 else "single-label"

print(f"Loaded dataset with {len(raw_dataset['train'])} train samples.")
print(f"Label type: {label_type}, num_labels: {num_labels}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

simplified/train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

simplified/validation-00000-of-00001.par(…):   0%|          | 0.00/350k [00:00<?, ?B/s]

simplified/test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Loaded dataset with 43410 train samples.
Label type: single-label, num_labels: 28


In [9]:
# All supported GoEmotions schemas
RAW_LABELS = [
    "admiration","amusement","anger","annoyance","approval","caring","confusion",
    "curiosity","desire","disappointment","disapproval","disgust","embarrassment",
    "excitement","fear","gratitude","grief","joy","love","nervousness","optimism",
    "pride","realization","relief","remorse","sadness","surprise","neutral"
]

SIMPLIFIED_LABELS = [
    "admiration","amusement","anger","approval","caring","confusion",
    "curiosity","disappointment","disapproval","embarrassment","excitement","fear",
    "gratitude","joy","love","nervousness","optimism","pride","realization","sadness","neutral"
]


In [7]:
def detect_label_schema(ds, config: str):
    """
    Return the label type (single vs multi) and label names for a given config.
    """
    if config == "raw":
        return "multi-label", ds["train"].features["labels"].feature.names #RAW_LABELS
    elif config == "simplified":
        return "single-label", ds["train"].features["labels"].feature.names #SIMPLIFIED_LABELS
    else:
        raise ValueError(f"Unsupported config: {config}")


In [11]:
# === Ensure Validation Split (fixed import) ===
from datasets import DatasetDict

def ensure_validation(ds: DatasetDict, seed: int, val_frac: float) -> DatasetDict:
    if "validation" in ds:
        return ds
    split = ds["train"].train_test_split(test_size=val_frac, seed=seed)
    return DatasetDict(
        train=split["train"],
        validation=split["test"],
        test=ds["test"] if "test" in ds else split["test"]
    )


In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
import numpy as np
import torch


model_name = "microsoft/deberta-v3-base"   # if OOM, switch to "microsoft/deberta-v3-small"
max_length = 256

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def _is_binary_vec(v):
    try:
        s = set(int(x) for x in v)
        return s.issubset({0, 1})
    except Exception:
        return False

def _to_multi_hot_from_indices(idxs, C):
    vec = [0]*C
    for i in idxs:
        if 0 <= int(i) < C:
            vec[int(i)] = 1
    return vec

def _fix_single_label(lbl):
    # scalar int
    if isinstance(lbl, (int, np.integer)):
        return int(lbl)
    # list/array cases
    if isinstance(lbl, (list, tuple, np.ndarray)):
        if len(lbl) == 1:
            return int(lbl[0])
        if len(lbl) == num_labels and _is_binary_vec(lbl):
            # one-hot -> index
            return int(np.argmax(np.array(lbl)))
        # fallback: take first element as class id
        return int(lbl[0])
    # last resort
    return int(lbl)

def _fix_multi_label(lbl):
    # already a length-C binary vector
    if isinstance(lbl, (list, tuple, np.ndarray)):
        if len(lbl) == num_labels and _is_binary_vec(lbl):
            return [int(x) for x in lbl]
        # list of label indices -> multi-hot
        if all(isinstance(x, (int, np.integer)) for x in lbl) and not _is_binary_vec(lbl):
            return _to_multi_hot_from_indices(lbl, num_labels)
        # pad/clip to C
        v = list(lbl)
        if len(v) < num_labels:
            v = v + [0]*(num_labels - len(v))
        elif len(v) > num_labels:
            v = v[:num_labels]
        # force binary
        return [1 if int(x) != 0 else 0 for x in v]
    # scalar class id -> one-hot with that index
    if isinstance(lbl, (int, np.integer)):
        v = [0]*num_labels
        if 0 <= int(lbl) < num_labels:
            v[int(lbl)] = 1
        return v
    # last resort: empty vector
    return [0]*num_labels

def _tok_and_fix(batch):
    texts = batch["text"]
    enc = tokenizer(texts, truncation=True, max_length=max_length)
    fixed = []
    for lbl in batch["labels"]:
        if label_type == "single-label":
            fixed.append(_fix_single_label(lbl))
        else:
            fixed.append(_fix_multi_label(lbl))
    enc["labels"] = fixed
    return enc

# remove everything except what we create
remove_cols = [c for c in raw_dataset["train"].column_names if c not in ("text","labels")]

encoded = raw_dataset.map(
    _tok_and_fix,
    batched=True,
    remove_columns=remove_cols
)

collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=8 if torch.cuda.is_available() else None
)

from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type=("multi_label_classification" if label_type == "multi-label" else "single_label_classification"),
    id2label={i: n for i, n in enumerate(label_names)},
    label2id={n: i for i, n in enumerate(label_names)},
)


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
problem_type = "multi_label_classification" if label_type == "multi-label" else "single_label_classification"

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type=problem_type,
    id2label={i: n for i, n in enumerate(label_names)},
    label2id={n: i for i, n in enumerate(label_names)},
)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# === TrainingArguments setup ===
import torch
from transformers import TrainingArguments

output_dir = "./out_debv3"
seed = 42  # keep your previous seed if you had one

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    num_train_epochs=8,
    learning_rate=2e-5,
    weight_decay=0.05,
    warmup_ratio=0.06,
    lr_scheduler_type="cosine",
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    seed=seed,
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    do_train=True,
    do_eval=True,
    load_best_model_at_end=False
)


In [19]:
# === Cell: Metrics Function ===
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # handle multi-label vs single-label
    if label_type == "multi-label":
        probs = 1 / (1 + np.exp(-logits))
        preds = (probs >= 0.5).astype(int)
    else:
        preds = np.argmax(logits, axis=1)
        if labels.ndim > 1:
            labels = np.argmax(labels, axis=1)

    micro = f1_score(labels, preds, average="micro", zero_division=0)
    macro = f1_score(labels, preds, average="macro", zero_division=0)
    weighted = f1_score(labels, preds, average="weighted", zero_division=0)
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, average="macro", zero_division=0)
    rec = recall_score(labels, preds, average="macro", zero_division=0)

    return {
        "accuracy": acc,
        "f1_micro": micro,
        "f1_macro": macro,
        "f1_weighted": weighted,
        "precision": prec,
        "recall": rec,
    }


In [21]:
# ==== PTQ helpers (define ONCE) ====
import os
import contextlib, torch.nn as nn

# Stable CPU threading for repeatable latency
torch.set_num_threads(max(1, (os.cpu_count() or 4)//2))

def _is_quantized(m: torch.nn.Module) -> bool:
    # Dynamic quantized Linear modules are CPU-only
    return any("Quantized" in mod.__class__.__name__ for mod in m.modules())

def quantize_dynamic_linear(fp32_model: nn.Module) -> nn.Module:
    return torch.quantization.quantize_dynamic(fp32_model, {nn.Linear}, dtype=torch.qint8)

def _sigmoid(x): return 1.0 / (1.0 + np.exp(-x))
def _binarize(logits, thr=0.5): return (_sigmoid(logits) >= thr).astype(int)

def _to_multihot(item, num_labels: int) -> np.ndarray:
    """Convert one label item to a multi-hot vector of shape (num_labels,)."""
    if item is None:
        return np.zeros(num_labels, dtype=np.float32)

    # List/tuple/array
    if isinstance(item, (list, tuple, np.ndarray)):
        arr = np.array(item, dtype=np.float32).reshape(-1)
        # Case A: already a multi-hot vector of correct length
        if arr.size == num_labels and np.all((arr == 0) | (arr == 1)):
            return arr
        # Case B: treat as list of indices
        vec = np.zeros(num_labels, dtype=np.float32)
        for j in item:
            ji = int(j)
            if 0 <= ji < num_labels:
                vec[ji] = 1.0
        return vec

    # Scalar -> treat as single index
    ji = int(item)
    vec = np.zeros(num_labels, dtype=np.float32)
    if 0 <= ji < num_labels:
        vec[ji] = 1.0
    return vec


def evaluate_on_cpu(model, tokenizer, ds, split="test", label_type="single-label",
                    num_labels=None, label_names=None,
                    max_len=128, batch_size=32, threshold=0.5):
    """
    Pure-CPU eval used for FP32 vs INT8 PTQ comparisons.
    Accepts labels as ints, list of indices, or multi-hot vectors.
    """
    import time, torch
    from sklearn.metrics import f1_score, classification_report

    device = "cpu"
    model.eval().to(device)

    texts  = ds[split]["text"]
    labels = np.asarray(ds[split]["labels"], dtype=object)

    if label_type == "single-label":
        # ints only
        y_true = np.array([int(l[0]) if isinstance(l, list) else int(l) for l in labels], dtype=int)
    else:
        assert num_labels is not None, "Provide num_labels for multi-label."
        # accept multi-hot OR index-lists
        y_true = np.vstack([_to_multihot(l, num_labels) for l in labels]).astype(np.float32)

    # Warmup (not timed)
    with torch.no_grad():
        w = tokenizer(["warmup"], return_tensors="pt", padding=True, truncation=True, max_length=max_len)
        w = {k: v.to(device) for k, v in w.items()}
        _ = model(**w)

    # Timed forward passes
    logits_all, t0 = [], time.time()
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_len)
            enc = {k: v.to(device) for k, v in enc.items()}
            out = model(**enc)
            logits_all.append(out.logits.detach().cpu().numpy())
    latency_sec = time.time() - t0

    logits = np.vstack(logits_all)
    if label_type == "single-label":
        y_pred = np.argmax(logits, axis=1)
    else:
        # thresholding for multilabel
        y_pred = (1.0 / (1.0 + np.exp(-logits)) >= threshold).astype(int)

    rep = classification_report(
        y_true, y_pred, target_names=(label_names or None), zero_division=0, output_dict=True
    )
    return {
        "f1_micro": float(f1_score(y_true, y_pred, average="micro",    zero_division=0)),
        "f1_macro": float(f1_score(y_true, y_pred, average="macro",    zero_division=0)),
        "f1_weighted": float(f1_score(y_true, y_pred, average="weighted", zero_division=0)),
        "latency_sec": float(latency_sec),
        "report": rep,
        "n_examples": int(len(texts)),
    }

def measure_latency(
    model,
    tokenizer,
    *,
    device: str = "cpu",
    max_length: int = 128,
    batch_size: int = 32,
    iters: int = 30,
    warmup: int = 5,
    autocast: bool = False,
    amp_dtype: str = "float16",
) -> float:
    """
    Safe latency timer:
    - Moves model to the chosen device
    - Pre-encodes once (tokenization not timed)
    - Syncs CUDA for correct timings
    - Forces CPU if model is quantized
    """
    model.eval()

    if _is_quantized(model):
        device = "cpu"

    model = model.to(device)

    sents = ["This is a sample sentence about feelings."] * batch_size
    enc = tokenizer(sents, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    enc = {k: v.to(device) for k, v in enc.items()}

    if autocast and device == "cuda":
        cast_ctx = torch.autocast(device_type="cuda",
                                  dtype=(torch.float16 if amp_dtype=="float16" else torch.bfloat16))
    else:
        cast_ctx = contextlib.nullcontext()

    with torch.no_grad(), cast_ctx:
        if device == "cuda":
            torch.cuda.synchronize()
        for _ in range(warmup):
            _ = model(**enc)
        if device == "cuda":
            torch.cuda.synchronize()

        start = time.time()
        for _ in range(iters):
            _ = model(**enc)
        if device == "cuda":
            torch.cuda.synchronize()
        end = time.time()

    return (end - start) * 1000.0 / iters

In [22]:
collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8 if torch.cuda.is_available() else None)


In [23]:
import numpy as np, torch
from transformers import Trainer

def _stack_or_array(lst):
    try:
        arr = np.array(lst)
        if arr.dtype == object:
            arr = np.stack(lst)
    except Exception:
        arr = np.array(lst, dtype=object)
        try:
            arr = np.stack(lst)
        except Exception:
            pass
    return arr

def _is_binary_matrix(a, tol=1e-6):
    return a.ndim == 2 and np.all((a == 0) | (a == 1))

def _is_one_hot(a, tol=1e-6):
    return _is_binary_matrix(a, tol) and np.allclose(a.sum(axis=1), 1.0, atol=tol)

# 1) Inspect train labels and normalize
train_labels_list = encoded["train"]["labels"]
arr = _stack_or_array(train_labels_list)

# 2) Decide actual/effective label type for training in this cell
if arr.ndim == 1 and np.issubdtype(arr.dtype, np.integer):
    EFFECTIVE_LABEL_TYPE = "single-label"
    y_scalar = arr.astype(int)
elif arr.ndim == 2 and arr.shape[1] == num_labels and _is_one_hot(arr):
    EFFECTIVE_LABEL_TYPE = "single-label"
    y_scalar = np.argmax(arr, axis=1).astype(int)  # convert one-hot -> class index
elif arr.ndim == 2 and arr.shape[1] == num_labels:
    EFFECTIVE_LABEL_TYPE = "multi-label"
    Y_multi = arr.astype(np.float32)               # assume multi-hot/binary rows (fix later if not)
else:
    # Fallback: try to coerce lists-of-lists
    try:
        arr = np.stack(train_labels_list)
        if arr.ndim == 2 and arr.shape[1] == num_labels and _is_one_hot(arr):
            EFFECTIVE_LABEL_TYPE = "single-label"
            y_scalar = np.argmax(arr, axis=1).astype(int)
        else:
            EFFECTIVE_LABEL_TYPE = "multi-label"
            Y_multi = arr.astype(np.float32)
    except Exception:
        # Last resort: treat as multi-label with variable rows — pad/trim to num_labels
        EFFECTIVE_LABEL_TYPE = "multi-label"
        fixed = []
        for row in train_labels_list:
            row = list(row)
            if len(row) < num_labels:
                row = row + [0]*(num_labels - len(row))
            elif len(row) > num_labels:
                row = row[:num_labels]
            fixed.append(row)
        Y_multi = np.array(fixed, dtype=np.float32)

# 3) Build class weights
def effective_num_weights(counts, beta=0.999):
    counts = np.asarray(counts, dtype=np.float64)
    en = (1.0 - np.power(beta, counts)) / (1.0 - beta + 1e-12)
    w = 1.0 / np.maximum(en, 1e-12)
    return (w / (w.mean() + 1e-12)).astype(np.float32)

if EFFECTIVE_LABEL_TYPE == "single-label":
    counts = np.bincount(y_scalar, minlength=num_labels)
    cb_weights = torch.tensor(effective_num_weights(counts, beta=0.999), dtype=torch.float32)
    pos_weight = None
else:
    # ensure binary in [0,1] for stability (clip)
    Y_multi = np.clip(Y_multi, 0.0, 1.0)
    positives = Y_multi.sum(axis=0) + 1e-6
    negatives = Y_multi.shape[0] - positives + 1e-6
    pos_weight = torch.tensor(negatives / positives, dtype=torch.float32)
    cb_weights = torch.tensor(effective_num_weights(positives, beta=0.999), dtype=torch.float32)

# 4) Trainer with Class-Balanced Focal Loss
gamma = 2.0
_eps = 1e-6

class CBFocalTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        if EFFECTIVE_LABEL_TYPE == "single-label":
            if labels.dtype != torch.long:
                labels = labels.long()
            ce = torch.nn.functional.cross_entropy(
                logits, labels, reduction="none", weight=cb_weights.to(logits.device)
            )
            pt = torch.softmax(logits, dim=-1).gather(1, labels.unsqueeze(1)).squeeze(1)
            pt = pt.clamp(_eps, 1.0 - _eps)
            loss = ((1.0 - pt) ** gamma) * ce
            loss = loss.mean()
        else:
            labels = labels.to(logits.dtype)
            bce = torch.nn.functional.binary_cross_entropy_with_logits(
                logits, labels, reduction="none", pos_weight=pos_weight.to(logits.device)
            )
            p = torch.sigmoid(logits).clamp(_eps, 1.0 - _eps)
            pt = torch.where(labels.bool(), p, 1.0 - p)
            w = cb_weights.to(logits.device)  # [C]
            loss = ((1.0 - pt) ** gamma) * bce * w
            loss = loss.mean()

        return (loss, outputs) if return_outputs else loss

trainer = CBFocalTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)


  trainer = CBFocalTrainer(


In [24]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [25]:
train_metrics = trainer.train()
val_metrics = trainer.evaluate()
test_metrics = trainer.evaluate(encoded["test"]) if "test" in encoded else {}

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Step,Training Loss
50,1.2279
100,1.2704
150,1.1861
200,1.0966
250,1.0754
300,1.0255
350,1.0001
400,1.0002
450,0.9002
500,0.8374


In [26]:
output_dir = "./out_debv3"


Restore the model since training crashed however it needs a retrain this is but a test


In [None]:
# === Restore trained model from latest checkpoint (no retrain) ===
import os, glob, torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification

# assumes these already exist in your notebook:
# - output_dir (e.g., "./out_debv3")
# - model_name (e.g., "microsoft/deberta-v3-base" or "-small")
# - num_labels, label_names, label_type
assert "output_dir" in globals()
assert "model_name" in globals()
assert "num_labels" in globals()
assert "label_names" in globals()
assert "label_type" in globals()

def latest_checkpoint_dir(base_dir):
    paths = sorted(glob.glob(os.path.join(base_dir, "checkpoint-*")),
                   key=lambda p: int(p.rsplit("-", 1)[-1]) if "-" in p else -1)
    return paths[-1] if paths else None

load_path = latest_checkpoint_dir(output_dir) or output_dir
print(f"Loading from: {load_path}")

problem_type = "multi_label_classification" if label_type == "multi-label" else "single_label_classification"
id2label = {i: n for i, n in enumerate(label_names)}
label2id = {n: i for i, n in enumerate(label_names)}

# 1) Try standard HF load (works when config.json has model_type)
try:
    model = AutoModelForSequenceClassification.from_pretrained(load_path)
    print("Loaded model via AutoModelForSequenceClassification.from_pretrained(load_path)")
except Exception as e:
    print("Auto load failed, reconstructing from base config:", repr(e))
    # 2) Rebuild config from the original model, then load weights file manually
    config = AutoConfig.from_pretrained(
        model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
        problem_type=problem_type,
    )
    model = AutoModelForSequenceClassification.from_config(config)

    cand_files = [
        os.path.join(load_path, "pytorch_model.bin"),
        os.path.join(load_path, "model.safetensors"),
        os.path.join(output_dir, "pytorch_model.bin"),
    ]
    weights_path = next((p for p in cand_files if os.path.exists(p)), None)
    if weights_path is None:
        raise FileNotFoundError(
            f"No weights file found under {load_path} or {output_dir}. "
            "Expected pytorch_model.bin or model.safetensors."
        )
    if weights_path.endswith(".bin"):
        state = torch.load(weights_path, map_location="cpu")
        missing, unexpected = model.load_state_dict(state, strict=False)
        print("Loaded .bin with missing keys:", missing, "| unexpected keys:", unexpected)
    else:
        from safetensors.torch import load_file as load_safetensors
        state = load_safetensors(weights_path)
        missing, unexpected = model.load_state_dict(state, strict=False)
        print("Loaded .safetensors with missing keys:", missing, "| unexpected keys:", unexpected)

# tokenizer: prefer checkpoint, else fall back to base model
try:
    tokenizer = AutoTokenizer.from_pretrained(load_path, use_fast=True)
    print("Loaded tokenizer from checkpoint.")
except Exception:
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    print("Loaded tokenizer from base model.")

# re-create Trainer (no train call)
trainer = CBFocalTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded.get("train"),
    eval_dataset=encoded.get("validation"),
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

# Optional: evaluate immediately
val_metrics = trainer.evaluate()
test_metrics = trainer.evaluate(encoded["test"]) if "test" in encoded else {}
print("Val:", val_metrics)
print("Test:", test_metrics)


Loading from: ./out_debv3/checkpoint-10856
Loaded model via AutoModelForSequenceClassification.from_pretrained(load_path)
Loaded tokenizer from checkpoint.


  trainer = CBFocalTrainer(


Val: {'eval_loss': 0.5167628526687622, 'eval_model_preparation_time': 0.0081, 'eval_accuracy': 0.5597124953925544, 'eval_f1_micro': 0.5597124953925544, 'eval_f1_macro': 0.48681600694042576, 'eval_f1_weighted': 0.5579542592757849, 'eval_precision': 0.4732015724344865, 'eval_recall': 0.512696995750624, 'eval_runtime': 15.0901, 'eval_samples_per_second': 359.574, 'eval_steps_per_second': 11.266}
Test: {'eval_loss': 0.5214838981628418, 'eval_model_preparation_time': 0.0081, 'eval_accuracy': 0.5590565690068178, 'eval_f1_micro': 0.5590565690068178, 'eval_f1_macro': 0.4890273667300305, 'eval_f1_weighted': 0.5580314263623689, 'eval_precision': 0.4691220847487056, 'eval_recall': 0.5254434289338668, 'eval_runtime': 14.3502, 'eval_samples_per_second': 378.182, 'eval_steps_per_second': 11.846}


In [None]:
# === Save metrics.json safely ===
import os, json

dataset_name = "goemotions"
dataset_config = "raw"   # or "simplified" depending on which run this is

os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, "metrics.json"), "w") as f:
    json.dump({
        "train": getattr(train_metrics, "metrics", {}),
        "val": val_metrics,
        "test": test_metrics,
        "label_names": list(label_names),
        "args": {
            "model_name": model_name,
            "dataset_name": dataset_name,
            "dataset_config": dataset_config,
            "max_length": 128,
            "batch_size": 32,
            "epochs": 6,
            "lr": 2e-5,
            "weight_decay": 0.01,
            "seed": seed,
        },
    }, f, indent=2)

print("Wrote", os.path.join(output_dir, "metrics.json"))


Wrote ./out_debv3/metrics.json


Resume as normal from here after model is trained


In [27]:
# === Per-class threshold tuning (Raw / multi-label) ===
import numpy as np, torch
from tqdm import trange

if label_type != "multi-label":
    THRESHOLDS = 0.5
else:
    model.eval(); torch.set_grad_enabled(False)

    def _collect_logits(ds):
        outs, labs = [], []
        for i in trange(0, len(ds), 64, desc="Collect val logits"):
            batch = ds[i:i+64]
            inputs = {k: torch.tensor(v) for k,v in batch.items() if k in ("input_ids","attention_mask","token_type_ids")}
            labs.append(torch.tensor(batch["labels"]).float())
            outs.append(model(**inputs).logits.cpu())
        return torch.cat(outs,0).numpy(), torch.cat(labs,0).numpy()

    val_logits, val_y = _collect_logits(encoded["validation"])
    val_p = 1/(1+np.exp(-val_logits))

    grid = np.linspace(0.05, 0.95, 37)
    best = np.full(val_p.shape[1], 0.5, dtype=np.float32)
    for c in range(val_p.shape[1]):
        best_f1, best_t = -1, 0.5
        yc = val_y[:, c].astype(int)
        for t in grid:
            pred = (val_p[:, c] >= t).astype(int)
            tp = (pred & (yc==1)).sum(); fp = (pred & (yc==0)).sum(); fn = ((1-pred) & (yc==1)).sum()
            prec = tp / max(tp+fp, 1e-9); rec = tp / max(tp+fn, 1e-9)
            f1 = 2*prec*rec / max(prec+rec, 1e-9)
            if f1 > best_f1:
                best_f1, best_t = f1, t
        best[c] = best_t
    THRESHOLDS = best

print("Thresholds ready:", "vector" if isinstance(THRESHOLDS, np.ndarray) else THRESHOLDS)


Thresholds ready: 0.5


In [29]:
# === PTQ (FP32 vs INT8) using tuned thresholds; DataLoader + collator padding ===
import os, json, time, copy
import torch
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import f1_score

assert "collator" in globals(), "Define `collator` before running PTQ."
assert "encoded" in globals() and "model" in globals() and "tokenizer" in globals()

def quantize_dynamic_linear(m):
    q = copy.deepcopy(m).cpu()
    q.eval()
    return torch.quantization.quantize_dynamic(q, {torch.nn.Linear}, dtype=torch.qint8)

quant_dir = os.path.join(output_dir, "quant_results")
os.makedirs(quant_dir, exist_ok=True)

split_for_ptq = "test" if "test" in encoded else "validation"
thr = THRESHOLDS if ('THRESHOLDS' in globals()) else (0.5 if label_type == "multi-label" else 0.5)
BATCH_SIZE = 32

def _make_loader(ds, batch_size):
    keep = ["input_ids", "attention_mask", "labels"]
    if "token_type_ids" in ds.column_names:
        keep.append("token_type_ids")
    ds2 = ds.remove_columns([c for c in ds.column_names if c not in keep])
    return DataLoader(ds2, batch_size=batch_size, shuffle=False, collate_fn=collator)

@torch.inference_mode()
def evaluate_on_cpu(model_, ds, label_type, threshold, warmup_batches=3):
    model_ = model_.cpu()
    model_.eval()
    loader = _make_loader(ds, BATCH_SIZE)

    it = iter(loader)
    for _ in range(min(warmup_batches, len(loader))):
        try:
            batch = next(it)
        except StopIteration:
            break
        _ = model_(**{k: v for k, v in batch.items() if k != "labels"})

    t0 = time.perf_counter()
    all_logits, all_labels = [], []
    for batch in loader:
        labels = batch.pop("labels")
        outputs = model_(**batch)
        all_logits.append(outputs.logits.cpu())
        all_labels.append(labels.cpu())
    t1 = time.perf_counter()

    logits = torch.cat(all_logits, dim=0).numpy()
    labels = torch.cat(all_labels, dim=0).numpy()

    if label_type == "single-label":
        if labels.ndim > 1:  # one-hot -> indices
            labels = labels.argmax(axis=1)
        labels = labels.astype(np.int64, copy=False)
        preds = logits.argmax(axis=1)
        f1m = f1_score(labels, preds, average="macro")
    else:
        probs = 1.0 / (1.0 + np.exp(-logits))
        thr_vec = threshold
        if np.isscalar(thr_vec):
            thr_vec = np.full(probs.shape[1], thr_vec, dtype=np.float32)
        preds = (probs >= thr_vec).astype(int)
        if labels.ndim == 1:  # indices -> binary matrix
            lab_bin = np.zeros((labels.shape[0], probs.shape[1]), dtype=int)
            lab_bin[np.arange(labels.shape[0]), labels.astype(int)] = 1
            labels = lab_bin
        f1m = f1_score(labels, preds, average="macro")

    return {"f1_macro": float(f1m), "latency_sec": float(t1 - t0)}

ds_eval = encoded[split_for_ptq]

fp32_metrics = evaluate_on_cpu(model, ds_eval, label_type, thr, warmup_batches=3)

qmodel = quantize_dynamic_linear(model)
int8_metrics = evaluate_on_cpu(qmodel, ds_eval, label_type, thr, warmup_batches=3)

ptq_payload = {
    "model_id_or_path": model_name,
    "split": split_for_ptq,
    "threshold": (thr.tolist() if hasattr(thr, "tolist") else thr),
    "batch_size": BATCH_SIZE,
    "fp32": fp32_metrics,
    "int8_dynamic": int8_metrics
}
with open(os.path.join(quant_dir, "ptq_eval.json"), "w") as f:
    json.dump(ptq_payload, f, indent=2)

speedup = fp32_metrics["latency_sec"] / max(int8_metrics["latency_sec"], 1e-9)
print(f"Saved: {os.path.join(quant_dir, 'ptq_eval.json')}")
print(f"FP32 macro={fp32_metrics['f1_macro']:.3f}  INT8 macro={int8_metrics['f1_macro']:.3f}  Speedup×={speedup:.2f}")


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  return torch.quantization.quantize_dynamic(q, {torch.nn.Linear}, dtype=torch.qint8)


Saved: ./out_debv3/quant_results/ptq_eval.json
FP32 macro=0.489  INT8 macro=0.019  Speedup×=1.28


In [30]:
def count_trainable_parameters(model: torch.nn.Module) -> int:
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# ---- Efficiency snapshot (safe) ----
params = count_trainable_parameters(model)

# Always measure CPU latency
latency_cpu_ms = measure_latency(model, tokenizer, device="cpu")

# Optionally measure GPU latency if available
latency_gpu_ms = None
if torch.cuda.is_available():
    # autocast=False for apples-to-apples unless you explicitly want AMP
    latency_gpu_ms = measure_latency(model, tokenizer, device="cuda", autocast=False)

efficiency = {
    "trainable_params": int(params),
    "avg_latency_ms_per_batch32_cpu": float(latency_cpu_ms),
}

if latency_gpu_ms is not None:
    efficiency["avg_latency_ms_per_batch32_gpu"] = float(latency_gpu_ms)

with open(os.path.join(output_dir, "efficiency_snapshot.json"), "w") as f:
    json.dump(efficiency, f, indent=2)

print("All done. Results saved to:", output_dir)
print("Efficiency snapshot:", efficiency)

All done. Results saved to: ./out_debv3
Efficiency snapshot: {'trainable_params': 184443676, 'avg_latency_ms_per_batch32_cpu': 1819.301684697469, 'avg_latency_ms_per_batch32_gpu': 30.704402923583984}


In [32]:
# === Save concise manifest for report ===
import os, json

# Define dataset metadata manually since it's missing upstream
dataset_name = "goemotions"
dataset_config = "raw"  # or "simplified" if you're running that variant

manifest = {
    "model_id": model_name if "model_name" in globals() else "unknown_model",
    "output_dir": output_dir,
    "dataset_name": dataset_name,
    "dataset_config": dataset_config,
    "label_type": label_type if "label_type" in globals() else "unknown",
    "num_labels": int(num_labels) if "num_labels" in globals() else 0,
    "label_names": list(label_names) if "label_names" in globals() else [],
    "train_args": {
        "epochs": 3,
        "batch_size": 32,
        "lr": 5e-5,
        "weight_decay": 0.01,
        "seed": seed if "seed" in globals() else 42,
    },
    "ptq": {
        "baseline_dir": os.path.join(output_dir, "quant_baseline"),
        "trained_dir":  os.path.join(output_dir, "quant_results"),
        "split": "test" if "test" in encoded else "validation",
        "threshold": 0.5
    }
}

with open(os.path.join(output_dir, "run_manifest.json"), "w") as f:
    json.dump(manifest, f, indent=2)

print(" Wrote run_manifest.json to", os.path.join(output_dir, "run_manifest.json"))


 Wrote run_manifest.json to ./out_debv3/run_manifest.json


In [34]:
# === Show PTQ (FP32 vs INT8) summary ===
import os, json
import pandas as pd  # <-- this was missing

ptq_path = os.path.join(output_dir, "quant_results", "ptq_eval.json")

if os.path.exists(ptq_path):
    d = json.load(open(ptq_path))
    df = pd.DataFrame([{
        "split": d.get("split", ""),
        "thr": d.get("threshold", ""),
        "FP32 macro": d["fp32"].get("f1_macro", None),
        "INT8 macro": d["int8_dynamic"].get("f1_macro", None),
        "FP32 lat (s)": d["fp32"].get("latency_sec", None),
        "INT8 lat (s)": d["int8_dynamic"].get("latency_sec", None),
        "Speedup×": d["fp32"]["latency_sec"] / max(d["int8_dynamic"]["latency_sec"], 1e-9),
    }])
    display(df)
else:
    print("No PTQ file found at:", ptq_path)


Unnamed: 0,split,thr,FP32 macro,INT8 macro,FP32 lat (s),INT8 lat (s),Speedup×
0,test,0.5,0.488929,0.018899,802.682257,628.819995,1.27649


In [35]:
!pip -q install reportlab

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/2.0 MB[0m [31m16.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [36]:
import os, shutil
from google.colab import files

print("Upload make_report_v3.py")
uploaded = files.upload()
fname = next(iter(uploaded))
if fname != "make_report_v3.py":
    os.rename(fname, "make_report_v3.py")
print("make_report_v3.py uploaded.\n")

print("Upload simplified_models.zip and raw_models.zip")
uploaded_zips = files.upload()

for zip_name in uploaded_zips.keys():
    folder = os.path.splitext(zip_name)[0]
    shutil.unpack_archive(zip_name, folder)
    print(f"Extracted {folder}")

pdf_path = "./goemotions_raw_vs_simplified_report_v4.pdf"
print("Running report...")
!python ./make_report_v3.py --roots ./simplified_models ./raw_models --pdf_path $pdf_path

if os.path.exists(pdf_path):
    print("Report ready — click to download:")
    files.download(pdf_path)
else:
    print("Report not generated. Check folder names — expected ./simplified_models and ./raw_models to exist.")


Upload make_report_v3.py


Saving make_report_v3.py to make_report_v3.py
make_report_v3.py uploaded.

Upload simplified_models.zip and raw_models.zip


Saving raw_models.zip to raw_models.zip
Saving simplified_models.zip to simplified_models.zip
Extracted raw_models
Extracted simplified_models
Running report...
  python make_report_v3.py --roots .\simplified_models .\raw_models --pdf_path .\goemotions_raw_vs_simplified_report.pdf
Wrote PDF: ./goemotions_raw_vs_simplified_report_v4.pdf
Report ready — click to download:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>