In [None]:
# Environment setup via IPython shell escapes. The existing torch / bitsandbytes /
# triton builds are removed first, then pinned versions are installed in order.
!pip uninstall -y torch torchvision torchaudio bitsandbytes triton

# Torch 2.2.2 + cu121 (pulled from the PyTorch cu121 wheel index, not the default index)
!pip install --index-url https://download.pytorch.org/whl/cu121 \
    torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2

# Matching bitsandbytes (ships with libbitsandbytes_cuda121.so)
!pip install bitsandbytes==0.43.1 triton==2.2.0

# Hugging Face stack
# NOTE(review): `evaluate` and `scikit-learn` are the only packages here without a
# version pin — consider pinning them too so a fresh run resolves the same versions.
!pip install transformers==4.43.3 datasets==2.20.0 accelerate==0.33.0 \
    peft==0.12.0 trl==0.9.6 evaluate scikit-learn

Found existing installation: torch 2.2.2+cu121
Uninstalling torch-2.2.2+cu121:
  Successfully uninstalled torch-2.2.2+cu121
Found existing installation: torchvision 0.17.2+cu121
Uninstalling torchvision-0.17.2+cu121:
  Successfully uninstalled torchvision-0.17.2+cu121
Found existing installation: torchaudio 2.2.2+cu121
Uninstalling torchaudio-2.2.2+cu121:
  Successfully uninstalled torchaudio-2.2.2+cu121
Found existing installation: bitsandbytes 0.43.1
Uninstalling bitsandbytes-0.43.1:
  Successfully uninstalled bitsandbytes-0.43.1
Found existing installation: triton 2.2.0
Uninstalling triton-2.2.0:
  Successfully uninstalled triton-2.2.0
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.2.2
  Using cached https://download.pytorch.org/whl/cu121/torch-2.2.2%2Bcu121-cp312-cp312-linux_x86_64.whl (757.2 MB)
Collecting torchvision==0.17.2
  Using cached https://download.pytorch.org/whl/cu121/torchvision-0.17.2%2Bcu121-cp312-cp312-linux_x86_64.whl (7.0 MB)
Collec

In [None]:
import torch, bitsandbytes as bnb

# Report the versions that actually got imported and whether a CUDA device is visible.
print(
    "Torch:", torch.__version__,
    "CUDA:", torch.version.cuda,
    "CUDA available:", torch.cuda.is_available(),
)
print("bnb version:", bnb.__version__)

# Smoke test: run one random batch through an NF4 4-bit linear layer on the GPU.
# If bitsandbytes cannot run on this device, this is the point where it fails.
in_dim, out_dim = 128, 64
probe_layer = bnb.nn.Linear4bit(in_dim, out_dim, bias=True, quant_type="nf4").cuda()
probe_input = torch.randn(2, in_dim, device="cuda")
_ = probe_layer(probe_input)

print("✅ 4-bit kernel executed on CUDA")


Torch: 2.2.2+cu121 CUDA: 12.1 CUDA available: True
bnb version: 0.43.1
✅ 4-bit kernel executed on CUDA


In [None]:
# NOTE(review): this repeats, with identical pins, the Hugging Face stack install
# that already runs at the end of the first setup cell. It looks redundant on a
# Run-All; presumably kept so the stack can be reinstalled on its own — confirm.
!pip install transformers==4.43.3 datasets==2.20.0 accelerate==0.33.0 \
    peft==0.12.0 trl==0.9.6 evaluate scikit-learn




In [None]:
import os, random, numpy as np, torch
from datasets import load_dataset
from sklearn.metrics import f1_score, hamming_loss

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    BitsAndBytesConfig, TrainingArguments, Trainer, DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model

In [None]:
# ---------------------------------------------------------------------------
# Run configuration: every tunable used by the cells below lives here.
# ---------------------------------------------------------------------------
SEED = 42                             # seed applied to random, numpy and torch below
MODEL_ID = "bert-base-uncased"        # checkpoint used for tokenizer and model
OUTPUT_DIR = "bert-goemotions-qlora"  # where the Trainer writes checkpoints/adapter
EPOCHS = 1
BATCH_SIZE = 8                        # used for both train and eval batches
GR_ACCUM = 1                          # gradient accumulation steps
LR = 2e-5
MAX_LEN = 128                         # tokenizer max_length
NUM_LABELS = 28  # GoEmotions

# Seed each RNG on its own line so it is obvious which libraries are covered.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Kept as the final statement so the cell produces no displayed value.
os.environ["WANDB_DISABLED"] = "true"   # avoid hangs

In [None]:
# Load the fast tokenizer for MODEL_ID. The same `tok` object is later used to
# tokenize the dataset, to build the padding collator, and is saved with the model.
tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Fetch GoEmotions from the Hugging Face Hub; the cell output shows train,
# validation and test splits being generated.
ds = load_dataset("go_emotions")

def one_hot_labels(example):
    """Turn the label-id list of one example into a multi-hot float vector.

    Args:
        example: mapping with a ``"labels"`` entry holding an iterable of
            integer class ids (each must index into ``NUM_LABELS`` slots).

    Returns:
        The same ``example`` object, mutated in place so that ``"labels"`` is a
        Python list of ``NUM_LABELS`` floats: 1.0 at every listed id, 0.0 elsewhere.
    """
    # Built as float32 on purpose (not int) before converting to a plain list.
    multi_hot = np.zeros(NUM_LABELS, dtype=np.float32)
    active_ids = list(example["labels"])
    if active_ids:
        # One vectorised write; repeated ids simply set the same slot again.
        multi_hot[active_ids] = 1.0
    example["labels"] = multi_hot.tolist()
    return example

# Apply the multi-hot conversion to every example of every split (non-batched map).
# NOTE(review): the collator defined later re-casts labels to float32, which
# suggests the stored column may not keep a float dtype after this map — confirm.
ds = ds.map(one_hot_labels)

def tokenize(batch):
    """Tokenize a batch of texts to a fixed length.

    Args:
        batch: mapping whose ``"text"`` entry is passed to the tokenizer as-is.

    Returns:
        Whatever ``tok`` returns when asked to truncate and pad each entry to
        exactly ``MAX_LEN`` tokens.
    """
    texts = batch["text"]
    return tok(
        texts,
        padding="max_length",   # pad every row up to MAX_LEN
        truncation=True,        # and cut longer rows down to it
        max_length=MAX_LEN,
    )

# Tokenize all splits in batches.
ds = ds.map(tokenize, batched=True)

# Keep only the columns the Trainer consumes; tensor conversion is left to the collator.
wanted_cols = ("input_ids", "attention_mask", "labels")
unwanted_cols = [name for name in ds["train"].column_names if name not in wanted_cols]
ds = ds.remove_columns(unwanted_cols)

# Hold out 5% of the *train* split (seeded) as the validation set used for training-time eval.
split = ds["train"].train_test_split(test_size=0.05, seed=SEED)
train_ds = split["train"]
val_ds = split["test"]


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/350k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [None]:
# Build the 4-bit quantized BERT classifier and wrap it with LoRA adapters.
#
# --- hard reset the old model to avoid lingering fp16 params
# Makes the cell safe to re-run: drop any previous `model`, then release memory.
import gc, torch, os
try:
    del model
except NameError:
    # First run on a fresh kernel: there is no `model` to delete yet.
    pass
gc.collect()
torch.cuda.empty_cache()

os.environ["ACCELERATE_MIXED_PRECISION"] = "no"   # be explicit: no fp16/bf16

from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# NOTE(review): these re-declare the config-cell constants with the same values;
# if the config cell is edited, these two lines must be kept in sync by hand.
NUM_LABELS = 28
MODEL_ID   = "bert-base-uncased"

# 4-bit base, but do all math in FP32 for stability on T4
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float32,   # <<< FP32 compute
    bnb_4bit_use_double_quant=True,
)

# Multi-label head: problem_type is set so the model treats labels as independent targets.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification",
    quantization_config=bnb_config,
    torch_dtype=torch.float32,              # <<< head & adapters in FP32
)

# freeze encoder; train LoRA + classifier
for p in model.bert.parameters():
    p.requires_grad = False

# LoRA settings: rank 16, alpha 32, 5% dropout, no bias training.
# NOTE(review): `target_modules` selects layers by name; presumably "dense" also
# matches non-attention dense layers in BERT — confirm this breadth is intended.
peft_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
    target_modules=["query","key","value","dense"],
    task_type="SEQ_CLS",
)
model = get_peft_model(model, peft_cfg)
# Prints the trainable/total parameter counts (see the cell output).
model.print_trainable_parameters()


`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 2,700,316 || all params: 112,204,088 || trainable%: 2.4066


In [None]:
class FloatLabelCollator(DataCollatorWithPadding):
    """Padding collator that guarantees float32 labels.

    Delegates padding and tensor conversion to ``DataCollatorWithPadding`` and
    then casts the ``labels`` tensor to ``torch.float32``, the dtype expected by
    the multi-label (BCE-with-logits) loss. Batches without a ``labels`` entry
    are returned unchanged.
    """

    def __call__(self, features):
        """Collate ``features`` into a padded batch.

        Args:
            features: list of per-example dicts, as accepted by
                ``DataCollatorWithPadding``.

        Returns:
            The batch produced by the parent collator, with ``batch["labels"]``
            cast to float32 when that key is present.
        """
        # cast lists to tensors with correct dtype
        batch = super().__call__(features)
        # Unlabeled batches (e.g. prediction on raw text) carry no "labels" key;
        # indexing it unconditionally would raise KeyError.
        if "labels" in batch:
            # ensure labels are float32 for BCEWithLogits
            batch["labels"] = batch["labels"].to(torch.float32)
        return batch


collator = FloatLabelCollator(tokenizer=tok)

def compute_metrics(eval_pred):
    """Compute multi-label metrics from raw logits.

    Args:
        eval_pred: pair ``(logits, labels)``. Both are expected to be array-likes
            of shape (n_examples, n_labels); labels are 0/1 indicators that may
            arrive as floats.

    Returns:
        Dict with ``"f1_micro"``, ``"f1_macro"`` and ``"hamming_loss"``.
    """
    logits, labels = eval_pred
    # A label is predicted when its sigmoid probability exceeds 0.5.
    preds = (torch.sigmoid(torch.tensor(logits)) > 0.5).int().numpy()
    # Gold labels come in as float 0.0/1.0; hand sklearn an integer indicator
    # matrix with the same dtype family as `preds`.
    labels = np.asarray(labels).astype(int)
    return {
        "f1_micro":      f1_score(labels, preds, average="micro", zero_division=0),
        "f1_macro":      f1_score(labels, preds, average="macro", zero_division=0),
        "hamming_loss":  hamming_loss(labels, preds),
    }


In [None]:
# Sanity check: collate the first two training rows and show the resulting dtypes.
sample_rows = [train_ds[idx] for idx in range(2)]
sample_batch = collator(sample_rows)
# should be torch.int64 and torch.float32
print(sample_batch["input_ids"].dtype, sample_batch["labels"].dtype)


torch.int64 torch.float32


In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate and checkpoint once per epoch, full precision only.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # `evaluation_strategy` is the deprecated spelling in the pinned
    # transformers 4.43.x; `eval_strategy` is its replacement.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GR_ACCUM,
    num_train_epochs=EPOCHS,
    logging_steps=20,
    report_to="none",           # no external experiment trackers
    fp16=False,                 # stay off fp16
    bf16=False,                 # T4 can't bf16
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tok,
    data_collator=collator,     # keeps labels float32
    compute_metrics=compute_metrics,
)

# Train, then write the final model and the tokenizer into OUTPUT_DIR.
trainer.train()
trainer.save_model(OUTPUT_DIR)
# Left as the last expression so the cell displays the saved tokenizer file paths.
tok.save_pretrained(OUTPUT_DIR)




Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,Hamming Loss,Runtime,Samples Per Second,Steps Per Second
1,0.1426,0.132272,0.230169,0.047081,0.038955,19.1416,113.418,14.21


('bert-goemotions-qlora/tokenizer_config.json',
 'bert-goemotions-qlora/special_tokens_map.json',
 'bert-goemotions-qlora/vocab.txt',
 'bert-goemotions-qlora/added_tokens.json',
 'bert-goemotions-qlora/tokenizer.json')

In [None]:
import json, os
# Store the label names next to the adapter so index -> name mapping ships with it.
label_names = ds["train"].features["labels"].feature.names  # from earlier
# Build the path from OUTPUT_DIR (not a repeated literal) so it always matches the
# directory the Trainer saved into and the one reported in the message below.
labels_path = os.path.join(OUTPUT_DIR, "labels.json")
with open(labels_path, "w") as f:
    json.dump(label_names, f, indent=2)
print(f"✅ Adapter saved to: {OUTPUT_DIR}")

✅ Adapter saved to: bert-goemotions-qlora


In [None]:
# -------------------------
# 2) Export a MERGED full model (base + LoRA merged + classifier head)
# -------------------------
# We reload the base in FP16/FP32 (NOT 4-bit) -> attach the adapter -> merge -> save.
from transformers import AutoModelForSequenceClassification
from peft import PeftModel

# Destination for the merged (adapter-free) model.
FULL_DIR = OUTPUT_DIR + "-merged"

# Step 1: load an unquantized fp16 copy of the base model; the LoRA weights are
# merged into this copy rather than into the 4-bit model used for training.
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification",
    torch_dtype=torch.float16,
)

# Step 2: attach the adapter saved in OUTPUT_DIR to that base model.
adapter_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)

# Step 3: merge the adapter weights into the base weights and drop the PEFT wrapper.
merged = adapter_model.merge_and_unload()

# Step 4: write model, tokenizer and label names side by side in FULL_DIR.
merged.save_pretrained(FULL_DIR)
tok.save_pretrained(FULL_DIR)
merged_labels_path = os.path.join(FULL_DIR, "labels.json")
with open(merged_labels_path, "w") as f:
    json.dump(label_names, f, indent=2)

print(f"✅ Merged full model saved to: {FULL_DIR}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Merged full model saved to: bert-goemotions-qlora-merged


In [None]:
# Archive both output directories; `{OUTPUT_DIR}` / `{FULL_DIR}` are filled in from
# the Python variables of the same name by IPython before the shell runs.
# NOTE(review): the recursive zip of OUTPUT_DIR also packs the Trainer's
# checkpoint-* folder (optimizer.pt, scheduler.pt, ...), as the cell output shows —
# confirm that is wanted in the "adapter" archive.
!zip -r bert-goemotions-qlora_adapter.zip {OUTPUT_DIR}
!zip -r bert-goemotions-qlora_merged.zip {FULL_DIR}

# Colab-only: trigger browser downloads of the two archives.
from google.colab import files
files.download("bert-goemotions-qlora_adapter.zip")
files.download("bert-goemotions-qlora_merged.zip")

  adding: bert-goemotions-qlora/ (stored 0%)
  adding: bert-goemotions-qlora/checkpoint-5155/ (stored 0%)
  adding: bert-goemotions-qlora/checkpoint-5155/scheduler.pt (deflated 56%)
  adding: bert-goemotions-qlora/checkpoint-5155/trainer_state.json (deflated 81%)
  adding: bert-goemotions-qlora/checkpoint-5155/adapter_model.safetensors (deflated 7%)
  adding: bert-goemotions-qlora/checkpoint-5155/vocab.txt (deflated 53%)
  adding: bert-goemotions-qlora/checkpoint-5155/adapter_config.json (deflated 52%)
  adding: bert-goemotions-qlora/checkpoint-5155/special_tokens_map.json (deflated 42%)
  adding: bert-goemotions-qlora/checkpoint-5155/rng_state.pth (deflated 25%)
  adding: bert-goemotions-qlora/checkpoint-5155/tokenizer.json (deflated 71%)
  adding: bert-goemotions-qlora/checkpoint-5155/README.md (deflated 66%)
  adding: bert-goemotions-qlora/checkpoint-5155/tokenizer_config.json (deflated 76%)
  adding: bert-goemotions-qlora/checkpoint-5155/optimizer.pt (deflated 8%)
  adding: bert-go

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import json, torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Directory written by the merge/export cell.
MERGED_DIR = "bert-goemotions-qlora-merged"

# Device & dtype: FP16 on GPU, FP32 on CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype  = torch.float16 if device.type == "cuda" else torch.float32

# Load tokenizer and merged model. Note this rebinds `tok` and `model`, replacing
# the training-time objects of the same name.
tok = AutoTokenizer.from_pretrained(MERGED_DIR, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    MERGED_DIR,
    torch_dtype=dtype,
).to(device).eval()

# Read the label names through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(f"{MERGED_DIR}/labels.json") as labels_file:
    labels = json.load(labels_file)

@torch.inference_mode()
def predict(text, max_length=128, top_k=5):
    """Score one text and return its highest-probability labels.

    Args:
        text: input passed to the tokenizer. Intended for a single string: the
            leading batch dimension of the output is squeezed away.
        max_length: truncation length handed to the tokenizer.
        top_k: number of (label, probability) pairs to return.

    Returns:
        List of up to ``top_k`` ``(label, probability)`` tuples, sorted by
        probability in descending order (ties keep label order).
    """
    encoded = tok(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    # Inputs must live on the same device as the model.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)
    output = model(**on_device)
    # Independent per-label probabilities; cast to float32 before leaving the device.
    scores = torch.sigmoid(output.logits)
    scores = scores.squeeze(0).float().cpu().tolist()
    ranked = sorted(zip(labels, scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

# Quick demo: print the top-5 (label, probability) pairs for one sample sentence.
print(predict("i'm thrilled but a bit anxious about tomorrow"))


[('admiration', 0.311279296875), ('gratitude', 0.2294921875), ('love', 0.128173828125), ('joy', 0.07598876953125), ('optimism', 0.06597900390625)]
