# Step-by-step Plan

# Step 1: Install Pinned Dependencies

In [1]:
!pip install -q \
    "torch==2.1.2" \
    "transformers==4.39.3" \
    "datasets==2.14.6" \
    "torchmetrics==1.3.1" \
    "fsspec==2023.10.0"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m104.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.4/840.4 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m123.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m104.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Step 2: Load and Tokenize the Dataset

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer

# GoEmotions, "simplified" configuration (provides train / validation / test splits)
dataset = load_dataset(path="go_emotions", name="simplified")

# Tokenizer for the same DistilBERT checkpoint the model is built on
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenization function
def tokenize(example):
    """Tokenize the ``"text"`` field of an example (or batch of examples).

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded rows have the same length.

    Returns:
        The tokenizer's output for ``example["text"]``.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(example["text"], **encoding_options)

# Tokenize every split in batches
tokenized = dataset.map(function=tokenize, batched=True)

# Expose only the model inputs and labels, returned as torch tensors
tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

# Step 3: Define the Model

In [3]:
import torch
from torch import nn
from transformers import DistilBertModel

# Number of classes in GoEmotions simplified version.
# Determines the output width of the classifier head in the model defined below.
NUM_CLASSES = 28

class DistilBertMultiLabel(nn.Module):
    """DistilBERT encoder followed by dropout and a linear multi-label head.

    The forward pass returns raw logits (one per class); no sigmoid is applied
    here, so pair it with a logits-based loss such as ``BCEWithLogitsLoss``.

    Args:
        num_classes: Number of output logits. ``None`` (default) uses the
            module-level ``NUM_CLASSES``.
        dropout: Dropout probability applied to the pooled representation.
        model_name: Checkpoint name passed to ``DistilBertModel.from_pretrained``.
    """

    def __init__(self, num_classes=None, dropout=0.3,
                 model_name="distilbert-base-uncased"):
        super().__init__()
        # Resolve the default at call time so the notebook-level constant is honoured.
        if num_classes is None:
            num_classes = NUM_CLASSES
        self.bert = DistilBertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return the classifier logits.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            Output of the linear head applied to the first-token hidden state.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sentence representation.
        pooled_output = outputs.last_hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

# Step 4: Create DataLoaders (for training & validation)

In [4]:
from torch.utils.data import DataLoader

# Pick the train and validation splits out of the tokenized dataset
train_data, val_data = tokenized["train"], tokenized["validation"]

# Batch both splits; only the training data is shuffled
train_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_data, batch_size=16, shuffle=False)

# Step 5: Initialize Model, Loss, Optimizer, and Metric

In [5]:
import torch.optim as optim
from torch.nn import BCEWithLogitsLoss
from transformers import logging
import torchmetrics

logging.set_verbosity_error()  # silence warning spam

# Seed torch before the classifier head is initialised and before any shuffling
# so that a Restart & Run All reproduces the same run.
SEED = 42
torch.manual_seed(SEED)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DistilBertMultiLabel().to(device)

# For multi-label classification: the model returns raw logits, so the sigmoid is
# applied inside the loss.
criterion = BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# F1 score metric; the label count comes from NUM_CLASSES so it cannot drift from
# the width of the model's classifier head.
f1_metric = torchmetrics.classification.MultilabelF1Score(
    num_labels=NUM_CLASSES, average="weighted").to(device)



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

# Reset the dataset format and wrap it in a PyTorch Dataset

In [11]:
# Undo the "torch" output format applied to `tokenized` in Step 2, so that indexing
# yields plain Python values again. GoEmotionDataset builds the input tensors itself
# and leaves each example's `labels` as a list.
tokenized.reset_format()          # remove Torch formatting completely

In [12]:
from torch.utils.data import Dataset
import torch

# Re-declared with the same value as in Step 3; multi_hot() uses it as the length
# of each label vector.
NUM_CLASSES = 28  # simplified GoEmotions

class GoEmotionDataset(Dataset):
    """Map-style dataset over an indexable collection of tokenized examples.

    Each underlying example must provide ``"input_ids"``, ``"attention_mask"``
    and ``"labels"``.
    """

    def __init__(self, hf_ds):
        """Keep a reference to the wrapped dataset; nothing is copied."""
        self.ds = hf_ds

    def __len__(self):
        """Number of examples in the wrapped dataset."""
        return len(self.ds)

    def __getitem__(self, idx):
        """Return example ``idx`` with its inputs as ``torch.long`` tensors.

        ``labels`` is passed through unchanged; turning it into a tensor is
        left to the collate step.
        """
        record = self.ds[idx]
        sample = {
            key: torch.tensor(record[key], dtype=torch.long)
            for key in ("input_ids", "attention_mask")
        }
        sample["labels"] = record["labels"]
        return sample

# Create a multi-hot helper and a custom collate function


In [13]:
def multi_hot(label_list):
    """Encode class indices as a float32 vector of length ``NUM_CLASSES``.

    Positions listed in ``label_list`` are set to 1.0; all others stay 0.0.
    """
    encoded = torch.zeros((NUM_CLASSES,), dtype=torch.float32)
    encoded[label_list] = 1.0
    return encoded

def collate_fn(batch):
    """Stack a list of samples into one batch dictionary.

    ``input_ids`` and ``attention_mask`` are stacked as they are; each
    sample's ``labels`` is first converted with ``multi_hot`` and then stacked.

    Returns:
        Dict with keys ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
    """
    def column(key):
        # Collect one field across every sample, preserving batch order.
        return [sample[key] for sample in batch]

    return {
        "input_ids": torch.stack(column("input_ids")),
        "attention_mask": torch.stack(column("attention_mask")),
        "labels": torch.stack([multi_hot(labels) for labels in column("labels")]),
    }

# Build the DataLoaders with the custom collate_fn

In [14]:
# Training batches: shuffled, 16 examples each, labels multi-hot encoded by collate_fn
train_loader = DataLoader(
    dataset=GoEmotionDataset(tokenized["train"]),
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
)

# Validation batches: fixed order, 32 examples each
val_loader = DataLoader(
    dataset=GoEmotionDataset(tokenized["validation"]),
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)

# Run the training loop

In [15]:
# One pass over the training data followed by one evaluation pass per epoch.
BATCH_KEYS = ("input_ids", "attention_mask", "labels")

for epoch in range(1):
    # ---- training ----
    model.train()
    total = 0
    for batch in train_loader:
        ids, mask, y = (batch[key].to(device) for key in BATCH_KEYS)

        optimizer.zero_grad()
        logits = model(ids, mask)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        total += loss.item()

    mean_loss = total / len(train_loader)
    print(f"Train loss: {mean_loss:.4f}")

    # ---- validation ----
    model.eval()
    f1_metric.reset()
    with torch.no_grad():
        for batch in val_loader:
            ids, mask, y = (batch[key].to(device) for key in BATCH_KEYS)
            # A label is predicted when its sigmoid probability exceeds 0.5.
            probs = torch.sigmoid(model(ids, mask))
            preds = (probs > 0.5).int()
            f1_metric.update(preds, y.int())

    val_f1 = f1_metric.compute().item()
    print("Val F1:", val_f1)

Train loss: 0.1166
Val F1: 0.4909525513648987
