In [8]:
import warnings
warnings.filterwarnings("ignore")

import torch
import lovely_tensors as lt
lt.monkey_patch()

from transformers import DistilBertTokenizer, DistilBertModel
from datasets import load_dataset

In [2]:
model_name = "distilbert/distilbert-base-uncased"

tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [3]:
model.transformer.layer[-1].output_layer_norm

LayerNorm((768,), eps=1e-12, elementwise_affine=True)

In [4]:
model.config.hidden_size

768

# Download the dataset

In [9]:
data_hf = load_dataset("google-research-datasets/go_emotions")
data_hf

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})

In [10]:
labels = data_hf['train'].features['labels'].feature.names
print(len(labels))
print({i:l for i, l in enumerate(labels)})

28
{0: 'admiration', 1: 'amusement', 2: 'anger', 3: 'annoyance', 4: 'approval', 5: 'caring', 6: 'confusion', 7: 'curiosity', 8: 'desire', 9: 'disappointment', 10: 'disapproval', 11: 'disgust', 12: 'embarrassment', 13: 'excitement', 14: 'fear', 15: 'gratitude', 16: 'grief', 17: 'joy', 18: 'love', 19: 'nervousness', 20: 'optimism', 21: 'pride', 22: 'realization', 23: 'relief', 24: 'remorse', 25: 'sadness', 26: 'surprise', 27: 'neutral'}


There are approx 43k rows in train set and 5k rows in each of validation set and test set. There are 28 labels in our dataset.

In [11]:
data_hf['train'].to_pandas()

Unnamed: 0,text,labels,id
0,My favourite food is anything I didn't have to...,[27],eebbqej
1,"Now if he does off himself, everyone will thin...",[27],ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,[2],eezlygj
3,To make her feel threatened,[14],ed7ypvh
4,Dirty Southern Wankers,[3],ed0bdzj
...,...,...,...
43405,Added you mate well I’ve just got the bow and ...,[18],edsb738
43406,Always thought that was funny but is it a refe...,[6],ee7fdou
43407,What are you talking about? Anything bad that ...,[3],efgbhks
43408,"More like a baptism, with sexy results!",[13],ed1naf8


In [12]:
train_y = [x[0] for x in data_hf['train']['labels']]
val_y = [x[0] for x in data_hf['validation']['labels']]
test_y = [x[0] for x in data_hf['test']['labels']]

In [13]:
# Text preprocessing and tokenization
from collections import Counter
import numpy as np

class TextDataProcessor:
    def __init__(self, vocab_size=10000, max_length=128):
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.word_to_idx = {}
        self.idx_to_word = {}
        self.vocab_built = False
        
    def build_vocab(self, texts):
        """Build vocabulary from training texts"""
        word_counts = Counter()
        for text in texts:
            words = text.lower().split()
            word_counts.update(words)
        
        # Keep most frequent words
        most_common = word_counts.most_common(self.vocab_size - 2)  # -2 for PAD and UNK
        
        # Build mappings
        self.word_to_idx = {'<PAD>': 0, '<UNK>': 1}
        self.idx_to_word = {0: '<PAD>', 1: '<UNK>'}
        
        for i, (word, _) in enumerate(most_common, start=2):
            self.word_to_idx[word] = i
            self.idx_to_word[i] = word
            
        self.vocab_built = True
        
    def text_to_indices(self, text):
        """Convert text to sequence of indices"""
        words = text.lower().split()
        indices = []
        for word in words:
            idx = self.word_to_idx.get(word, 1)  # 1 is UNK token
            indices.append(idx)
        
        # Pad or truncate to max_length
        if len(indices) < self.max_length:
            indices.extend([0] * (self.max_length - len(indices)))
        else:
            indices = indices[:self.max_length]
            
        return indices

# Process the go_emotions dataset
processor = TextDataProcessor(vocab_size=5000, max_length=16)

# Extract texts from training set
train_texts = [example['text'] for example in data_hf['train']]
processor.build_vocab(train_texts)

print(f"Vocabulary size: {len(processor.word_to_idx)}")
print(f"Sample vocab: {list(processor.word_to_idx.items())[:10]}")

# Convert texts to indices
train_sequences = [processor.text_to_indices(text) for text in train_texts]
val_sequences = [processor.text_to_indices(text) for text in data_hf['validation']['text']]
test_sequences = [processor.text_to_indices(text) for text in data_hf['test']['text']]

# Convert to tensors
train_X = torch.tensor(train_sequences)
val_X = torch.tensor(val_sequences)
test_X = torch.tensor(test_sequences)

# Get labels (multi-label classification)
train_y = torch.tensor(train_y)
val_y = torch.tensor(val_y)
test_y = torch.tensor(test_y)

print(f"Train shape: {train_X.shape}, {train_y.shape}")
print(f"Val shape: {val_X.shape}, {val_y.shape}")
print(f"Test shape: {test_X.shape}, {test_y.shape}")

Vocabulary size: 5000
Sample vocab: [('<PAD>', 0), ('<UNK>', 1), ('the', 2), ('i', 3), ('to', 4), ('a', 5), ('and', 6), ('you', 7), ('is', 8), ('that', 9)]
Train shape: torch.Size([43410, 16]), torch.Size([43410])
Val shape: torch.Size([5426, 16]), torch.Size([5426])
Test shape: torch.Size([5427, 16]), torch.Size([5427])


The training dataset is a large matrix - 43410x16 in size. We will have to take small sets of data (batches) onto the GPU to train the model. Let's code a dataloader now.

In [75]:
train_sequences

[[17, 1225, 708, 8, 225, 3, 133, 24, 4, 1, 1323, 0, 0, 0, 0, 0],
 [95, 32, 30, 164, 166, 1, 197, 64, 55, 1226, 238, 5, 571, 4177, 27, 49],
 [84, 2, 203, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [4, 102, 68, 107, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1877, 3333, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [418, 1, 222, 50, 271, 4, 235, 172, 14, 2, 1, 2035, 4575, 659, 1, 4576],
 [303, 3, 316, 1, 2, 2187, 1, 9, 65, 4, 19, 2525, 89, 15, 29, 1],
 [43, 123, 61, 1, 6, 4, 1501, 5, 346, 61, 1129, 15, 101, 110, 2960, 19],
 [211, 808, 6, 1, 2658, 8, 361, 1, 15, 420, 0, 0, 0, 0, 0, 0],
 [11, 212, 19, 2806, 4, 2, 797, 3570, 10, 29, 740, 0, 0, 0, 0, 0],
 [1, 3, 114, 62, 1756, 477, 3334, 73, 65, 1, 3127, 0, 0, 0, 0, 0],
 [1, 2280, 165, 215, 244, 1, 37, 146, 57, 16, 21, 1257, 10, 1561, 3, 776],
 [1953, 1878, 260, 52, 1, 34, 413, 45, 301, 6, 1703, 14, 1162, 32, 1, 0],
 [1, 212, 19, 91, 15, 66, 1205, 1, 153, 5, 148, 6, 202, 4, 107, 59],
 [13, 216, 131, 205, 3335, 1206, 135, 1, 10, 2, 1428, 1, 0, 0, 0, 

In [14]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 8

train_dataset = TensorDataset(train_X, train_y)
val_dataset = TensorDataset(val_X, val_y)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=False, num_workers=0)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=0)

In [15]:
for batch_x, batch_y in train_dataloader:
    print(f"Batch input shape: {batch_x.shape}")
    print(f"Batch target shape: {batch_y.shape}")
    print(f"Target values:", batch_y)
    break

Batch input shape: torch.Size([8, 16])
Batch target shape: torch.Size([8])
Target values: tensor[8] i64 x∈[2, 27] μ=15.250 σ=10.498 [27, 27, 2, 14, 3, 26, 15, 8]


In [26]:
class DistilBertModelWithClassifierHead(torch.nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.backbone = DistilBertModel.from_pretrained(model_name)
        hidden_size = self.backbone.config.hidden_size
        self.head = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, num_classes),
            torch.nn.Dropout(0.5)
        )
    
    def forward(self, x):
        # x shape: (batch_size, seq_len)
        backbone_out = self.backbone(x) # x shape: (batch_size, seq_len, hidden_dim)
        print(type(backbone_out))
        x = self.head(backbone_out) # x shape: (batch_size, num_classes)
        return x, backbone_out
    
clf = DistilBertModelWithClassifierHead(num_classes=28)


In [27]:
a, b = clf(batch_x)

<class 'transformers.modeling_outputs.BaseModelOutput'>


TypeError: linear(): argument 'input' (position 1) must be Tensor, not BaseModelOutput

We see that the output from the backbone model is a BaseModelOutput. So, somewhere there is a type mismatch. Let's run a forward pass on just the backbone model and go from there.

In [32]:
o = model(batch_x)
o

BaseModelOutput(last_hidden_state=tensor[8, 16, 768] n=98304 (0.4Mb) x∈[-8.876, 1.564] μ=-0.009 σ=0.370 grad NativeLayerNormBackward0, hidden_states=None, attentions=None)

Good! Let's now inspect this model output.

In [33]:
type(o)

transformers.modeling_outputs.BaseModelOutput

In [35]:
dir(o)

['__annotations__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__ior__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__ne__',
 '__new__',
 '__or__',
 '__post_init__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__ror__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'attentions',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'hidden_states',
 'items',
 'keys',
 'last_hidden_state',
 'move_to_end',
 'pop',
 'popitem',
 'setdefault',
 'to_tuple',
 'update',
 'values']

In [39]:
[l for l in dir(o) if not "__" in l]

['attentions',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'hidden_states',
 'items',
 'keys',
 'last_hidden_state',
 'move_to_end',
 'pop',
 'popitem',
 'setdefault',
 'to_tuple',
 'update',
 'values']

In [40]:
o.last_hidden_state

tensor[8, 16, 768] n=98304 (0.4Mb) x∈[-8.876, 1.564] μ=-0.009 σ=0.370 grad NativeLayerNormBackward0

In [41]:
o.last_hidden_state.v

tensor[8, 16, 768] n=98304 (0.4Mb) x∈[-8.876, 1.564] μ=-0.009 σ=0.370 grad NativeLayerNormBackward0
tensor([[[ 0.2518, -0.1572,  0.3614,  ..., -0.2005,  0.3150, -0.1986],
         [ 0.2131, -0.0846,  0.3742,  ..., -0.1642,  0.2788, -0.2284],
         [ 0.2155, -0.1017,  0.3553,  ..., -0.1683,  0.2629, -0.2301],
         ...,
         [ 0.2889, -0.1330,  0.3433,  ..., -0.2044,  0.3112, -0.2525],
         [ 0.2849, -0.1401,  0.3483,  ..., -0.2058,  0.3120, -0.2494],
         [ 0.2838, -0.1465,  0.3528,  ..., -0.2014,  0.3157, -0.2536]],

        [[ 0.3036, -0.1858,  0.4567,  ..., -0.0869,  0.2616, -0.3123],
         [ 0.2790, -0.1539,  0.4225,  ..., -0.0700,  0.2388, -0.3561],
         [ 0.2912, -0.1608,  0.4154,  ..., -0.0768,  0.2270, -0.3490],
         ...,
         [ 0.3035, -0.1246,  0.4657,  ..., -0.0595,  0.2826, -0.4743],
         [ 0.3187, -0.0726,  0.4474,  ..., -0.0863,  0.3024, -0.3676],
         [ 0.2915, -0.1280,  0.4304,  ..., -0.0897,  0.2905, -0.3554]],

        [[ 0.243

So, this is hidden state variable we must use for classification!!

In [None]:
class DistilBertModelWithClassifierHead(torch.nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.backbone = DistilBertModel.from_pretrained(model_name)
        hidden_size = self.backbone.config.hidden_size
        self.head = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, num_classes),
            torch.nn.Dropout(0.5)
        )
    
    def forward(self, x):
        # x shape: (batch_size, seq_len)
        backbone_out = self.backbone(x) 
        hidden_state = backbone_out.last_hidden_state # x shape: (batch_size, seq_len, hidden_dim)
        x = self.head(backbone_out) # x shape: (batch_size, num_classes)
        return x, backbone_out
    
clf = DistilBertModelWithClassifierHead(num_classes=28)


Before we run this cell, the hidden state is of dimensions `(batch_size, seq_len, hidden_dim)` and our linear head takes `(batch_size, num_classes)`.

That means, if we run this code as-is, the matrix math would fail. There is a reduction operation that needs to happen prior to calling the classification head.

In [None]:
o.last_hidden_state[:, 0, :]

tensor[8, 768] n=6144 (24Kb) x∈[-4.847, 1.355] μ=-0.009 σ=0.366 grad SliceBackward0

In [44]:
o.last_hidden_state.v

tensor[8, 16, 768] n=98304 (0.4Mb) x∈[-8.876, 1.564] μ=-0.009 σ=0.370 grad NativeLayerNormBackward0
tensor([[[ 0.2518, -0.1572,  0.3614,  ..., -0.2005,  0.3150, -0.1986],
         [ 0.2131, -0.0846,  0.3742,  ..., -0.1642,  0.2788, -0.2284],
         [ 0.2155, -0.1017,  0.3553,  ..., -0.1683,  0.2629, -0.2301],
         ...,
         [ 0.2889, -0.1330,  0.3433,  ..., -0.2044,  0.3112, -0.2525],
         [ 0.2849, -0.1401,  0.3483,  ..., -0.2058,  0.3120, -0.2494],
         [ 0.2838, -0.1465,  0.3528,  ..., -0.2014,  0.3157, -0.2536]],

        [[ 0.3036, -0.1858,  0.4567,  ..., -0.0869,  0.2616, -0.3123],
         [ 0.2790, -0.1539,  0.4225,  ..., -0.0700,  0.2388, -0.3561],
         [ 0.2912, -0.1608,  0.4154,  ..., -0.0768,  0.2270, -0.3490],
         ...,
         [ 0.3035, -0.1246,  0.4657,  ..., -0.0595,  0.2826, -0.4743],
         [ 0.3187, -0.0726,  0.4474,  ..., -0.0863,  0.3024, -0.3676],
         [ 0.2915, -0.1280,  0.4304,  ..., -0.0897,  0.2905, -0.3554]],

        [[ 0.243

In [45]:
o.last_hidden_state[:,0,:].v

tensor[8, 768] n=6144 (24Kb) x∈[-4.847, 1.355] μ=-0.009 σ=0.366 grad SliceBackward0
tensor([[ 0.2518, -0.1572,  0.3614,  ..., -0.2005,  0.3150, -0.1986],
        [ 0.3036, -0.1858,  0.4567,  ..., -0.0869,  0.2616, -0.3123],
        [ 0.2434, -0.1905,  0.3326,  ..., -0.2758,  0.3613, -0.2870],
        ...,
        [ 0.2639, -0.3739,  0.5197,  ..., -0.0170,  0.3678, -0.3893],
        [ 0.3327, -0.2146,  0.4847,  ..., -0.0499,  0.3594, -0.3761],
        [ 0.2132, -0.5592,  0.5877,  ..., -0.0119,  0.5787, -0.3653]],
       grad_fn=<SliceBackward0>)

As we want a representation of all the tokens for classification, the first token's (`[CLS]`) hidden state is usually considered for classification.

In [52]:
class DistilBertModelWithClassifierHead(torch.nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.backbone = DistilBertModel.from_pretrained(model_name)
        hidden_size = self.backbone.config.hidden_size
        self.head = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, num_classes),
            torch.nn.Dropout(0.5)
        )
    
    def forward(self, x):
        # x shape: (batch_size, seq_len)
        backbone_out = self.backbone(x) 
        hidden_state = backbone_out.last_hidden_state # hidden_state shape: (batch_size, seq_len, hidden_dim)
        hidden_state_cls = hidden_state[:,0,:] # hidden_state_cls shape: (batch_size, hidden_dim)
        x = self.head(hidden_state_cls) # x shape: (batch_size, num_classes)
        return x
    
clf = DistilBertModelWithClassifierHead(num_classes=28)


In [55]:
o = clf(batch_x)
o

tensor[8, 28] n=224 x∈[-1.086, 1.051] μ=-0.010 σ=0.313 grad MulBackward0

In [61]:
torch.softmax(o, dim=1).v

tensor[8, 28] n=224 x∈[0.012, 0.102] μ=0.036 σ=0.013 grad SoftmaxBackward0
tensor([[0.0288, 0.0547, 0.0356, 0.0356, 0.0221, 0.0160, 0.0356, 0.0306, 0.0342,
         0.0356, 0.0356, 0.0532, 0.0356, 0.0356, 0.0356, 0.0519, 0.0356, 0.0226,
         0.0845, 0.0356, 0.0127, 0.0356, 0.0223, 0.0302, 0.0356, 0.0356, 0.0383,
         0.0356],
        [0.0341, 0.0589, 0.0358, 0.0358, 0.0234, 0.0174, 0.0307, 0.0358, 0.0391,
         0.0358, 0.0358, 0.0358, 0.0358, 0.0286, 0.0358, 0.0358, 0.0387, 0.0259,
         0.1023, 0.0322, 0.0121, 0.0358, 0.0358, 0.0358, 0.0201, 0.0358, 0.0358,
         0.0358],
        [0.0347, 0.0347, 0.0347, 0.0465, 0.0227, 0.0155, 0.0347, 0.0303, 0.0346,
         0.0347, 0.0376, 0.0347, 0.0347, 0.0347, 0.0347, 0.0487, 0.0347, 0.0244,
         0.0807, 0.0332, 0.0347, 0.0347, 0.0219, 0.0280, 0.0347, 0.0530, 0.0374,
         0.0347],
        [0.0474, 0.0339, 0.0339, 0.0617, 0.0241, 0.0339, 0.0403, 0.0339, 0.0360,
         0.0295, 0.0339, 0.0630, 0.0316, 0.0339, 0.0339, 0.03

Now, we see probabilities! Yay! Now that we have prototyped the forward pass, let's go ahead and evaluate the accuracy.

# Performance evaluation on validation set

In [68]:
all_val_preds = []
with torch.no_grad():
    for batch_x, batch_y in val_dataloader:
        val_preds = clf(batch_x)
        all_val_preds.append(val_preds)
        
val_classes = torch.cat(all_val_preds).softmax(dim=1).argmax(dim=1)
val_matches = val_classes == val_y
val_accuracy = val_matches.sum() / len(val_matches)

print(
    f" val_accuracy: {val_accuracy: .3f}"
)

 val_accuracy:  0.031


the validation accuracy is poor! Hmm. This is pretty bad and unexpected. What can be wrong?

I think the tokens are incorrectly mapped to tokenIDs. The tokenizer! ah.

As this is a pretrained model, all the words in our text have to be mapped to the model's training vocab.

The mistake here is I used my own tokenizer instead of using distillbert's tokenizer. Let's change that.

## Fix the tokenizer to use distillbert's tokenizer

In [72]:
# Convert train_texts and validation texts to DistilBERT token IDs using the pretrained tokenizer
max_length = 16

train_encodings = tokenizer(
    train_texts,
    padding='max_length',
    truncation=True,
    max_length=max_length,
    return_tensors='pt'
)
val_encodings = tokenizer(
    [example['text'] for example in data_hf['validation']],
    padding='max_length',
    truncation=True,
    max_length=max_length,
    return_tensors='pt'
)

train_X = train_encodings['input_ids']
val_X = val_encodings['input_ids']

print(f"train_X shape: {train_X.shape}")
print(f"val_X shape: {val_X.shape}")

train_X shape: torch.Size([43410, 16])
val_X shape: torch.Size([5426, 16])


In [78]:
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(train_X, train_y)
val_dataset = TensorDataset(val_X, val_y)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [79]:
all_val_preds = []
with torch.no_grad():
    for batch_x, batch_y in val_dataloader:
        val_preds = clf(batch_x)
        all_val_preds.append(val_preds)
        
val_classes = torch.cat(all_val_preds).softmax(dim=1).argmax(dim=1)
val_matches = val_classes == val_y
val_accuracy = val_matches.sum() / len(val_matches)

print(
    f" val_accuracy: {val_accuracy: .3f}"
)

 val_accuracy:  0.033


There is something else that is faulty. Gotta fix it!

Maybe inspect the tokenizer :)