### Transformer Model Pipeline

#### Imports

In [1]:
# System dependencies
import os
import sys
import re
import math

# Torch dependencies
import torch
import torch.nn as nn

# Data loaders
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset

# Utils
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


#### Config

In [2]:
CONFIG = {
    # --- Data ---
    "num_classes": 4,  # number of target classes

    # --- Text processing ---
    "max_vocab_size": 20000,  # vocabulary cap, special tokens included
    "pad_token": "<pad>",     # filler used to bring sequences to one length
    "unk_token": "<unk>",     # stand-in for out-of-vocabulary words
    "max_seq_len": 128,       # number of token ids kept per sample

    # --- Runtime ---
    "seed": 42,
    # Use the GPU when torch can see one, otherwise stay on the CPU.
    "device": (
        torch.device("cuda")
        if torch.cuda.is_available()
        else torch.device("cpu")
    ),
}

In [3]:
def setup():
    """Seed torch's random number generator with the configured seed."""
    seed = CONFIG["seed"]
    torch.manual_seed(seed)


setup()

#### Dataset

In [5]:
# Load the "ag_news" dataset and keep handles to both of its splits.
dataset = load_dataset("ag_news")
train_data, test_data = dataset["train"], dataset["test"]

# Readable names for the integer class labels 0-3.
label_map = dict(enumerate(("World", "Sports", "Business", "Sci/Tech")))

#### Natural Language Processing

In [6]:
### Tokenizer

def tokenize(text):
    """Lowercase ``text`` and split it on whitespace into a list of words.

    Punctuation is not stripped, so it stays attached to the neighbouring
    word (e.g. ``"again."``).
    """
    lowered = text.lower()
    words = lowered.split()  # e.g. ["word1", "word2", "word3"]
    return words


In [7]:
train_data[0]['text']  # raw text of the first training sample

"Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."

In [8]:
tokenize(train_data[0]['text'])  # the same sample as a list of lowercased, whitespace-split words

['wall',
 'st.',
 'bears',
 'claw',
 'back',
 'into',
 'the',
 'black',
 '(reuters)',
 'reuters',
 '-',
 'short-sellers,',
 'wall',
 "street's",
 'dwindling\\band',
 'of',
 'ultra-cynics,',
 'are',
 'seeing',
 'green',
 'again.']

#### Building Vocabulary

In [9]:
# Word -> number of occurrences, accumulated over the training split.
counter = Counter()

for item in train_data:
    tokens = tokenize(item["text"])
    counter.update(tokens)

In [10]:
# Peek at the first seven entries; each item is index, (word, count).
for i, (word, num) in enumerate(counter.items()):
    print(f"{word} : {num}")

    # Stop once the entry at index 6 has been printed.
    if i >= 6:
        break

wall : 1375
st. : 1192
bears : 344
claw : 17
back : 3868
into : 6628
the : 203234


In [11]:
len(counter)  # number of distinct tokens counted in the training split

158733

In [12]:
CONFIG["max_vocab_size"]  # configured cap on the vocabulary size

20000

In [13]:
# Keep only the most frequent words. Two vocabulary slots are held back
# for the special PAD and UNK entries.
most_common = counter.most_common(
    CONFIG["max_vocab_size"] - 2
)

len(most_common)

19998

In [14]:
CONFIG["pad_token"], CONFIG["unk_token"]  # the two reserved special tokens

('<pad>', '<unk>')

In [15]:
# Seed the vocabulary with the reserved tokens:
#   id 0 -> pad token (pads all sentences to the same length)
#   id 1 -> unk token (stands in for unknown words)
vocab = {
    token: index
    for index, token in enumerate((CONFIG["pad_token"], CONFIG["unk_token"]))
}

In [16]:
# Fill in the vocabulary: ids 0 and 1 are already taken by the special
# tokens, so the frequent words are numbered from 2 in frequency order.
for i, entry in enumerate(most_common, start=2):
    word, _ = entry  # entry is a (word, count) pair; the count is not needed
    vocab[word] = i


In [17]:
len(vocab) == CONFIG["max_vocab_size"]  # sanity check: the vocabulary exactly fills the configured size

True

#### Numericalization and Padding

In [19]:
def encode(text):
    """Turn ``text`` into a fixed-length tensor of vocabulary ids.

    The text is tokenized, each token is mapped to its id in ``vocab``
    (unknown tokens map to the unk id), and the id list is then truncated
    or right-padded with the pad id to exactly ``CONFIG["max_seq_len"]``
    entries.

    Returns:
        A 1-D tensor of length ``CONFIG["max_seq_len"]``.
    """
    max_len = CONFIG["max_seq_len"]
    unk_id = vocab[CONFIG["unk_token"]]
    pad_id = vocab[CONFIG["pad_token"]]

    # Look up every token, then cut anything beyond the length limit.
    ids = [vocab.get(tok, unk_id) for tok in tokenize(text)][:max_len]

    # Right-pad short sequences; this adds nothing when already full length.
    ids.extend([pad_id] * (max_len - len(ids)))

    return torch.tensor(ids)

#### PyTorch Dataset

In [20]:
class AGNewsDataset(Dataset):
    """
    Dataset wrapper that encodes each sample's text on access.

    Each item is a pair ``(x, y)``:
        x: Tensor of shape (MAX_SEQ_LEN, ) produced by ``encode``
        y: the sample's ``"label"`` value (int 0-3 -> 4 classes)
    """

    def __init__(self, data):
        # Indexable collection of records with "text" and "label" fields.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        return encode(record["text"]), record["label"]

In [21]:
### Dataloader


# Wrap the training split so that samples are encoded on access.
train_dataset = AGNewsDataset(train_data)

# Serve the training set in reshuffled mini-batches of 8 samples.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

#### Positional Encoding

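* Transformers process all tokens in parallel and therefore have no built-in notion of word order, so we add a positional encoding to the embeddings
* Why not use the raw integer positions, 0 to n:
  * Scale: for long sequences the indices become very large, which can destabilize the neural network
  * Generalization: a model trained on sequences of length 50 won't know what to do with sequences of length 500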


- 128 - maximum sequence length in tokens (`max_seq_len`)
- 512 - embedding size (`d_model`)

In [25]:
class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding.

    Adds a fixed, position-dependent sine/cosine pattern to the embeddings
    so that word order is visible to the model, which otherwise processes
    every position in parallel.

    Args:
        d_model: total dimension of each embedding vector (even or odd).
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len):
        super().__init__()

        # Column vector of positions 0 .. max_len - 1, shape (max_len, 1).
        position = torch.arange(0, max_len).unsqueeze(1)

        # One frequency per sin/cos pair: exp(-ln(10000) * 2i / d_model).
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )

        # Angle for every (position, frequency) pair,
        # shape (max_len, ceil(d_model / 2)).
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even
        # column, so drop the surplus frequency instead of failing on a
        # shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer is not a trainable parameter, but it follows the module
        # across devices and is saved in its state_dict.
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``x`` with the positional encoding added.

        Args:
            x: tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``, e.g. (batch, seq_len, d_model).
                ``seq_len`` must not exceed ``max_len``.
        """
        # Take the first seq_len rows; they broadcast over the batch axis.
        return x + self.pe[:x.size(1)]

#### Self attention

In [None]:
class SelfAttention(nn.Module):
    """
    Single-head scaled dot-product self-attention.

    For this word, which other words matter, and how much?
    Every word in a sentence,
        * Looks at every other word
        * Scores their relevance
        * Collects useful information

    Args:
        d_model: size of the input and output feature dimension.
    """

    def __init__(self, d_model):
        super().__init__()

        # (Q) Query   :  What am I looking for?       - search for one thing
        # (K) Key     :  What do I contain?           - match based on another key
        # (V) Value   :  What information do I give?  - extract something else

        # learnable linear projections
        self.q = nn.Linear(d_model, d_model)
        self.k = nn.Linear(d_model, d_model)
        self.v = nn.Linear(d_model, d_model)

        # Dividing the scores by sqrt(d_model) keeps their magnitude from
        # growing with the feature size before the softmax.
        self.scale = math.sqrt(d_model)

    def forward(self, x):
        """Attend every position of ``x`` to every position of ``x``.

        Args:
            x: tensor of shape (..., seq_len, d_model).

        Returns:
            Tensor with the same shape as ``x``.
        """
        Q = self.q(x)
        K = self.k(x)
        V = self.v(x)

        # Attention scores: Q * K^T / sqrt(d_model) -> (..., seq_len, seq_len).
        # transpose needs the two axes as separate arguments: (-2, -1).
        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        # Normalize over the key axis so each query's weights sum to 1.
        weights = torch.softmax(scores, dim=-1)

        return torch.matmul(weights, V)
        

In [None]:
class TransformerBlock(nn.Module):
    """
    One transformer block: self-attention followed by a position-wise
    feed-forward network. Each sub-layer is wrapped in a residual
    connection and then layer-normalized.

    Args:
        d_model: feature size of the block's input and output.
        ff_dim: hidden size of the feed-forward network.
    """

    def __init__(self, d_model, ff_dim):
        super().__init__()

        # Self-attention sub-layer and the normalization applied after it.
        self.attention = SelfAttention(d_model)
        self.norm1 = nn.LayerNorm(d_model)

        # Feed-forward sub-layer (an MLP): expand to ff_dim, apply the ReLU
        # non-linearity max(0, n), then project back to d_model.
        feed_forward_layers = [
            nn.Linear(d_model, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, d_model),
        ]
        self.ff = nn.Sequential(*feed_forward_layers)

        # Normalization applied after the feed-forward sub-layer.
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Residual connection around attention, then normalize.
        attended = self.norm1(x + self.attention(x))

        # Residual connection around the feed-forward net, then normalize.
        return self.norm2(attended + self.ff(attended))

In [59]:
class TransformerClassifier(nn.Module):
    """
    Full transformer-based text classifier
    """
    def __init__(self, vocab_size, d_model, ff_dim, num_layers, num_classes):
        super().__init__()
        
        # Token embedding layer
        self.embedding = nn.Embedding(vocab_size, d_model)
        