### Transformer Model Pipeline

#### Imports

In [None]:
# System dependencies
import os   # accessing the system
import sys  # interacting with python's internals
import re   # pattern matching
import math
import random  # Python's built-in RNG (seeded in setup)

# Torch dependencies
import torch
import torch.nn as nn

# Data loaders
from torch.utils.data import DataLoader, Dataset  # Batch generator and Data interface
from datasets import load_dataset

# Utils
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter     # frequency counter counts things efficiently

In [41]:
# Show the kernel's working directory.
# NOTE(review): the execution count (41) shows this cell was run out of order, and the
# saved output exposes an absolute local path - consider clearing it before sharing.
os.getcwd()

'/home/love/FieldOfInterest/PersonalWork/XDocIntent/notebooks'

#### Config

* PyTorch relies on randomness for weight initialization, data shuffling, dropout, and augmentations, so results differ from run to run unless a seed is fixed.

In [None]:
# Single source of truth for every tunable value used further down.
CONFIG = dict(
    # --- data ---
    num_classes=4,            # size of the classifier's output layer

    # --- text processing ---
    max_vocab_size=20000,     # vocabulary cap, special tokens included
    pad_token="<pad>",        # filler appended to short sequences
    unk_token="<unk>",        # stand-in for out-of-vocabulary words
    max_seq_len=128,          # every encoded sequence is padded/truncated to this length

    # --- runtime ---
    seed=42,                  # RNG seed, for reproducibility
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

* Seed the Pseudo-Random Number Generator (PRNG) so that every random operation below is repeatable.

In [5]:
def setup():
    torch.manual_seed(CONFIG["seed"])
    
# Seed once, before any DataLoader shuffling or model weight initialisation happens.
setup()

#### Dataset

In [6]:
# Load the AG News corpus through the Hugging Face `datasets` library.
dataset = load_dataset("ag_news")

# Pick out the two splits used in this notebook.
train_data, test_data = dataset["train"], dataset["test"]

# Integer class id -> human-readable topic name.
label_map = dict(enumerate(["World", "Sports", "Business", "Sci/Tech"]))

#### Natural Language Processing

In [7]:
### Tokenizer

def tokenize(text):
    """Split ``text`` into lowercase, whitespace-delimited tokens.

    Punctuation is not stripped, so it stays attached to its word
    (for example ``"again."`` remains a single token).

    Returns:
        list[str]: the tokens, in their original order.
    """
    lowered = text.lower()
    return lowered.split()


In [8]:
train_data[0]['text'] # raw text of the first training sample

"Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."

In [9]:
tokenize(train_data[0]['text']) # the same sample as a list of lowercase tokens

['wall',
 'st.',
 'bears',
 'claw',
 'back',
 'into',
 'the',
 'black',
 '(reuters)',
 'reuters',
 '-',
 'short-sellers,',
 'wall',
 "street's",
 'dwindling\\band',
 'of',
 'ultra-cynics,',
 'are',
 'seeing',
 'green',
 'again.']

#### Building Vocabulary

In [10]:
# Token -> number of occurrences across the whole training split.
counter = Counter(
    token
    for item in train_data
    for token in tokenize(item["text"])
)

In [None]:
# enumerate() yields (index, item); every item of counter.items() is a
# (word, count) pair. Print just the first one to show that structure.
for idx, entry in enumerate(counter.items()):
    print(f"{idx} : {entry}")

    break

0 : ('wall', 1375)


In [11]:
# Unpack each (index, (word, count)) item and show the first seven pairs,
# in the order the words were first seen.
for i, (word, num) in enumerate(counter.items()):
    print(f"{word} : {num}")

    if i >= 6:
        break

wall : 1375
st. : 1192
bears : 344
claw : 17
back : 3868
into : 6628
the : 203234


In [12]:
# Number of distinct tokens found in the training split.
len(counter)

158733

In [13]:
# Vocabulary cap from the config cell, shown for comparison with len(counter).
CONFIG["max_vocab_size"]

20000

In [14]:
# Keep only the most frequent tokens so the final vocabulary fits in
# CONFIG["max_vocab_size"]; two ids are left free for the <pad> and <unk> tokens.
most_common = counter.most_common(CONFIG["max_vocab_size"] - 2) # 2 custom vocab - PAD and UNK

len(most_common)

19998

In [15]:
# The two special tokens that are added to the vocabulary in the next cell.
CONFIG["pad_token"], CONFIG["unk_token"]

('<pad>', '<unk>')

In [16]:
# Special tokens take the first two ids: padding -> 0, unknown word -> 1.
vocab = {
    CONFIG[key]: idx
    for idx, key in enumerate(("pad_token", "unk_token"))
}

In [17]:
# Give the retained words the ids 2, 3, ... in order of decreasing frequency
# (ids 0 and 1 are already taken by the special tokens).
vocab.update(
    (word, idx) for idx, (word, _) in enumerate(most_common, start=2)
)


In [18]:
# Sanity check: special tokens plus retained words should exactly fill the vocabulary cap.
len(vocab) == CONFIG["max_vocab_size"]

True

#### Numericalization and Padding

In [19]:
def encode(text):
    """Turn raw text into a fixed-length tensor of vocabulary ids.

    The text is tokenized with ``tokenize``; words missing from ``vocab`` map
    to the unknown-token id. The id list is then cut or right-padded with the
    padding id so that it has exactly ``CONFIG["max_seq_len"]`` entries.

    Returns:
        torch.Tensor: 1-D tensor of token ids.
    """
    max_len = CONFIG["max_seq_len"]
    unk_id = vocab[CONFIG["unk_token"]]
    pad_id = vocab[CONFIG["pad_token"]]

    # Truncate first: slicing is a no-op when the text is already short enough.
    ids = [vocab.get(tok, unk_id) for tok in tokenize(text)][:max_len]

    # Right-pad: the multiplier is zero once the sequence is full.
    ids.extend([pad_id] * (max_len - len(ids)))

    return torch.tensor(ids)

#### PyTorch Dataset

In [20]:
class AGNewsDataset(Dataset):
    """
    Map-style dataset that encodes records on access.

    Each sample is a pair ``(x, y)``:
        x: tensor of token ids produced by ``encode`` from the record's "text"
        y: the record's "label" value (class id)
    """

    def __init__(self, data):
        # `data` must support len() and integer indexing, and yield
        # records with "text" and "label" keys.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        return encode(record["text"]), record["label"]

In [21]:
### Dataloader

# Wrap the raw training split; samples are encoded lazily in __getitem__.
train_dataset = AGNewsDataset(train_data)

# Serve shuffled mini-batches of 8 samples.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [22]:
# Peek at one encoded sample: the (x, y) pair returned by AGNewsDataset.__getitem__.
# The loop stops after the first sample.
for item in train_dataset:
    print(item)
    break

(tensor([  369,   441,  1697, 17026,    98,    53,     2,   837,    29,    82,
           10,     1,   369,  7034,     1,     5,     1,    35,  3922,   743,
         2577,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0]

#### Positional Encoding

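* Transformers process all tokens of a sequence in parallel, so they have no built-in notion of word order; positional encoding adds that information to the embeddings.
* Why not simply use the integers 0 to n as positions?
  * Scale: for long sequences the indices become very large, which can destabilize the neural network.
  * Generalization: a model trained on sequences of length 50 won't know what to do with sequences of length 500.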


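- 128 - maximum sequence length in tokens (`max_seq_len`)
- 512 - embedding size (`d_model`) of the original Transformer paper; the model in this notebook is built with `d_model = 128`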

In [23]:
class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding.

    Injects word-order information into the embeddings, because the transformer
    processes all positions in parallel. Even columns hold sines and odd columns
    hold cosines of ``position * frequency``.

    Args:
        d_model: embedding width (any positive integer, odd values included).
        max_len: number of positions to precompute.
    """

    def __init__(self, d_model, max_len):
        super().__init__()

        # One row per position: (max_len, 1)
        position = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)

        # One frequency per sine column: 10000^(-2i / d_model), shape (ceil(d_model / 2),)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float32)
            * (-math.log(10000.0) / d_model)
        )

        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)  # (max_seq_len, embedding_dim)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so drop the last frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Buffer: follows the module across devices and is saved in the
        # state_dict, but is not a trainable parameter.
        self.register_buffer("pe", pe)

    def forward(self, x):
        # x is indexed as (..., seq_len, d_model) with seq_len at dim 1;
        # add the encodings of the first seq_len positions.
        return x + self.pe[:x.size(1)]

#### Self attention

In [24]:
class SelfAttention(nn.Module):
    """
    Single-head scaled dot-product self-attention.

    For this word, which other words matter, and how much?
    Every word in a sentence,
        * Looks at every other word
        * Scores their relevance
        * Collects useful information
    """

    def __init__(self, d_model):
        super().__init__()

        # (Q) Query   :  What am I looking for?       - search for one thing
        # (K) Key     :  What do I contain?           - match based on another key
        # (V) Value   :  What information do I give?  - extract something else

        # learnable linear projections
        self.q = nn.Linear(d_model, d_model)
        self.k = nn.Linear(d_model, d_model)
        self.v = nn.Linear(d_model, d_model)

        # keeps the dot products in a range where softmax is not saturated
        self.scale = math.sqrt(d_model)

    def forward(self, x, mask=None):
        """
        Args:
            x: input of shape (..., seq_len, d_model).
            mask: optional tensor of shape (..., seq_len), truthy for positions
                that may be attended to and falsy for positions to ignore
                (e.g. ``token_ids != pad_id``). It must live on the same device
                as ``x``. ``None`` (default) attends to every position, which is
                the original behaviour. A sequence whose positions are all
                masked produces NaNs, because softmax has nothing to normalise.

        Returns:
            Tensor with the same shape as ``x``.
        """
        Q = self.q(x)
        K = self.k(x)
        V = self.v(x)

        # Attention scores: Q * K^T / sqrt(d_model) -> (..., seq_len, seq_len)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        if mask is not None:
            # Broadcast over the query axis so every query ignores the same keys.
            ignored = ~mask.bool().unsqueeze(-2)
            scores = scores.masked_fill(ignored, float("-inf"))

        weights = torch.softmax(scores, dim=-1)

        return torch.matmul(weights, V)
        

In [25]:
class TransformerBlock(nn.Module):
    """
    One encoder layer: self-attention followed by a position-wise MLP.

    Each of the two sub-layers is wrapped in a residual connection and then
    layer-normalized.

    Args:
        d_model: width of the token representations.
        ff_dim: hidden width of the feed-forward network.
    """

    def __init__(self, d_model, ff_dim):
        super().__init__()

        # Sub-layer 1: lets every token gather information from the other tokens.
        self.attention = SelfAttention(d_model)
        self.norm1 = nn.LayerNorm(d_model)

        # Sub-layer 2: two-layer MLP applied to each position independently.
        self.ff = nn.Sequential(
            nn.Linear(d_model, ff_dim),
            nn.ReLU(),  # non-linearity: max(0, n)
            nn.Linear(ff_dim, d_model),
        )
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Residual connection keeps the incoming information, then normalize.
        x = self.norm1(x + self.attention(x))

        # Same pattern around the feed-forward network.
        return self.norm2(x + self.ff(x))

In [26]:
class TransformerClassifier(nn.Module):
    """
    Full transformer-based text classifier.

    Pipeline: token ids -> embedding -> + positional encoding ->
    ``num_layers`` TransformerBlocks -> mean over the sequence axis -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: embedding / hidden width.
        ff_dim: hidden width of each block's feed-forward network.
        num_layers: number of stacked TransformerBlocks.
        num_classes: number of output logits.
        max_seq_len: number of positions in the positional-encoding table.
            Defaults to ``CONFIG["max_seq_len"]``, the value that used to be
            hard-coded, so existing calls behave exactly as before.
    """

    def __init__(self, vocab_size, d_model, ff_dim, num_layers, num_classes,
                 max_seq_len=None):
        super().__init__()

        if max_seq_len is None:
            max_seq_len = CONFIG["max_seq_len"]

        # Token embedding layer
        self.embedding = nn.Embedding(vocab_size, d_model)

        # Positional encoding
        self.position_encoding = PositionalEncoding(d_model, max_seq_len)

        self.layers = nn.ModuleList([
            TransformerBlock(d_model, ff_dim)
            for _ in range(num_layers)
        ])

        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):  # x - token IDs, indexed as (batch_size, seq_len)
        # Look up an embedding vector for every token id
        x = self.embedding(x)

        # Add the positional information
        x = self.position_encoding(x)

        # Run the stack of transformer blocks
        for layer in self.layers:
            x = layer(x)

        # Sequence pooling: average over dim 1 (every position, padding included)
        x = x.mean(dim=1)

        # Classification head; the output is raw LOGITS
        return self.classifier(x)

In [27]:
# Build the classifier with this notebook's hyper-parameters ...
model = TransformerClassifier(
    vocab_size=CONFIG["max_vocab_size"],
    d_model=128,    # embedding width
    ff_dim=256,     # hidden width of the feed-forward network
    num_layers=2,
    num_classes=CONFIG["num_classes"],
)

# ... and move its parameters and buffers to the configured device (cpu or gpu).
model = model.to(CONFIG["device"])

In [28]:
# Display the module tree to check layer names and sizes.
model

TransformerClassifier(
  (embedding): Embedding(20000, 128)
  (position_encoding): PositionalEncoding()
  (layers): ModuleList(
    (0-1): 2 x TransformerBlock(
      (attention): SelfAttention(
        (q): Linear(in_features=128, out_features=128, bias=True)
        (k): Linear(in_features=128, out_features=128, bias=True)
        (v): Linear(in_features=128, out_features=128, bias=True)
      )
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (ff): Sequential(
        (0): Linear(in_features=128, out_features=256, bias=True)
        (1): ReLU()
        (2): Linear(in_features=256, out_features=128, bias=True)
      )
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    )
  )
  (classifier): Linear(in_features=128, out_features=4, bias=True)
)

In [29]:
# Pull a single mini-batch from the training loader for a forward-pass smoke test.
batch = next(iter(train_loader)) # 8 samples in one batch

In [30]:
# Split the batch into token ids (x) and labels (y).
x, y = batch

# Only the inputs are moved to the device; y is not used by the forward pass below.
x = x.to(CONFIG["device"])


In [31]:
# Expect (batch_size, max_seq_len).
x.shape

torch.Size([8, 128])

In [32]:
logits = model(x) # raw class scores; no sigmoid/softmax has been applied

In [33]:
logits.shape # (samples per batch, classes) -> 8 samples, 4 classes

torch.Size([8, 4])

In [34]:
# Inspect the untrained model's raw scores for the batch.
logits


tensor([[-0.5783, -0.3435,  0.1400, -0.0409],
        [-0.5420, -0.3551,  0.1689, -0.0425],
        [-0.5654, -0.3405,  0.1740,  0.0239],
        [-0.6551, -0.3964,  0.1241,  0.0030],
        [-0.5374, -0.3200,  0.1583, -0.0247],
        [-0.4602, -0.3170,  0.2637, -0.0442],
        [-0.5768, -0.2853,  0.1379, -0.0125],
        [-0.6786, -0.3605,  0.1875,  0.0436]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

In [35]:
# Test set: wrapped exactly like the training split.
test_dataset = AGNewsDataset(test_data)

# No shuffling, so batches come in the same order on every evaluation pass.
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [36]:
# Loss function: cross-entropy, fed with the model's raw logits and the integer
# class labels (see train_one_epoch / evaluate).

criterion = nn.CrossEntropyLoss()

In [37]:

# Adam over every model parameter, with a learning rate of 3e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

### Training loop

In [38]:
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: module mapping a batch of inputs to class logits.
        dataloader: iterable of ``(x, y)`` tensor batches.
        optimizer: optimizer that updates ``model``'s parameters.
        criterion: loss called as ``criterion(logits, y)``; assumed to return
            the mean loss of the batch (the default reduction of
            ``nn.CrossEntropyLoss``).
        device: device the batches are moved to before the forward pass.

    Returns:
        tuple[float, float]: ``(avg_loss, accuracy)`` where ``avg_loss`` is the
        mean loss per sample and ``accuracy`` the fraction of correct predictions.

    Raises:
        ZeroDivisionError: if the dataloader yields no samples.
    """
    model.train()

    loss_sum = 0.0
    correct = 0
    total = 0

    for x, y in dataloader:
        x = x.to(device)
        y = y.to(device)

        # 1. Forward pass
        logits = model(x)

        # 2. Compute loss (model outputs vs. true labels)
        loss = criterion(logits, y)

        # 3. Backpropagation: clear old gradients, compute new ones, update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # 4. Metrics. Weight each batch's mean loss by its size so that a
        #    smaller final batch does not distort the epoch average.
        batch_size = y.size(0)
        loss_sum += loss.item() * batch_size

        predictions = torch.argmax(logits, dim=1)
        correct += (predictions == y).sum().item()
        total += batch_size

    avg_loss = loss_sum / total
    accuracy = correct / total

    return avg_loss, accuracy
        

In [39]:
def evaluate(model, dataloader, criterion, device):
    """Measure loss and accuracy over ``dataloader`` without updating the model.

    Args:
        model: module mapping a batch of inputs to class logits.
        dataloader: iterable of ``(x, y)`` tensor batches.
        criterion: loss called as ``criterion(logits, y)``; assumed to return
            the mean loss of the batch (the default reduction of
            ``nn.CrossEntropyLoss``).
        device: device the batches are moved to before the forward pass.

    Returns:
        tuple[float, float]: ``(avg_loss, accuracy)`` where ``avg_loss`` is the
        mean loss per sample and ``accuracy`` the fraction of correct predictions.

    Raises:
        ZeroDivisionError: if the dataloader yields no samples.
    """
    model.eval()

    loss_sum = 0.0
    correct = 0
    total = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for x, y in dataloader:
            x = x.to(device)
            y = y.to(device)

            logits = model(x)
            loss = criterion(logits, y)

            # Weight each batch's mean loss by its size so that a smaller
            # final batch does not distort the overall average.
            batch_size = y.size(0)
            loss_sum += loss.item() * batch_size

            predictions = torch.argmax(logits, dim=1)
            correct += (predictions == y).sum().item()
            total += batch_size

    avg_loss = loss_sum / total
    accuracy = correct / total

    return avg_loss, accuracy


In [40]:
EPOCHS = 5  # number of full passes over the training split

for epoch in range(EPOCHS):
    # One optimisation pass over the training data.
    train_loss, train_acc = train_one_epoch(
        model, train_loader, optimizer, criterion, CONFIG["device"]
    )

    # NOTE(review): the "Val" metrics are computed on test_loader, i.e. the test
    # split doubles as the validation set in this notebook.
    val_loss, val_acc = evaluate(
        model, test_loader, criterion, CONFIG["device"]
    )

    print(
        f"Epoch {epoch+1}/{EPOCHS} | "
        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
        f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}"
    )


Epoch 1/5 | Train Loss: 0.5043, Train Acc: 0.8116 | Val Loss: 0.3373, Val Acc: 0.8834
Epoch 2/5 | Train Loss: 0.2899, Train Acc: 0.8988 | Val Loss: 0.2893, Val Acc: 0.9043
Epoch 3/5 | Train Loss: 0.2326, Train Acc: 0.9195 | Val Loss: 0.2833, Val Acc: 0.9025
Epoch 4/5 | Train Loss: 0.1960, Train Acc: 0.9328 | Val Loss: 0.2707, Val Acc: 0.9083
Epoch 5/5 | Train Loss: 0.1697, Train Acc: 0.9412 | Val Loss: 0.2793, Val Acc: 0.9096


In [52]:
# Final evaluation on test_loader, the same loader used for the per-epoch
# "Val" metrics in the training loop.
test_loss, test_acc = evaluate(model, test_loader, criterion, CONFIG["device"])

In [54]:
# Average loss returned by evaluate() on the test loader.
test_loss

0.27926651299600247

In [55]:
# Fraction of test samples classified correctly.
test_acc

0.9096052631578947