### Transformer Model Pipeline

#### Imports

In [1]:
# System dependencies
import os
import sys
import re
import math

# Torch dependencies
import torch
import torch.nn as nn

# Data loaders
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset

# Utils
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

#### Config

In [None]:
# Central settings dictionary read by the cells below.
CONFIG = {
    # --- Data ---
    "num_classes": 4,

    # --- NLP / vocabulary ---
    "max_vocab_size": 20000,
    "pad_token": "<pad>",
    "unk_token": "<unk>",

    # --- System ---
    "seed": 42,
    # Use the GPU when one is visible to torch, otherwise fall back to the CPU.
    "device": (
        torch.device("cuda")
        if torch.cuda.is_available()
        else torch.device("cpu")
    ),
}

In [4]:
def setup():
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch with ``CONFIG["seed"]``
    so that results are repeatable no matter which library draws the random
    numbers (e.g. shuffling through NumPy vs. weight init through torch).
    """
    # Local import: `random` is not among the notebook-level imports.
    import random

    seed = CONFIG["seed"]
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


setup()

#### Dataset

In [7]:
# Load the "ag_news" dataset and keep handles to its train and test splits.
dataset = load_dataset("ag_news")
train_data, test_data = dataset["train"], dataset["test"]

# Integer class id -> human-readable topic name.
label_map = dict(enumerate(("World", "Sports", "Business", "Sci/Tech")))

#### Natural Language Processing

In [None]:
### Tokenizer

def tokenize(text):
    """Lowercase ``text`` and split it on whitespace.

    Punctuation is not stripped, so it stays attached to the neighbouring
    word (e.g. "St." becomes "st.").

    Returns:
        A list of lowercase tokens; an empty list for empty/blank input.
    """
    lowered = text.lower()
    return lowered.split()


In [18]:
# Peek at the raw text of the first training sample.
train_data[0]['text']

"Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."

In [17]:
# Run the tokenizer on the first training sample's text.
tokenize(train_data[0]['text'])

['wall',
 'st.',
 'bears',
 'claw',
 'back',
 'into',
 'the',
 'black',
 '(reuters)',
 'reuters',
 '-',
 'short-sellers,',
 'wall',
 "street's",
 'dwindling\\band',
 'of',
 'ultra-cynics,',
 'are',
 'seeing',
 'green',
 'again.']

#### Building Vocabulary

In [None]:
# Word -> number of occurrences across every text in the training split.
counter = Counter(
    token
    for item in train_data
    for token in tokenize(item["text"])
)

In [23]:
# Preview the first 7 entries; items() yields (word, count) pairs, i.e. (key, value).
for position, (token, count) in enumerate(counter.items()):
    if position >= 7:
        break
    print(f"{token} : {count}")

wall : 1375
st. : 1192
bears : 344
claw : 17
back : 3868
into : 6628
the : 203234


In [26]:
# Number of distinct tokens that were counted.
len(counter)

158733

In [30]:
# Configured vocabulary size limit.
CONFIG["max_vocab_size"]

20000

In [None]:
# Keep only the most frequent words, leaving two vocabulary slots free
# for the special PAD and UNK tokens.
most_common = counter.most_common(n=CONFIG["max_vocab_size"] - 2)

len(most_common)

19998

In [32]:
# Show the configured padding and unknown-word tokens.
CONFIG["pad_token"], CONFIG["unk_token"]

('<pad>', '<unk>')

In [31]:
# Start the vocabulary with the special tokens:
#   id 0 -> PAD, used to pad all sentences to the same length
#   id 1 -> UNK, used for tokens that are not in the vocabulary
vocab = {
    special: idx
    for idx, special in enumerate((CONFIG["pad_token"], CONFIG["unk_token"]))
}

In [34]:
# Assign ids to the most frequent words, starting at 2 because
# ids 0 and 1 are already taken by the special tokens.
vocab.update(
    (token, idx)
    for idx, (token, _count) in enumerate(most_common, start=2)
)


In [37]:
# Sanity check: the vocabulary size should equal the configured max_vocab_size.
len(vocab) == CONFIG["max_vocab_size"]

True