# Transformer from scratch yusing PyTorch
Implementing the Transformer architecture from scratch
![Transformer](https://madewithml.com/static/images/foundations/transformers/architecture.png)

In [9]:
#setup 

import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn

SEED = 1234

def set_seeds(seed=1234):
    """Set seeds for reproducibility."""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # multi-GPU

# Set seeds for reproducibility
set_seeds(seed=SEED)

# Set device
cuda = True
device = torch.device("cuda" if (
    torch.cuda.is_available() and cuda) else "cpu")
torch.set_default_tensor_type("torch.FloatTensor")
if device.type == "cuda":
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
print("Device is:",device)


Device is: cpu


In [11]:
# Load data
url = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/news.csv"
df = pd.read_csv(url, header=0) # load
df = df.sample(frac=1).reset_index(drop=True) # shuffle
df.head()

# Reduce data size (too large to fit in Colab's limited memory)
df = df[:10000]
print (len(df))

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

nltk.download("stopwords")
STOPWORDS = stopwords.words("english")
print (STOPWORDS[:5])
porter = PorterStemmer()


def preprocess(text, stopwords=STOPWORDS):
    """Conditional preprocessing on our text unique to our task."""
    # Lower
    text = text.lower()

    # Remove stopwords
    pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
    text = pattern.sub("", text)

    # Remove words in parenthesis
    text = re.sub(r"\([^)]*\)", "", text)

    # Spacing and filters
    text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)
    text = re.sub("[^A-Za-z0-9]+", " ", text) # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()

    return text

# Sample
text = "Great week for the NYSE!"
preprocess(text=text)

# Apply to dataframe
preprocessed_df = df.copy()
preprocessed_df.title = preprocessed_df.title.apply(preprocess)
print (f"{df.title.values[0]}\n\n{preprocessed_df.title.values[0]}")


import collections
from sklearn.model_selection import train_test_split
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15
def train_val_test_split(X, y, train_size):
    """Split dataset into data splits."""
    X_train, X_, y_train, y_ = train_test_split(X, y, train_size=TRAIN_SIZE, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5, stratify=y_)
    return X_train, X_val, X_test, y_train, y_val, y_test

# Data
X = preprocessed_df["title"].values
y = preprocessed_df["category"].values

# Create data splits
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    X=X, y=y, train_size=TRAIN_SIZE)
print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print (f"X_test: {X_test.shape}, y_test: {y_test.shape}")
print (f"Sample point: {X_train[0]} → {y_train[0]}")


10000


[nltk_data] Downloading package stopwords to /home/gitpod/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i', 'me', 'my', 'myself', 'we']
Bucs Trade Keenan McCardell to Chargers (AP)

bucs trade keenan mccardell chargers
X_train: (7000,), y_train: (7000,)
X_val: (1500,), y_val: (1500,)
X_test: (1500,), y_test: (1500,)
Sample point: net developer guide windows security → Sci/Tech


In [12]:
# label encoder

import itertools
class LabelEncoder(object):
    """Label encoder for tag labels."""
    def __init__(self, class_to_index={}):
        self.class_to_index = class_to_index or {}  # mutable defaults ;)
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        classes = np.unique(y)
        for i, class_ in enumerate(classes):
            self.class_to_index[class_] = i
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())
        return self

    def encode(self, y):
        y_one_hot = np.zeros((len(y), len(self.class_to_index)), dtype=int)
        for i, item in enumerate(y):
            y_one_hot[i][self.class_to_index[item]] = 1
        return y_one_hot

    def decode(self, y):
        classes = []
        for i, item in enumerate(y):
            index = np.where(item == 1)[0][0]
            classes.append(self.index_to_class[index])
        return classes

    def save(self, fp):
        with open(fp, "w") as fp:
            contents = {'class_to_index': self.class_to_index}
            json.dump(contents, fp, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        with open(fp, "r") as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)

# Encode
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
NUM_CLASSES = len(label_encoder)
label_encoder.class_to_index


{'Business': 0, 'Sci/Tech': 1, 'Sports': 2, 'World': 3}

In [16]:
# Class weights
counts = np.bincount([label_encoder.class_to_index[class_] for class_ in y_train])
class_weights = {i: 1.0/count for i, count in enumerate(counts)}
print (f"counts: {counts}\nweights: {class_weights}")

counts: [1737 1804 1692 1767]
weights: {0: 0.0005757052389176742, 1: 0.0005543237250554324, 2: 0.000591016548463357, 3: 0.0005659309564233164}


In [17]:
# Convert labels to tokens
print (f"y_train[0]: {y_train[0]}")
y_train = label_encoder.encode(y_train)
y_val = label_encoder.encode(y_val)
y_test = label_encoder.encode(y_test)
print (f"y_train[0]: {y_train[0]}")
print (f"decode([y_train[0]]): {label_encoder.decode([y_train[0]])}")


y_train[0]: Sci/Tech
y_train[0]: [0 1 0 0]
decode([y_train[0]]): ['Sci/Tech']


In [18]:
#tokenizer 
from transformers import DistilBertTokenizer
from transformers import BertTokenizer

# Load tokenizer and model
# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
vocab_size = len(tokenizer)
print (vocab_size)

# Tokenize inputs
encoded_input = tokenizer(X_train.tolist(), return_tensors="pt", padding=True)
X_train_ids = encoded_input["input_ids"]
X_train_masks = encoded_input["attention_mask"]
print (X_train_ids.shape, X_train_masks.shape)
encoded_input = tokenizer(X_val.tolist(), return_tensors="pt", padding=True)
X_val_ids = encoded_input["input_ids"]
X_val_masks = encoded_input["attention_mask"]
print (X_val_ids.shape, X_val_masks.shape)
encoded_input = tokenizer(X_test.tolist(), return_tensors="pt", padding=True)
X_test_ids = encoded_input["input_ids"]
X_test_masks = encoded_input["attention_mask"]
print (X_test_ids.shape, X_test_masks.shape)


  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|██████████| 228k/228k [00:00<00:00, 1.23MB/s]
Downloading: 100%|██████████| 385/385 [00:00<00:00, 340kB/s]


31090
torch.Size([7000, 26]) torch.Size([7000, 26])
torch.Size([1500, 22]) torch.Size([1500, 22])
torch.Size([1500, 26]) torch.Size([1500, 26])


In [19]:
# Decode
print (f"{X_train_ids[0]}\n{tokenizer.decode(X_train_ids[0])}")

# Sub-word tokens
print (tokenizer.convert_ids_to_tokens(ids=X_train_ids[0]))


tensor([  102,  3657, 18927,  6862,  9230,  3594,   103,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0])
[CLS] net developer guide windows security [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
['[CLS]', 'net', 'developer', 'guide', 'windows', 'security', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [20]:
class TransformerTextDataset(torch.utils.data.Dataset):
    def __init__(self, ids, masks, targets):
        self.ids = ids
        self.masks = masks
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __str__(self):
        return f"<Dataset(N={len(self)})>"

    def __getitem__(self, index):
        ids = torch.tensor(self.ids[index], dtype=torch.long)
        masks = torch.tensor(self.masks[index], dtype=torch.long)
        targets = torch.FloatTensor(self.targets[index])
        return ids, masks, targets

    def create_dataloader(self, batch_size, shuffle=False, drop_last=False):
        return torch.utils.data.DataLoader(
            dataset=self,
            batch_size=batch_size,
            shuffle=shuffle,
            drop_last=drop_last,
            pin_memory=False)

# Create datasets
train_dataset = TransformerTextDataset(ids=X_train_ids, masks=X_train_masks, targets=y_train)
val_dataset = TransformerTextDataset(ids=X_val_ids, masks=X_val_masks, targets=y_val)
test_dataset = TransformerTextDataset(ids=X_test_ids, masks=X_test_masks, targets=y_test)
print ("Data splits:\n"
    f"  Train dataset:{train_dataset.__str__()}\n"
    f"  Val dataset: {val_dataset.__str__()}\n"
    f"  Test dataset: {test_dataset.__str__()}\n"
    "Sample point:\n"
    f"  ids: {train_dataset[0][0]}\n"
    f"  masks: {train_dataset[0][1]}\n"
    f"  targets: {train_dataset[0][2]}")

# Create dataloaders
batch_size = 128
train_dataloader = train_dataset.create_dataloader(
    batch_size=batch_size)
val_dataloader = val_dataset.create_dataloader(
    batch_size=batch_size)
test_dataloader = test_dataset.create_dataloader(
    batch_size=batch_size)
batch = next(iter(train_dataloader))
print ("Sample batch:\n"
    f"  ids: {batch[0].size()}\n"
    f"  masks: {batch[1].size()}\n"
    f"  targets: {batch[2].size()}")


Data splits:
  Train dataset:<Dataset(N=7000)>
  Val dataset: <Dataset(N=1500)>
  Test dataset: <Dataset(N=1500)>
Sample point:
  ids: tensor([  102,  3657, 18927,  6862,  9230,  3594,   103,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0])
  masks: tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0])
  targets: tensor([0., 1., 0., 0.])
Sample batch:
  ids: torch.Size([128, 26])
  masks: torch.Size([128, 26])
  targets: torch.Size([128, 4])


  ids = torch.tensor(self.ids[index], dtype=torch.long)
  masks = torch.tensor(self.masks[index], dtype=torch.long)


In [29]:
#Trainer
from torch.utils.data.dataloader import DataLoader

class Trainer(object):
    def __init__(self,model,device,loss_fn=None,optimizer=None,scheduler=None):
        """Initialize parameters"""
        # Set params
        self.model = model
        self.device = device
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler

    def train_step(self,dataloader:DataLoader):
        """Train step."""

        # set model to train mode
        self.model.train()
        loss = 0.0 #initialize loss to zero

        # Iterate over train batches
        for i,batch in enumerate(dataloader):
            # step
            batch = [item.to(self.device) for item in batch] #set device
            inputs,targets = batch[:-1],batch[-1]

            self.optimizer.zero_grad() #reset gradients
            z = self.model(inputs) # forward pass
            J = self.loss_fn(z,targets) # calculate loss
            J.backward() # backward pass
            self.optimizer.step() # Update weights

            # Cumulative Metrics
            loss += (J.detach.item() - loss) / (i + 1)
        
        return loss
    
    def eval_step(self,dataloader:DataLoader):
        """Validation or test step."""

        # set model to eval mode
        self.model.eval()
        loss = 0.0 #initialize loss to zero
        y_trues,y_probs = [],[]

        # Iterate over val batches
        with torch.inference_mode():
            for i,batch in enumerate(dataloader):
                # step
                batch = [item.to(self.device) for item in batch] #set device
                inputs,y_true = batch[:-1],batch[-1]

                z = self.model(inputs) # forward pass
                J = self.loss_fn(z,targets).item() # calculate loss

                # Cumulative Metrics
                loss += (J - loss) / (i + 1)

                # Store outputs
                y_prob = F.softmax(z).cpu().numpy()
                y_probs.extend(y_prob)
                y_trues.extend(y_true.cpu().numpy())
        
        return loss, np.vstack(y_trues), np.vstack(y_probs)
    
    def predict_step(self, dataloader:DataLoader):
        """Prediction step."""

        # Set model to eval mode
        self.model.eval()
        y_probs = []

        # Iterate over val batches
        with torch.inference_mode():
            for i, batch in enumerate(dataloader):

                # Forward pass w/ inputs
                inputs, targets = batch[:-1], batch[-1]
                z = self.model(inputs)

                # Store outputs
                y_prob = F.softmax(z).cpu().numpy()
                y_probs.extend(y_prob)

        return np.vstack(y_probs)

    def train(self, num_epochs, patience, train_dataloader:DataLoader, val_dataloader:DataLoader):
        best_val_loss = np.inf
        for epoch in range(num_epochs):
            # Steps
            train_loss = self.train_step(dataloader=train_dataloader)
            val_loss, _, _ = self.eval_step(dataloader=val_dataloader)
            self.scheduler.step(val_loss)

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = self.model
                _patience = patience  # reset _patience
            else:
                _patience -= 1
            if not _patience:  # 0
                print("Stopping early!")
                break

            # Logging
            print(
                f"Epoch: {epoch+1} | "
                f"train_loss: {train_loss:.5f}, "
                f"val_loss: {val_loss:.5f}, "
                f"lr: {self.optimizer.param_groups[0]['lr']:.2E}, "
                f"_patience: {_patience}"
            )
        return best_model
        

## Transformer
At first , let's learn about the components of the Transformer architecture and then implement one for our text classification task.

```
## Birds Eye View:(Top Down Approach)

Scaled Dot-Product Attention(Q,K,V)
                |
        Multi Head Attention
                |
             Encoder     
```           

#### Scaled dot-product attention
The most popular type of self-attention is scaled 
dot-product attention from widely-cited Attention is all you need paper.
This type of attention involves projecting our encoded input sequences(X)
into three matrices, queries(Q),keys(K) and values(V), whose weights we learn.

\begin{equation*}
  Q = XW_{q} where W_{q}\in \mathbb{R}^{HXd_{q}}
\end{equation*}

\begin{equation*}
  K = XW_{k} where W_{k}\in \mathbb{R}^{HXd_{k}}
\end{equation*}

\begin{equation*}
  V = XW_{v} where W_{V}\in \mathbb{R}^{HXd_{V}}
\end{equation*}

\begin{equation*}
  attention(Q,K,V) = softmax(\dfrac{QK^{T}}{\sqrt{d_{k}}})V \in \mathbb{R}^{MXd_{v}}
\end{equation*}

| Variable      | Description              |
| -----------   | -----------              |
|     X         | encoded inputs E R(NXMXH)|
|     N         | batch size               |
|     M         | max sequence length in the batch |
|     H         | hidden dim, model dim ,etc. |
|     Wq        | query weights E R(HXDq) |
|     Wk        | key weights E R(HXDk) |
|     Wv        | value weights E R(HXDv) |

#### Multi-head attention

Instead of applying self-attention only once accross
the entire encoded input,we can also separate
the input and apply self-attention in parallel(heads) to
each input section and concatenate them. This allows
the different head to learn unique representations while
maintaining the complexity since we split the input into smaller subspaces.

\begin{equation*}
MultiHead(Q,K,V) = concat(head_{1},..,head_{h})W_{O}
\end{equation*}

\begin{equation*}
head_{i} = attention(Q_{i},K_{i},V_{i})
\end{equation*}

| Variable      | Description              |
| -----------   | -----------              |
|     h         | number of attention heads|
|     Wo        | multi-head attention weights |
|     H         | hidden dim (or dimension of the model dmodel) |

#### Positional Encoding 
With self-attention, we aren't able to account for the sequential position of our input tokens. To address this, we can use positional encoding to create a representation of the location of each token with respect to the entire sequence. 
This can either be learned (with weights) or we can use a fixed function that can better extend to create positional encoding for lengths during inference that were not observed during training.

\begin{equation*}
PE_{(pos,2i)} = sin(pos/10000^{2i/H})
\end{equation*}

\begin{equation*}
PE_{(pos,2i+1)} = cos(pos/10000^{2i/H})
\end{equation*}

| Variable      | Description              |
| -----------   | -----------              |
|     pos       | position of the token(1...M)|
|     i         | hidden dim (1..H) |

This effectively allows us to represent each token's
relative position using a fixed function for very large sequences. And because we've constrained the positional encodings to have the same dimensions as our encoded inputs,we can simply concatenate them before feeding the into multi-head attention heads.


In [30]:
from transformers import BertModel

In [32]:
# transformer = BertModel.from_pretrained("distilbert-base-uncased")
# embedding_dim = transformer.config.dim
transformer = BertModel.from_pretrained("allenai/scibert_scivocab_uncased")
embedding_dim = transformer.config.hidden_size

Downloading: 100%|██████████| 442M/442M [00:05<00:00, 78.1MB/s] 
Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [48]:
class Transformer(nn.Module):
    def __init__(self, transformer, dropout_p, embedding_dim, num_classes):
        super(Transformer, self).__init__()
        self.transformer = transformer
        self.dropout = torch.nn.Dropout(dropout_p)
        self.fc1 = torch.nn.Linear(embedding_dim, num_classes)

    def forward(self, inputs):
        ids, masks = inputs
        seq, pool = self.transformer(input_ids=ids, attention_mask=masks)
        z = self.dropout(pool)
        z = self.fc1(z)
        return z


num_classes=NUM_CLASSES
# Initialize model
dropout_p = 0.5
model = Transformer(
    transformer=transformer, dropout_p=dropout_p,
    embedding_dim=embedding_dim, num_classes=num_classes)
model = model.to(device)
print(model.named_parameters)

<bound method Module.named_parameters of Transformer(
  (transformer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerN

In [49]:
# Training
lr = 1e-4
num_epochs = 10
patience = 10

# Define loss
class_weights_tensor = torch.Tensor(np.array(list(class_weights.values())))
loss_fn = nn.BCEWithLogitsLoss(weight=class_weights_tensor)

# Define optimizer & scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=5)

# Trainer module
trainer = Trainer(
    model=model, device=device, loss_fn=loss_fn,
    optimizer=optimizer, scheduler=scheduler)


In [50]:
# Train
best_model = trainer.train(num_epochs, patience, train_dataloader, val_dataloader)

  ids = torch.tensor(self.ids[index], dtype=torch.long)
  masks = torch.tensor(self.masks[index], dtype=torch.long)


TypeError: dropout(): argument 'input' (position 1) must be Tensor, not str