# Introduction

1. Core idea

We will use a deep-learning toolkit (PyTorch), since it makes it feasible to apply all the necessary techniques. By the end of this lesson you will have: \
- custom tokenizers \
- all self-written blocks \
- all the linear algebra manipulations in a nutshell \
Prepare for the hands-on coding!


2. Resources and requirements

  *-* PyTorch \
  *-* Hugging Face \
  *-* NLTK, re


3. Outline

* encoder \
* decoder \
* encoder-decoder models


4. Restrictions

This is a small-scale reconstruction and is not even close to GPT/BERT.


In [None]:
# essential toolkit

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset

import math
import numpy as np
import matplotlib.pyplot as plt

## Multihead Attention module

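Construct it, assuming that `d_K` == `d_V` (the per-head key and value dimensions respectively). Each of the query/key/value Linear layers takes `d_model` input features and produces `d_K * n_heads` output features; the final Linear layer maps those `d_K * n_heads` features back to `d_model`.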

In this part we will implement the `Attention` itself:

$$Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V$$

While

$$Q = W^{(Q)}X$$
$$K = W^{(K)}X$$
$$V = W^{(V)}X$$

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with a final output projection.

    Queries, keys and values are each projected from `d_model` features to
    `n_heads * d_K` features, attention is computed independently per head,
    and the concatenated heads are projected back to `d_model`.

    Args:
        d_K: per-head dimension, shared by queries, keys and values.
        d_model: number of input (and output) features.
        n_heads: number of attention heads.
    """

    def __init__(self, d_K: int, d_model: int, n_heads: int) -> None:
        super().__init__()
        # d_K = d_V
        self.d_K = d_K
        self.n_heads = n_heads
        self.key = nn.Linear(in_features=d_model,
                             out_features=d_K * n_heads)
        self.query = nn.Linear(in_features=d_model,
                               out_features=d_K * n_heads)
        self.value = nn.Linear(in_features=d_model,
                               out_features=d_K * n_heads)

        # final fully connected linear layer
        self.fc = nn.Linear(in_features=d_K * n_heads,
                            out_features=d_model)

    def forward(self, query, key, value, mask=None):
        """Attend from `query` positions to `key`/`value` positions.

        Args:
            query: tensor of shape (N, T_q, d_model).
            key: tensor of shape (N, T_k, d_model).
            value: tensor of shape (N, T_k, d_model).
            mask: optional tensor of shape (N, T_k); key positions whose mask
                entry equals 0 receive a logit of -inf and therefore zero
                attention weight.

        Returns:
            Tensor of shape (N, T_q, d_model).
        """
        query = self.query(query)  # N x T_q x (h * d_K)
        key = self.key(key)        # N x T_k x (h * d_K)
        value = self.value(value)  # N x T_k x (h * d_K)

        # The query and the key/value sequences may have different lengths
        # (e.g. cross-attention), so each tensor is reshaped with its own length.
        N, T_q = query.shape[0], query.shape[1]
        T_k = key.shape[1]
        T_v = value.shape[1]

        # split the heads and move them in front of the time axis:
        # N x T x H x d_K -> N x H x T x d_K
        query = query.view(N, T_q, self.n_heads, self.d_K).transpose(1, 2)
        key = key.view(N, T_k, self.n_heads, self.d_K).transpose(1, 2)
        value = value.view(N, T_v, self.n_heads, self.d_K).transpose(1, 2)

        # (N x H x T_q x d_K) x (N x H x d_K x T_k) -> (N, H, T_q, T_k)
        attention_logits = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_K)
        if mask is not None:
            # broadcast the (N, T_k) mask over heads and query positions
            attention_logits = attention_logits.masked_fill(
                mask=mask[:, None, None, :] == 0,
                value=float('-inf')
            )
        attention_weights = F.softmax(attention_logits, dim=-1)

        # attention-weighted sum of the values: (N, H, T_q, d_K)
        attention = torch.matmul(attention_weights, value)

        # merge the heads back before the final FC layer:
        # (N x H x T_q x d_K) -> (N x T_q x H x d_K) -> (N x T_q x (H * d_K))
        attention = attention.transpose(1, 2)
        attention = attention.contiguous().view(N, T_q, self.d_K * self.n_heads)

        return self.fc(attention)

## Transformer Block

The next big step is to wrap the already written `MultiHeadAttention` together with `LayerNorm` layers, a feed-forward network with an activation, and dropout.

In [None]:
class TransformerBlock(nn.Module):
    """One transformer layer: self-attention and a feed-forward network.

    Each sub-layer is wrapped in a residual connection followed by its own
    LayerNorm (post-norm); dropout is applied to the block output.

    Args:
        d_K: per-head dimension passed to `MultiHeadAttention`.
        d_model: number of features of the input and output.
        n_heads: number of attention heads.
        dropout_rate: dropout probability used inside the feed-forward
            network and on the block output.
    """

    def __init__(self, d_K: int, d_model: int, n_heads: int, dropout_rate: float = 0.1):
        super().__init__()

        self.layer_norm_1 = nn.LayerNorm(normalized_shape=d_model)
        self.layer_norm_2 = nn.LayerNorm(normalized_shape=d_model)
        self.mh_attention = MultiHeadAttention(d_K=d_K,
                                               d_model=d_model,
                                               n_heads=n_heads)
        # position-wise feed-forward network with a 4x wider hidden layer
        self.network = nn.Sequential(
            nn.Linear(in_features=d_model, out_features=d_model * 4),
            nn.GELU(),
            nn.Linear(in_features=d_model * 4, out_features=d_model),
            nn.Dropout(p=dropout_rate)
        )
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x, mask=None):
        """Apply the block to `x`; `mask` is forwarded to the attention module."""
        # self-attention sub-layer: query, key and value are all `x`
        x = self.layer_norm_1(x + self.mh_attention(x, x, x, mask))
        # feed-forward sub-layer uses its own normalisation (layer_norm_2);
        # re-using layer_norm_1 here would tie both sub-layers to one set of
        # affine parameters and leave layer_norm_2 untrained
        x = self.layer_norm_2(x + self.network(x))
        x = self.dropout(x)
        return x



## Positional Encoding

$$PE_{(pos, 2i)} = sin(\frac{pos}{10000^{\frac{2i}{d_{model}}}})$$

$$PE_{(pos, 2i+1)} = cos(\frac{pos}{10000^{\frac{2i}{d_{model}}}})$$

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    Even feature indices hold sin(pos / 10000^(2i / d_model)) and odd feature
    indices hold the matching cosine. The table is precomputed for `max_len`
    positions and stored as a (non-trainable) buffer.

    Args:
        d_model: number of features per position (odd values are supported).
        max_len: number of positions to precompute.
        dropout_rate: dropout probability applied after adding the encoding.
    """

    def __init__(self, d_model: int, max_len: int = 2048, dropout_rate=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout_rate)
        position = torch.arange(max_len).unsqueeze(1)  # max_len x 1
        exp_term = torch.arange(0, d_model, 2)
        # 1 / 10000^(2i / d_model), computed in log-space
        div_term = torch.exp(exp_term * (-math.log(10000.0) / d_model))
        pos_enc = torch.zeros(1, max_len, d_model)
        pos_enc[0, :, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so only the first d_model // 2 frequencies are used here (a no-op
        # slice when d_model is even).
        pos_enc[0, :, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pos_enc', pos_enc)

    def forward(self, x):
        """Return `x` plus the encodings of its first `x.size(1)` positions.

        `x` has shape N x T x D, with T at most `max_len` and D == `d_model`.
        """
        # x.shape: N x T x D
        x = x + self.pos_enc[:, :x.size(1), :]
        return self.dropout(x)

## Encoder block

In [None]:
class Encoder(nn.Module):
    """Transformer encoder that maps a token-id sequence to class logits.

    Pipeline: token embedding -> positional encoding -> `n_layers`
    transformer blocks -> hidden state of the first position -> LayerNorm ->
    linear layer producing `n_classes` outputs.
    """

    def __init__(self,
                 vocab_size,
                 max_len,
                 d_K,
                 d_model,
                 n_heads,
                 n_layers,
                 n_classes,
                 dropout_rate):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encodding = PositionalEncoding(d_model, max_len, dropout_rate)
        # stack of identical blocks, stored as a Sequential container
        self.transformer_blocks = nn.Sequential(*(
            TransformerBlock(d_K, d_model, n_heads, dropout_rate)
            for _ in range(n_layers)
        ))
        self.norm = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model, n_classes)

    def forward(self, x, mask=None):
        """Return logits for token ids `x`; `mask` is handed to every block."""
        hidden = self.pos_encodding(self.embedding(x))
        # the container is iterated by hand because each block also needs the mask
        for layer in self.transformer_blocks:
            hidden = layer(hidden, mask)
        # many-to-one: hidden is (N x T x D), keep only the first position
        first_token = hidden[:, 0, :]
        # normalization and linear transformation
        return self.fc(self.norm(first_token))

In [None]:
# Small encoder used only to smoke-test the forward pass:
# 2 layers, 4 heads of size 16 (4 * 16 == d_model == 64), 5 output classes.
model = Encoder(
    vocab_size=20000,
    max_len=1024,
    d_K=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    n_classes=5,
    dropout_rate=0.1
)

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
model.to(device)
# dummy batch: 8 sequences of 512 random token ids
x = torch.randint(0, 20000, size=(8, 512)).to(device)

# padding mask: 1 for the first 256 positions, 0 for the rest
mask = torch.ones(8, 512, dtype=torch.float64)
mask[:, 256:] = 0
mask = mask.to(device)

In [None]:
# Forward pass of the dummy batch through the encoder, with its mask.
y = model(x, mask)

In [None]:
# One row of class logits per sequence: (batch size, n_classes).
y.shape

torch.Size([8, 5])

In [None]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15


In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [None]:
# Only the pretrained tokenizer of this checkpoint is loaded; the model itself
# is the Encoder defined in this notebook.
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset

In [None]:
# Load the "sst2" configuration of the "glue" dataset collection.
raw_data = load_dataset("glue", "sst2")

Downloading readme:   0%|          | 0.00/31.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [None]:
# Display the available splits with their columns and row counts.
raw_data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [None]:
def tokenizer_func(batch):
    """Run the notebook-level tokenizer over the 'sentence' column of `batch`.

    Truncation is enabled in the tokenizer call; the tokenizer output is
    returned unchanged.
    """
    sentences = batch['sentence']
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [None]:
# Tokenize every split, passing the examples to `tokenizer_func` in batches.
tokenized_dataset = raw_data.map(tokenizer_func, batched=True)
# Collator used later as `collate_fn` of the DataLoaders.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [None]:
# Display the collator configuration (tokenizer, padding settings).
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [None]:
# Display the splits again to see the columns added by tokenization.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [None]:
# Drop the columns that are not fed to the model and rename `label` to
# `labels`, the key the training loop reads from each batch.
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(["sentence", "idx"])
    .rename_column("label", "labels")
)

tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Training loader: batches of 32 examples, reshuffled on every pass.
train_dl = DataLoader(
    tokenized_dataset["train"],
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator
)

# Validation loader: same batch size, dataset order preserved.
valid_dl = DataLoader(
    tokenized_dataset["validation"],
    batch_size=32,
    shuffle=False,
    collate_fn=data_collator
)

# Sanity check: print the shape of every tensor in the first training batch,
# then stop iterating.
for batch in train_dl:
    for key, value in batch.items():
        print("key: ", key, "value.shape ", value.shape)
    break

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


key:  labels value.shape  torch.Size([32])
key:  input_ids value.shape  torch.Size([32, 50])
key:  attention_mask value.shape  torch.Size([32, 50])


In [None]:
# Distinct label values in the training split (determines n_classes).
set(tokenized_dataset["train"]["labels"])

{0, 1}

In [None]:
# Vocabulary size of the tokenizer; used below as the embedding table size.
tokenizer.vocab_size

28996

In [None]:
# Mapping from checkpoint name to maximum input length; used below as max_len.
tokenizer.max_model_input_sizes

{'distilbert-base-uncased': 512,
 'distilbert-base-uncased-distilled-squad': 512,
 'distilbert-base-cased': 512,
 'distilbert-base-cased-distilled-squad': 512,
 'distilbert-base-german-cased': 512,
 'distilbert-base-multilingual-cased': 512}

In [None]:
# Re-create the encoder for the real task: the vocabulary size and maximum
# length come from the tokenizer, and there are 2 output classes.
model = Encoder(
    vocab_size=tokenizer.vocab_size,
    max_len=tokenizer.max_model_input_sizes[checkpoint],
    d_K=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    n_classes=2,
    dropout_rate=0.1
)

model.to(device)

Encoder(
  (embedding): Embedding(28996, 64)
  (pos_encodding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (layer_norm_1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (layer_norm_2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mh_attention): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (network): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (la

In [None]:
# Loss and optimizer: cross-entropy on the class logits, Adam over all
# parameters of the model.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
from datetime import datetime

In [None]:
def _model_device(model):
    """Return the device of the model's first parameter (CPU if it has none)."""
    first_param = next(model.parameters(), None)
    return first_param.device if first_param is not None else torch.device("cpu")


def _run_epoch(model, criterion, batches, target_device, optimizer=None):
    """Run one pass over `batches` and return the sample-weighted mean loss.

    With an `optimizer` the model is put in training mode and updated after
    every batch; without one the model is put in eval mode and the pass runs
    with gradient tracking disabled. Returns NaN when `batches` is empty.
    """
    is_training = optimizer is not None
    model.train(is_training)
    # accumulate loss per batch, weighted by the batch length
    total_loss = 0.0
    # accumulate the lengths of the batches used
    n_samples = 0
    with torch.set_grad_enabled(is_training):
        for batch in batches:
            # move the batch to the device the model lives on
            batch = {k: v.to(target_device) for k, v in batch.items()}
            if is_training:
                optimizer.zero_grad()
            output = model(
                batch['input_ids'], batch['attention_mask']
            )
            loss = criterion(output, batch['labels'])
            if is_training:
                loss.backward()
                optimizer.step()
            batch_size = batch['input_ids'].size(0)
            total_loss += loss.item() * batch_size
            n_samples += batch_size
    return total_loss / n_samples if n_samples else float("nan")


def train(model, criterion, optimizer, train_dl, valid_dl, epochs):
    """Train `model` for `epochs` epochs, validating after each one.

    Args:
        model: module called as `model(input_ids, attention_mask)`.
        criterion: loss called as `criterion(output, labels)`.
        optimizer: optimizer over the model's parameters.
        train_dl: iterable of batches; each batch is a dict of tensors with
            the keys 'input_ids', 'attention_mask' and 'labels'.
        valid_dl: iterable of batches in the same format, used for evaluation.
        epochs: number of passes over `train_dl`.

    Returns:
        Tuple `(train_losses, valid_losses)` of NumPy arrays of length
        `epochs` holding the average per-sample loss of every epoch.
    """
    target_device = _model_device(model)
    train_losses = np.zeros(epochs)
    valid_losses = np.zeros(epochs)

    for iteration in range(epochs):
        tic = datetime.now()

        # one optimisation pass, then one gradient-free evaluation pass
        train_loss = _run_epoch(model, criterion, train_dl, target_device, optimizer)
        test_loss = _run_epoch(model, criterion, valid_dl, target_device)

        # save the results
        train_losses[iteration] = train_loss
        valid_losses[iteration] = test_loss

        iter_time = datetime.now() - tic

        print(f"Epoch {iteration + 1}/{epochs}, Train_loss: {train_loss:.4f} |"
              f"Test Loss: {test_loss:.4f}, Duration: {iter_time}")

    return train_losses, valid_losses





In [None]:
# Train for 5 epochs; the returned arrays hold the per-epoch average losses.
train_loss, val_loss = train(
    model, criterion, optimizer, train_dl, valid_dl, epochs=5
)

Epoch 1/5, Train_loss: 0.5225 |Test Loss: 0.5054, Duration: 0:00:18.348250
Epoch 2/5, Train_loss: 0.3596 |Test Loss: 0.5113, Duration: 0:00:19.072927
Epoch 3/5, Train_loss: 0.2958 |Test Loss: 0.5235, Duration: 0:00:22.374188
Epoch 4/5, Train_loss: 0.2548 |Test Loss: 0.5461, Duration: 0:00:29.330226
Epoch 5/5, Train_loss: 0.2291 |Test Loss: 0.4900, Duration: 0:00:25.931266


In [None]:
# accuracy

def _accuracy(model, data_loader):
    """Return the fraction of samples in `data_loader` that `model` classifies correctly.

    The model is switched to eval mode and the pass runs without gradient
    tracking, since only predictions are needed.
    """
    n_correct = 0
    n_total = 0
    model.eval()
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            output = model(batch['input_ids'], batch['attention_mask'])
            # predicted class = index of the largest logit
            _, predictions = torch.max(output, dim=1)
            n_correct += (predictions == batch['labels']).sum().item()
            n_total += batch['labels'].shape[0]
    return n_correct / n_total


# train acc
train_acc = _accuracy(model, train_dl)

# valid acc
valid_acc = _accuracy(model, valid_dl)


In [None]:
# Report the fraction of correctly classified samples on each split.
print("train_acc: ", train_acc)
print("valid_acc: ", valid_acc)

train_acc:  0.9404000059392121
valid_acc:  0.8222477064220184
