In [58]:
import torch as th
from torch import nn
from torch.nn import ModuleList
import numpy as np
import math
from einops import reduce, rearrange, einsum
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [16]:
from transformers import AutoTokenizer, pipeline

In [4]:
import pandas as pd

In [5]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read.
with open("data/Harry_Potter.txt", 'r') as file:
    content = file.read()

In [6]:
# Size of the corpus in characters.
len(content)

442745

In [33]:
from tokenizers import Tokenizer
from tokenizers.models import BPE

In [40]:
# Load the GPT-2 BPE tokenizer. "gpt2" is the checkpoint id of the smallest
# GPT-2 model on the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [52]:
# Register "[PAD]" as the padding token so that batches can be tokenized with
# padding=True. The displayed value is the number of tokens added.
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

1

In [54]:
# Tokenize two sample sentences as one padded batch of PyTorch tensors.
tokens = tokenizer(
    [
        "After each block, the token tensor is normalized along it's dimension.",
        "The normalization itself is a learnable module.",
    ],
    padding=True,
    truncation=True,
    return_tensors='pt',
)

In [101]:
# Token ids of the sample batch; the shorter sentence is padded at the end
# with the pad token id.
tokens["input_ids"]

tensor([[ 3260,  1123,  2512,    11,   262, 11241, 11192,   273,   318, 39279,
          1863,   340,   338, 15793,    13],
        [  464,  3487,  1634,  2346,   318,   257,  2193,   540,  8265,    13,
         50257, 50257, 50257, 50257, 50257]])

In [103]:
class TextData(Dataset):
    """Dataset of fixed-size text chunks that are tokenized once up front.

    The raw text is cut into consecutive, non-overlapping chunks of
    ``seq_len`` characters; a trailing remainder shorter than ``seq_len`` is
    dropped. All chunks are tokenized in a single call and each item of the
    dataset is the ``input_ids`` row of one chunk.

    Args:
        data: The raw text to split (any sliceable object with a length).
        seq_len: Chunk length, measured in elements of ``data`` (characters
            for a string), not in tokens.
        tokenizer: Callable invoked as
            ``tokenizer(chunks, padding=True, truncation=True, return_tensors='pt')``
            whose result supports ``["input_ids"]`` lookup.

    Raises:
        ValueError: If ``seq_len`` is not positive.
    """
    def __init__(self, data, seq_len, tokenizer):
        super().__init__()
        if seq_len <= 0:
            raise ValueError("seq_len must be a positive integer")
        self.seq_len = seq_len
        # Number of complete chunks that fit into the text.
        self.num_seq = len(data)//self.seq_len
        # Chunk start offsets run over the covered part of the text
        # (num_seq * seq_len elements), stepping one chunk at a time.
        chunks = [data[i:i+self.seq_len]
                  for i in range(0, self.num_seq*self.seq_len, self.seq_len)]
        self.data = tokenizer(chunks, padding=True, truncation=True, return_tensors='pt')
        # The sample count is the number of rows of input_ids, not the number
        # of fields in the tokenizer output.
        self.sample_size = len(self.data["input_ids"])
    def __len__(self):
        """Return the number of tokenized chunks."""
        return self.sample_size
    def __getitem__(self, idx):
        """Return the token ids of chunk ``idx``."""
        return self.data["input_ids"][idx]

In [74]:
"""
At the start of each attention and MLP block, the token tensor is normalized along its embedding (last) dimension.
The normalization itself is a learnable module.
"""

class LayerNorm(nn.Module):
    """Normalize the last axis of the input, then apply a learnable scale and shift.

    Args:
        dim: Size of the last axis; ``w`` (scale, initialised to ones) and
            ``b`` (shift, initialised to zeros) both have this length.
        eps: Constant added to the variance before the square root.
    """

    def __init__(self, dim, eps=1.e-5):
        super().__init__()
        self.dim = dim
        self.w = nn.Parameter(th.ones(dim))
        self.b = nn.Parameter(th.zeros(dim))
        self.eps = eps

    def forward(self, x):
        # Statistics are taken over the last axis only and keep that axis so
        # they broadcast against x.
        centered = x - x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        std = th.sqrt(variance + self.eps)
        scaled = self.w * centered
        return scaled / std + self.b

In [82]:
"""
The embedding block consists of a lookup matrix.
The matrix holds one embedding vector for each token in the vocabulary.
"""

class Embedding(nn.Module):
    """Learnable lookup table mapping token ids to embedding vectors.

    Args:
        vocab_dim: Number of rows in the table (one per token id).
        embed_dim: Length of each embedding vector.
    """
    def __init__(self, vocab_dim, embed_dim):
        super().__init__()
        self.vocab_dim = vocab_dim
        self.embed_dim = embed_dim
        self.embedding = nn.Parameter(th.empty(self.vocab_dim, self.embed_dim))
        nn.init.normal_(self.embedding, std=1.)
    def forward(self, tokens):
        """Return the embedding rows selected by ``tokens``.

        Token id ``i`` reads row ``i`` of the table, so ids are expected to be
        0-based integers below ``vocab_dim``. The output has the shape of
        ``tokens`` with an extra trailing axis of length ``embed_dim``.
        """
        # Index with the ids directly: subtracting one would send id 0 to the
        # last row (index -1) and shift every other id by one row.
        return self.embedding[tokens]

In [10]:
"""
The positional embedding is calculated once for the maximum context size and stored as a buffer.
Each position in the sequence is assigned a vector of interleaved sine and cosine values. The positional embedding is added to the input tensor.
"""

class PosEncoding(nn.Module):
    """Fixed sinusoidal positional encoding that is added to the input.

    The table is built once for ``max_seq_len`` positions and registered as
    the buffer ``encoding`` of shape ``(max_seq_len, dim)``.

    Args:
        dim: Length of each position vector.
        max_seq_len: Number of positions to precompute.
    """

    def __init__(self, dim, max_seq_len):
        super().__init__()
        # One row per position, one column per frequency.
        positions = th.arange(max_seq_len).reshape(-1, 1)
        even_idx = th.arange(0, dim, 2).reshape(1, -1)
        # Frequencies decay as 10 ** (-8 * i / dim) over the even indices i.
        inv_freq = th.exp(-8*even_idx*math.log(10)/dim)
        angles = positions*inv_freq

        table = th.zeros(max_seq_len, dim)
        # Sines fill the even columns, cosines the odd columns. For an odd
        # dim there is one cosine column fewer, hence the slice.
        table[:, 0::2] = angles.sin()
        table[:, 1::2] = angles[:, :dim//2].cos()
        self.register_buffer("encoding", table)

    def forward(self, x):
        # x.shape = (batch_size, seq_len, dim)
        n_positions = x.shape[1]
        return x + self.encoding[:n_positions, :]

In [11]:
'''
Takes a batched input of sequences of vectors of shape (batch_size, seq_len, embed_dim).
Each head independently calculates the masked attention pattern for every sequence in the batch.
The value vectors are weighted according to the attention patterns.
The per-head output vectors are concatenated along the vector dimension.
The concatenated vectors are linearly transformed to produce the output.
'''
class Attention(th.nn.Module):
    """Multi-head causal self-attention with a layer norm applied to the input.

    Args:
        num_heads: Number of attention heads.
        res_dim: Width of the residual stream; must be divisible by
            ``num_heads``. Each head works on ``res_dim // num_heads`` features.
    """
    def __init__(self, num_heads, res_dim):
        super().__init__()
        self.num_heads = num_heads
        self.res_dim = res_dim
        assert self.res_dim%self.num_heads == 0, "res_dim must be divisible by num_heads"
        self.head_dim = self.res_dim//self.num_heads
        self.scale = 1/math.sqrt(self.head_dim)
        self.layer_norm = LayerNorm(self.res_dim)
        #define a linear layer that map to three times the residual dimension
        #the resulting vector produced will be the concatenation of query, key and value vectors
        self.QKV = th.nn.Linear(self.res_dim, 3*self.res_dim)
        #the concatenated output from heads will be transformed using this
        self.O = th.nn.Linear(self.res_dim, self.res_dim)
    def forward(self, x):
        """Apply layer norm, causal multi-head attention and the output projection.

        Input and output both have shape (batch_size, seq_len, res_dim).
        """
        #x.shape = (batch_size, seq_len, res_dim)
        #q,k,v.shape = (batch_size, num_heads, seq_len, head_dim)
        x = self.layer_norm(x)
        q, k, v = rearrange(self.QKV(x), 'b s (qkv h d) -> qkv b h s d', qkv=3, h=self.num_heads, d=self.head_dim)
        attn_patterns = self.attnPatterns(q, k)
        weighted_v = self.weightedValues(attn_patterns, v)
        output_v = rearrange(weighted_v, 'b h s d -> b s (h d)')
        return self.O(output_v)
    #q, k - (batch_size, num_heads, seq_len, head_dim)
    #attention patterns are stored row-wise for each sequence element
    def attnScores(self, q, k):
        """Return the scaled dot products of every query with every key."""
        return einsum(q, k, '... s1 d1, ... s2 d1 -> ... s1 s2')*self.scale
    def attnPatterns(self, q, k):
        """Return causally masked softmax attention weights (rows sum to 1)."""
        attn_scores = self.attnScores(q, k)
        seq_len = attn_scores.shape[-1]
        # Build the mask on the same device as the scores so that no
        # host-side tensor is involved when the model runs on an accelerator.
        positions = th.arange(seq_len, device=attn_scores.device)
        # True where the key position lies after the query position.
        future = positions.reshape(-1,1) < positions.reshape(1,-1)
        # -inf gives exactly zero weight after softmax in every float dtype.
        # The diagonal is never masked, so no row is entirely -inf.
        attn_scores = attn_scores.masked_fill(future, float("-inf"))
        return th.softmax(attn_scores, dim=-1)
    def weightedValues(self, attn_patterns, values):
        """Return, per query position, the attention-weighted sum of the values."""
        return einsum(attn_patterns, values, '... qpos seq, ... seq dim -> ... qpos dim')

In [13]:
"""
The MLP block consists of a layer norm applied to the input, followed by two linear layers with an activation in between.
"""

class MLPBlock(th.nn.Module):
    """Feed-forward sub-block: layer norm, linear, GELU, linear.

    Args:
        res_dim: Size of the input and output features.
        hidden_dim: Size of the intermediate features between the two
            linear layers.
    """

    def __init__(self, res_dim, hidden_dim):
        super().__init__()
        self.res_dim, self.hidden_dim = res_dim, hidden_dim
        self.l1 = th.nn.Linear(res_dim, hidden_dim)
        self.l2 = th.nn.Linear(hidden_dim, res_dim)
        self.act = th.nn.GELU()
        self.layer_norm = LayerNorm(res_dim)

    def forward(self, x):
        # The input is normalized first; the caller's tensor is not modified.
        normed = self.layer_norm(x)
        hidden = self.act(self.l1(normed))
        return self.l2(hidden)

In [14]:
"""
The transformer block consists of an attention block and an MLP block, each added back to its input through a residual connection.
"""

class TransformerBlock(th.nn.Module):
    """One transformer layer: attention, then MLP, each with a residual add.

    Args:
        num_heads: Number of attention heads.
        res_dim: Width of the residual stream. The MLP hidden width is fixed
            at four times this value.
    """

    def __init__(self, num_heads, res_dim):
        super().__init__()
        self.num_heads = num_heads
        self.res_dim = res_dim
        self.hidden_dim = 4*res_dim
        self.attention = Attention(num_heads, res_dim)
        self.mlp = MLPBlock(res_dim, self.hidden_dim)

    def forward(self, x):
        # Each sub-block's output is added to the tensor it received.
        after_attention = x + self.attention(x)
        return after_attention + self.mlp(after_attention)

In [65]:
"""
The training function takes the text dataset.
The forward method takes a batch of tokenized sequences.
The text is converted to a batch of sequences.
The batch is tokenized, embedded, and the positional embedding is added.
The resulting tensor is passed through the transformer blocks successively and finally projected to vocabulary logits.

"""

class Transformer(th.nn.Module):
    """Stack of transformer blocks mapping token ids to vocabulary logits.

    Args:
        num_layers: Number of ``TransformerBlock`` layers.
        num_heads: Attention heads per block.
        res_dim: Width of the residual stream; must be divisible by
            ``num_heads``.
        vocab_dim: Vocabulary size, used for the embedding table and for the
            width of the output logits. Required.
        max_seq_len: Number of positions the positional encoding is built for.

    Raises:
        ValueError: If ``vocab_dim`` is not given.
    """
    def __init__(self, num_layers=2, num_heads=12, res_dim=768, vocab_dim=None, max_seq_len=1024):
        super().__init__()
        # vocab_dim has no sensible default, but it follows defaulted
        # parameters, so it is declared with a None sentinel and checked here.
        if vocab_dim is None:
            raise ValueError("vocab_dim must be provided")
        self.num_layers = num_layers
        self.num_heads = num_heads
        # The default width (768 = 12 * 64) is divisible by the default
        # number of heads.
        self.res_dim = res_dim
        self.vocab_dim = vocab_dim
        self.max_seq_len = max_seq_len
        assert self.res_dim%self.num_heads == 0, "res_dim must be divisible by num_heads"

        self.embedding = Embedding(self.vocab_dim, self.res_dim)
        self.pos_encoding = PosEncoding(self.res_dim, self.max_seq_len)
        self.layers = ModuleList(
            [TransformerBlock(self.num_heads, self.res_dim)
            for i in range(self.num_layers)])
        self.out_layer = th.nn.Linear(self.res_dim, self.vocab_dim)
    def setup(self, batch_size, epochs, lr=1.e-3):
        """Store the training hyperparameters on the model."""
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
    def forward(self, x):
        """Map token ids to logits over the vocabulary.

        ``x`` holds integer token ids, assumed to be shaped
        (batch_size, seq_len) - TODO confirm against the data pipeline. The
        result has a trailing axis of length ``vocab_dim``.
        """
        x = self.embedding(x)
        x = self.pos_encoding(x)
        for layer in self.layers:
            x = layer(x)
        return self.out_layer(x)
    def trainStep(self, dataset):
        """Run the model over ``dataset`` in shuffled mini-batches.

        Uses the batch size stored by ``setup``, which must be called first.
        NOTE(review): only the forward pass is computed here; no loss or
        optimizer update is visible in this part of the file.
        """
        data_loader = DataLoader(dataset, self.batch_size, shuffle=True)
        for batch in data_loader:
            pred_logits = self(batch)