In [1]:
exec(open('init_notebook.py').read())

current working dirF:\ml-from-scratch


In [2]:
%load_ext autoreload
%autoreload 2

# Resources
1. Generative Deep Learning by David Foster
2. https://github.com/karpathy/minGPT/blob/master/demo.ipynb

GPT is a **decoder** only model (pretraining model to be used by different decoders). We will build a casual masked language model (Casual MLM)

# PipeLine
1. Preprocessing Text -> Tokenize -> Sentence Clipper/Padder -> Input Dataset
2. Input Dataset -> token embedding + positional embedding -> input embedding
3. Input Sequence Embeddings -> self-attention A(Q,K,V) -> single head
4. Single Heads -> concat attentions -> Multihead

## Encoder
5. multi-head + query -> layer-norm -> FFNs -> layer-norm -> output next token

In GPT, both token and positional embeddings are learned.


**This is the GPT implementation in Pytorch. The book has the tensorflow+keras implementation. Borrowed some ideas from minGPT**
### Dependencies:
1. Pytorch 2.x (should work in 1.x, too, as we are using as less as possible from torch)
2. Pandas
3. scikit-learn
4. tqdm
5. numpy
6. matplotlib

In [50]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
import pandas as pd
import os
import re
import string
from typing import *
import numpy.typing as npt
from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer
import heapq

# Custom types
Index = int
Char = str
Path = str
Sentence = str
TokenizedSentence = List[int]
Word = str
TokenId = int
Corpus = List[Sentence]
TokenizedCorpus = List[TokenizedSentence]
Vocabulary = Dict[Word, TokenId]

# Torch types, we use numpy typing
SequenceLength = int
EmbeddingDim = int
Vector = npt.NDArray 
SequenceEmbedding = npt.NDArray[Vector] # [seqLen, embeddingDim]
SequenceBatch = npt.NDArray[SequenceEmbedding] # [batch, seqlen, embeddingDim]

# Attention types
Query = Vector
Queries = npt.NDArray[Query]
BatchQueries = npt.NDArray[Queries]
Key = Vector
Keys = npt.NDArray[Key]
BatchKeys = npt.NDArray[Keys]
Value = Vector
Values = npt.NDArray[Value]
BatchValues = npt.NDArray[Values]
AttentionMask = npt.NDArray[SequenceEmbedding] # [batch, seqlen, embeddingDim]

Attention = Value # a single weighted value embedding for a query
Attentions = npt.NDArray[Attention] # for all the queries of a sentence
BatchAttentions = npt.NDArray[Attentions] # for a batch of sentences

In [4]:
# Configurations
dataDir = "E:/Datasets/wine-reviews"
vocabSize = 10_000
sentenceLength = 100
embeddingDim = 50

In [6]:
#######################################################################################################
############# The Pipeline class which is the orchestration of each step of the process ###############
#######################################################################################################
class Pipeline:
    def __init__(self, vocabSize: int, sentenceLength: int, embeddingDim: int):
        self.vocabSize = vocabSize
        self.sentenceLength = sentenceLength
        self.embeddingDim = embeddingDim
        pass
        
    def save(self, directory: Path):
        pass
        
    def preprocess(self, corpus: Corpus) -> Corpus:
        pass
    def tokenize(self, corpus: Corpus, vocabSize: int) -> Tuple[Vocabulary, TokenizedCorpus]:
        pass
    def createDatasets(self, tokenizedCorpus: TokenizedCorpus, sentenceLength) -> Tuple[Dataset, Dataset]:
        pass
    
pipeline = Pipeline(vocabSize, sentenceLength, embeddingDim)

## 1.1 Preprocessing
1. read text,
2. pad punctuations with spaces to convert them to individual words.                                                                                                                                                                                                                                                                       

In [7]:
df = pd.read_csv(os.path.join(dataDir, "winemag-data-130k-v2.csv"))

In [8]:
# df.head()

In [9]:
corpus = df["description"].tolist()

In [10]:
# data[:10]

In [11]:
class PreProcessorText:
    
    def padPunk(self, s: Sentence) -> Sentence: # this does not work well. We will do BPE next. isn't -> becomes bad is ' t. now t is a word! It's it ' s
        s = re.sub(f"([{string.punctuation}])", r" \1 ", s) #to words
        s = re.sub(' +', ' ', s) # multiple spaces to one
        return s
    
    def smallCase(self, corpus: List[Sentence]) -> List[Sentence]:
        return [s.lower() for s in corpus]

    def __call__(self, corpus: List[Sentence]) -> List[Sentence]:
        corpus = [self.padPunk(s) for s in corpus] 
        return self.smallCase(corpus)
        
pipeline.preprocess = lambda corpus: PreProcessorText()(corpus)
    
    

In [12]:
corpus = pipeline.preprocess(corpus)

In [13]:
corpus[:3]

["aromas include tropical fruit , broom , brimstone and dried herb . the palate isn ' t overly expressive , offering unripened apple , citrus and dried sage alongside brisk acidity . ",
 "this is ripe and fruity , a wine that is smooth while still structured . firm tannins are filled out with juicy red berry fruits and freshened with acidity . it ' s already drinkable , although it will certainly be better from 2016 . ",
 'tart and snappy , the flavors of lime flesh and rind dominate . some green pineapple pokes through , with crisp acidity underscoring the flavors . the wine was all stainless - steel fermented . ']

## 1.2 Tokenize
1. vocabulary
2. tokenize sentences

In [14]:
class SentenceTokenizer:
    
    def getVocabulary(self, corpus: Corpus, vocabSize: int) -> Vocabulary:
        vectorizer = CountVectorizer().fit(corpus)
        topWords = heapq.nlargest(vocabSize-1, vectorizer.vocabulary_, key=lambda w: vectorizer.vocabulary_[w])
        wordToToken = {}
        for idx, w in enumerate(topWords):
            wordToToken[w] = idx
    
        wordToToken['UNK'] = len(wordToToken)
        print(f"Created a vocabulary with top {len(wordToToken)} words from {len(vectorizer.vocabulary_)} words with UNK as the last word")
        return wordToToken

    def tokenizeSentence(self, sentence: Sentence, vocabulary: Vocabulary) -> TokenizedSentence:
        words = sentence.split()
        tSen = [
            vocabulary[w] if w in vocabulary
            else vocabulary["UNK"]
                for w in words
        ]
        return tSen

    def build(self, corpus: Corpus, vocabSize: int) -> Tuple[Vocabulary, TokenizedCorpus]:
        vocabulary = self.getVocabulary(corpus, vocabSize)
        tokenizedCorpus = [self.tokenizeSentence(s, vocabulary) for s in corpus]
        return vocabulary, tokenizedCorpus

    def __call__(self, corpus: Corpus, vocabSize: int) -> Tuple[Vocabulary, TokenizedCorpus]:
        return self.build(corpus, vocabSize)

pipeline.tokenize = lambda corpus: SentenceTokenizer()(corpus, pipeline.vocabSize)
        



In [15]:
vocabulary, tokenizedCorpus = pipeline.tokenize(corpus)
# sentenceTokenizer = SentenceTokenizer()
# vocabulary, tokenizedCorpus = sentenceTokenizer.build(corpus, vocabSize)
vocabulary["the"]

Created a vocabulary with top 10000 words from 31274 words with UNK as the last word


3415

In [16]:
print(tokenizedCorpus[0]) # mostl UNKs as we have over 30 words

[9999, 9999, 2608, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 3415, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 1901, 9999, 9999, 9999, 9999, 9999, 7486, 9999, 9999, 9999, 9999]


## 1.3 Dataset
1. fixed length sentences
2. torch format

In [18]:
class SentenceDataset(Dataset):
    def __init__(self, tokenizedCorpus: TokenizedCorpus, split: str, length: int, vocabSize: int):
        assert split in {'train', 'test'} # borrowed from minGPT
        self.split = split
        self.length = length
        self.vocabSize = vocabSize
        self.corpus = tokenizedCorpus.copy()

        if self.split == 'train':
            self.size = int(len(self.corpus) * 0.7)
        else:
            self.size = len(self.corpus) - int(len(self.corpus) * 0.7)
            
        self.ids = self._generateIds()
        self._reshapeCorpus()
        

    def _reshapeCorpus(self):
        # converts to fixed length sentences by clipping or padding
        for idx in tqdm(self.ids, desc=f"reshaping {self.split} corpus"):
            self.corpus[idx] = self._reshapeSentence(self.corpus[idx])
        pass

    def _reshapeSentence(self, sentence: TokenizedSentence) -> TokenizedSentence:
        if len(sentence) >= self.length:
            return sentence[:self.length]
        # we pad with zeros. # zero not in vocab
        return sentence + [0] * (self.length - len(sentence))
        

    def _generateIds(self) -> List[Index]:
        # we just get the top for train and bot for test.
        if self.split == 'train':
            return list(range(self.size))
        else:
            start = int(len(self.corpus) * 0.7) # train index ends before start
            return [i+start for i in range(self.size)]

    def getVocabSize(self) -> int:
        return self.vocabSize

    def getBlockSize(self) -> int:
        # as this is an encoder. we feed n inputs, n-1 outputs, and then read outputs? need more clarifications.
        return self.vocabSize * 2 - 1
        

    def __len__(self):
        return self.size

    def __getitem__(self, idx: Index) -> TokenizedSentence:
        return self.corpus[self.ids[idx]] # ids can be sparse

pipeline.createDatasets = lambda tokenizedCorpus: (SentenceDataset(tokenizedCorpus, "train", pipeline.sentenceLength, pipeline.vocabSize), SentenceDataset(tokenizedCorpus, "test", pipeline.sentenceLength, pipeline.vocabSize))
    
        

In [19]:
# trainSet = SentenceDataset(tokenizedCorpus, "train", 100)
# testSet = SentenceDataset(tokenizedCorpus, "test", 100)
trainSet, testSet = pipeline.createDatasets(tokenizedCorpus)

reshaping train corpus: 100%|████████████████████████████████████████████████| 90979/90979 [00:00<00:00, 269965.73it/s]
reshaping test corpus: 100%|█████████████████████████████████████████████████| 38992/38992 [00:00<00:00, 433116.35it/s]


In [20]:
assert trainSet.ids[-1] + 1 == testSet.ids[0]
assert testSet.ids[-1] + 1 == len(tokenizedCorpus)

## 2. Embedding
We will learn both token embedding and positional encoding. However, we can also use pretrained embedding and trigonometric functions (sine, cosine) from the original Transformer for faster training

In [21]:
# tokenEmbedding = nn.Embedding(vocabSize, embeddingDim)
# posEmbedding = nn.Embedding(sentenceLength, embeddingDim) # number of positions. the dim is the same as tokens as they will be summed
pipeline.createEmbedding = lambda : (nn.Embedding(pipeline.vocabSize, pipeline.embeddingDim), nn.Embedding(pipeline.sentenceLength, pipeline.embeddingDim))
tokenEmbedding, posEmbedding = pipeline.createEmbedding()

In [22]:
tokenEmbedding, posEmbedding

(Embedding(10000, 50), Embedding(100, 50))

## 3. The Single Head
**From this point on, everything will have a batch dimension at 0**
1. Input Embedding
2. Self-Attention

### 3.1 Input Embedding
1. Convert the sentence token batch to embedding batch
2. Create the casual masked batch for every sentence embedding so that the future tokens are not visible. Mask is added after attention scores are computed to save computation
   

In [68]:
class InputProcessor:
    def embed(self, batch: List[TokenizedSentence]) -> SequenceBatch:
        pass

class SingleHead(nn.Module):
    def __init__(self, inputSize: int, hiddenSize: int):
        super().__init__()
        
        self.inputSize = inputSize
        self.hiddenSize = hiddenSize
        self.dk = self.hiddenSize
        # stacked ffns for Q, K, V projections
        self.qkvProjections = nn.Linear(inputSize, 3 * hiddenSize) # [0- hiddenSize] for q, [hiddenSize - 2* hiddenSize] for k, [2* hiddenSize - 3 * hiddenSize] for v.
        self.register_buffer(mask
        
        
    def scaledDotProduct(self, Q: BatchQueries, K: BatchKeys, V: BatchValues, mask: Optional[AttentionMask]) -> BatchAttentions:
        # Three steps of self.attention
        # 1. QK_T
        # 2. Apply attention mask on QK_T
        # 3. softmax
        # 4. calculate attention value QK_T V
        
        # qkT = torch.matmul(Q, K.transpose(-2, -1) # batch, seqlen, d_k. we transpose the last two dimensions of K
        # batch, seqlen, d_k. we transpose the last two dimensions of K
        QK_T = Q @ K.transpose(-2, -1) # @ is a matmul operator 
        QK_T /= self.dk
        print("QK_T", QK_T.shape)

        if mask is not None:
            # now there are three ways we can apply mask, and 1 way we cannot
            # we cannot zero out the mask values as our weight loggits can be negative, so the softmax will significantly change if we zero out the masked loggits
            # we can set masked attentions to -inf with masked_fill 
            # or we can add -inf to the masked positions
            # with fill
            maskCondition = mask == 0
            QK_T = QK_T.masked_fill(maskCondition, -1e9) # may we get overflow? depends on our float size
            # with add
            # QK_T -= (1 - mask) * 1e9

        attn = F.softmax(QK_T, dim=-1) # last dim has each query scores
        return attn @ V # batch, seqlen, 

        
        
        


        

        
        
    

In [24]:
torch.tril(torch.ones(3, 3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [43]:
mask = torch.tril(torch.ones(3, 3)).view((3,3,1))
mask

tensor([[[1.],
         [0.],
         [0.]],

        [[1.],
         [1.],
         [0.]],

        [[1.],
         [1.],
         [1.]]])

In [62]:
Q = torch.tensor([[[1, 2, 3], [4, 5, 6]]], dtype=torch.float) #(seqlen, embeddingDim)
K = torch.tensor([[[1, 1, 1], [2, 2, 2]]], dtype=torch.float)
Q, K

(tensor([[[1., 2., 3.],
          [4., 5., 6.]]]),
 tensor([[[1., 1., 1.],
          [2., 2., 2.]]]))

In [66]:
torch.matmul(Q, K.transpose(-2, -1))

tensor([[[ 6., 12.],
         [15., 30.]]])

In [64]:
a = np.asarray([[[1, 2, 3], [4, 5, 6]]])
b = np.asarray([[[1, 1, 1], [2, 2, 2]]])
np.dot(a, b.T)

ValueError: shapes (1,2,3) and (3,2,1) not aligned: 3 (dim 2) != 2 (dim 1)

In [70]:
singleHead = SingleHead(sentenceLength, hiddenSize=3) 
Q = torch.tensor([[[1, 2, 3], [4, 5, 6]]], dtype=torch.float) #(seqlen, embeddingDim)
K = torch.tensor([[[1, 1, 1], [2, 2, 2]]], dtype=torch.float)
V = torch.tensor([[[1, 1, 1], [2, 2, 2]]], dtype=torch.float)
attentions = singleHead.scaledDotProduct(Q, K, V, None)

TypeError: scaledDotProduct() takes 4 positional arguments but 5 were given

In [72]:
mask = torch.tril(torch.ones(3, 3)).view((1,3,3))
mask

tensor([[[1., 0., 0.],
         [1., 1., 0.],
         [1., 1., 1.]]])

In [77]:
attn = torch.tensor([[1, 0.5, 0.25], [0.5, 1, 0.7], [0.1, 0.2, 1]], dtype=torch.float)
attn = attn.unsqueeze(dim=0) # adding the batch dim
attn, attn.shape

(tensor([[[1.0000, 0.5000, 0.2500],
          [0.5000, 1.0000, 0.7000],
          [0.1000, 0.2000, 1.0000]]]),
 torch.Size([1, 3, 3]))