In [1]:
exec(open('init_notebook.py').read())

current working dirF:\ml-from-scratch


In [2]:
%load_ext autoreload
%autoreload 2

# Resources
1. Generative Deep Learning by David Foster
2. https://github.com/karpathy/minGPT/blob/master/demo.ipynb

**This is the GPT implementation in Pytorch. The book has the tensorflow+keras implementation. Borrowed some ideas from minGPT**

GPT is a encoder only model (pretraining model to be used by different decoders)

# PipeLine
1. Preprocessing Text -> Tokenize -> Sentence Clipper/Padder -> Input Dataset
2. Input Dataset -> token embedding + positional embedding -> input embedding
3. Input Sequence Embeddings -> self-attention A(Q,K,V) -> single head
4. Single Heads -> concat attentions -> Multihead

## Encoder
5. multi-head + query -> layer-norm -> FFNs -> layer-norm -> output next token

In GPT, both token and positional embeddings are learned.


In [152]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
import pandas as pd
import os
import re
import string
from typing import *
import numpy.typing as npt
from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer
import heapq

# Custom types
Index = int
Char = str
Path = str
Sentence = str
TokenizedSentence = List[int]
Word = str
TokenId = int
Corpus = List[Sentence]
TokenizedCorpus = List[TokenizedSentence]
Vocabulary = Dict[Word, TokenId]

In [169]:
# Configurations
dataDir = "E:/Datasets/wine-reviews"
vocabSize = 10_000
sentenceLength = 100

In [178]:
#######################################################################################################
############# The Pipeline class which is the orchestration of each step of the process ###############
#######################################################################################################
class Pipeline:
    def save(self, directory: Path):
        pass
    def preprocess(self, corpus: Corpus) -> Corpus:
        pass
    def tokenize(self, corpus: Corpus, vocabSize: int) -> Tuple[Vocabulary, TokenizedCorpus]:
        pass
    def createDatasets(self, tokenizedCorpus: TokenizedCorpus, sentenceLength) -> Tuple[SentenceDataset, SentenceDataset]:
        pass
    
pipeline = Pipeline()

## 1.1 Preprocessing
1. read text,
2. pad punctuations with spaces to convert them to individual words.                                                                                                                                                                                                                                                                       

In [6]:
df = pd.read_csv(os.path.join(dataDir, "winemag-data-130k-v2.csv"))

In [33]:
# df.head()

In [85]:
corpus = df["description"].tolist()

In [86]:
# data[:10]

In [175]:
class PreProcessorText:
    
    def padPunk(self, s: Sentence) -> Sentence: # this does not work well. We will do BPE next. isn't -> becomes bad is ' t. now t is a word! It's it ' s
        s = re.sub(f"([{string.punctuation}])", r" \1 ", s) #to words
        s = re.sub(' +', ' ', s) # multiple spaces to one
        return s
    
    def smallCase(self, corpus: List[Sentence]) -> List[Sentence]:
        return [s.lower() for s in corpus]

    def __call__(self, corpus: List[Sentence]) -> List[Sentence]:
        corpus = [self.padPunk(s) for s in corpus] 
        return self.smallCase(corpus)
        
pipeline.preprocess = lambda corpus: PreProcessorText()(corpus)
    
    

In [176]:
corpus = pipeline.preprocess(corpus)

In [165]:
corpus[:3]

["aromas include tropical fruit , broom , brimstone and dried herb . the palate isn ' t overly expressive , offering unripened apple , citrus and dried sage alongside brisk acidity . ",
 "this is ripe and fruity , a wine that is smooth while still structured . firm tannins are filled out with juicy red berry fruits and freshened with acidity . it ' s already drinkable , although it will certainly be better from 2016 . ",
 'tart and snappy , the flavors of lime flesh and rind dominate . some green pineapple pokes through , with crisp acidity underscoring the flavors . the wine was all stainless - steel fermented . ']

## 1.2 Tokenize
1. vocabulary
2. tokenize sentences

In [143]:
class SentenceTokenizer:
    
    def getVocabulary(self, corpus: Corpus, vocabSize: int) -> Vocabulary:
        vectorizer = CountVectorizer().fit(corpus)
        topWords = heapq.nlargest(vocabSize-1, vectorizer.vocabulary_, key=lambda w: vectorizer.vocabulary_[w])
        wordToToken = {}
        for idx, w in enumerate(topWords):
            wordToToken[w] = idx
    
        wordToToken['UNK'] = len(wordToToken)
        print(f"Created a vocabulary with top {len(wordToToken)} words from {len(vectorizer.vocabulary_)} words with UNK as the last word")
        return wordToToken

    def tokenizeSentence(self, sentence: Sentence, vocabulary: Vocabulary) -> TokenizedSentence:
        words = sentence.split()
        tSen = [
            vocabulary[w] if w in vocabulary
            else vocabulary["UNK"]
                for w in words
        ]
        return tSen

    def build(self, corpus: Corpus, vocabSize: int) -> Tuple[Vocabulary, TokenizedCorpus]:
        vocabulary = self.getVocabulary(corpus, vocabSize)
        tokenizedCorpus = [self.tokenizeSentence(s, vocabulary) for s in corpus]
        return vocabulary, tokenizedCorpus

    def __call__(self, corpus: Corpus, vocabSize: int) -> Tuple[Vocabulary, TokenizedCorpus]:
        return self.build(corpus, vocabSize)

pipeline.tokenize = lambda corpus, vocabSize: SentenceTokenizer()(corpus, vocabSize)
        



In [144]:
vocabulary, tokenizedCorpus = pipeline.tokenize(corpus, vocabSize)
# sentenceTokenizer = SentenceTokenizer()
# vocabulary, tokenizedCorpus = sentenceTokenizer.build(corpus, vocabSize)
vocabulary["the"]

Created a vocabulary with top 10000 words from 31274 words with UNK as the last word


3415

In [145]:
print(tokenizedCorpus[0]) # mostl UNKs as we have over 30 words

[9999, 9999, 2608, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 3415, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 1901, 9999, 9999, 9999, 9999, 9999, 7486, 9999, 9999, 9999, 9999]


## 1.3 Dataset
1. fixed length sentences
2. torch format

In [179]:
class SentenceDataset(Dataset):
    def __init__(self, tokenizedCorpus: TokenizedCorpus, split: str, length: int):
        assert split in {'train', 'test'} # borrowed from minGPT
        self.split = split
        self.length = length
        self.corpus = tokenizedCorpus.copy()

        if self.split == 'train':
            self.size = int(len(self.corpus) * 0.7)
        else:
            self.size = len(self.corpus) - int(len(self.corpus) * 0.7)
            
        self.ids = self._generateIds()
        self._reshapeCorpus()
        

    def _reshapeCorpus(self):
        # converts to fixed length sentences by clipping or padding
        for idx in tqdm(self.ids, desc=f"reshaping {self.split} corpus"):
            self.corpus[idx] = self._reshapeSentence(self.corpus[idx])
        pass

    def _reshapeSentence(self, sentence: TokenizedSentence) -> TokenizedSentence:
        if len(sentence) >= self.length:
            return sentence[:self.length]
        # we pad with zeros. # zero not in vocab
        return sentence + [0] * (self.length - len(sentence))
        

    def _generateIds(self) -> List[Index]:
        # we just get the top for train and bot for test.
        if self.split == 'train':
            return list(range(self.size))
        else:
            start = int(len(self.corpus) * 0.7) # train index ends before start
            return [i+start for i in range(self.size)]
            

    def __len__(self):
        return self.size

    def __getitem__(self, idx: Index) -> TokenizedSentence:
        return self.corpus[self.ids[idx]] # ids can be sparse

pipeline.createDatasets = lambda tokenizedCorpus, sentenceLength: (SentenceDataset(tokenizedCorpus, "train", sentenceLength), SentenceDataset(tokenizedCorpus, "test", sentenceLength))
    
        

In [180]:
# trainSet = SentenceDataset(tokenizedCorpus, "train", 100)
# testSet = SentenceDataset(tokenizedCorpus, "test", 100)
trainSet, testSet = pipeline.createDatasets(tokenizedCorpus, sentenceLength)

reshaping train corpus: 100%|████████████████████████████████████████████████| 90979/90979 [00:00<00:00, 471388.35it/s]
reshaping test corpus: 100%|██████████████████████████████████████████████████| 38992/38992 [00:00<00:00, 93947.35it/s]


In [181]:
assert trainSet.ids[-1] + 1 == testSet.ids[0]
assert testSet.ids[-1] + 1 == len(tokenizedCorpus)