<a href="https://colab.research.google.com/github/blooming-ai/generativeai/blob/main/text/byte_pair_encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Language Models: An Introduction


Most of the code snippets are taken from https://github.com/karpathy/minGPT

## Word embedding
### The following BPE code is adapted from the paper:
#### [Neural Machine Translation of Rare Words with Subword Units](https://arxiv.org/pdf/1508.07909.pdf)


In [None]:
import re
import pdb
import string
from collections import defaultdict

#Byte pair encoding
def word_to_charachter_tuple(word:str)->tuple:
    r'''
    Converts a word into a tuple of lower-case characters followed by an end-of-word marker.
    "Word!" -> ('w','o','r','d','<\w>')

    Only alpha-numeric characters are kept, so surrounding whitespace and punctuation
    are dropped. A word with no alpha-numeric characters yields just the marker: ('<\w>',).
    '''
    kept = "".join(ch for ch in word if ch.isalnum()) # keep only alpha-numeric characters
    # The marker is a literal backslash followed by 'w'; the backslash is escaped
    # explicitly because "\w" is not a valid string escape sequence.
    return tuple(kept.lower()) + ("<\\w>",)

def get_pairs(word_as_tuple:tuple)->list:
    '''
    Lists every pair of adjacent symbols, left to right.
    ('w','o','r','d','</w>') -> [('w','o'),('o','r'),('r','d'),('d','</w>')]
    An input with fewer than two symbols gives an empty list.
    '''
    # Zipping the sequence with itself shifted by one yields each neighbouring pair.
    return list(zip(word_as_tuple, word_as_tuple[1:]))

def replace_pair(word_as_tuple:tuple, pair:tuple)->tuple:
    '''
    Merges every non-overlapping, left-to-right occurrence of `pair` into one symbol.
    Given word = ('w','o','r','d','</w>') and pair = ('o','r')
    returns ('w','or','d','</w>').

    Matches are consumed greedily from the left, so ('a','a','a') with pair ('a','a')
    becomes ('aa','a'). An empty word is returned as an empty tuple.
    '''
    word = word_as_tuple
    length = len(word)
    new_word = []
    i = 0
    while i < length:
        # A pair can only start at i when there is a symbol to its right.
        if i + 1 < length and (word[i], word[i+1]) == pair:
            new_word.append(word[i] + word[i+1])
            i += 2 # both symbols of the pair are consumed
        else:
            new_word.append(word[i])
            i += 1

    return tuple(new_word)

def construct_word_vocab(word_vocab:defaultdict, file_path:str)->dict:
    '''
    Reads a text file and counts its words into word_vocab, which is updated in place
    and also returned. word_vocab has format word_vocab[('w','o','r','d',<end marker>)]->freq

    word_vocab: counter to update; must default missing keys to 0 (e.g. defaultdict(int)).
    file_path: path of the text file, decoded as UTF-8.
    '''
    # An explicit encoding keeps the result independent of the machine's locale.
    with open(file_path, encoding="utf-8") as fp:
        for line in fp:
            for item in line.split(): # split() with no argument collapses runs of whitespace
                item_as_tuple = word_to_charachter_tuple(item) # tuple to make a hashable dict key
                if not item_as_tuple: continue # ignore empty key
                word_vocab[item_as_tuple] += 1

    return word_vocab

def get_byte_pair_hist(word_vocab:defaultdict)->dict:
    '''
    Builds the histogram of adjacent symbol pairs over the whole vocabulary.
    Every pair found in a word contributes that word's frequency, once per occurrence.
    returns histogram[('w','o')]->freq
    '''
    histogram = defaultdict(int)
    for symbols, count in word_vocab.items():
        for adjacent in get_pairs(symbols):
            histogram[adjacent] = histogram[adjacent] + count

    return histogram

def merge_pair(pair:tuple, word_vocab_in:dict)->dict:
    '''
    Returns a new vocabulary in which `pair` has been merged inside every key.
    E.g. pair=('w','o') turns
    word_vocab_in[('w','o','r','d')]->freq into
    word_vocab_out[('wo','r','d')]->freq

    word_vocab_in is not modified. If several input words become the same key after
    the merge, their frequencies are added together rather than the last one winning.
    '''
    word_vocab_out = {}
    for word, freq in word_vocab_in.items():
        new_word = replace_pair(word, pair)
        word_vocab_out[new_word] = word_vocab_out.get(new_word, 0) + freq
    return word_vocab_out

def byte_pair_encoding(word_vocab:defaultdict, n:int=10)->tuple:
    '''
    Given word_vocab[('w','o','r','d')]->freq, merges the most frequent adjacent pair up to 'n' times.
    E.g. if ('w','o') is the most frequent pair, a merge rewrites the key as word_vocab[('wo','r','d')]->freq.
    On a frequency tie, the pair met first while building the histogram is chosen.
    Merging stops early when no adjacent pairs are left (every word is a single symbol).
    returns:
    Merge rank bpe_ranks[ ('w','o') ]-> 0 (implies first merge),
    Merged word vocab - word_vocab[('wo','r','d')]->freq
    '''
    merges = []
    for _ in range(n):
        pairs = get_byte_pair_hist(word_vocab)
        if not pairs: break # nothing left to merge; max() would fail on an empty histogram
        best = max(pairs, key=pairs.get)
        word_vocab = merge_pair(best, word_vocab)
        merges.append(best)

    # bpe merge list that defines the bpe "tree", of tuples (a,b) that are to merge to token ab
    bpe_ranks = dict(zip(merges, range(len(merges))))
    return bpe_ranks, word_vocab

def get_bpe_encoder_decoder_map(word_vocab:defaultdict)->tuple:
    '''
    Numbers every distinct token found in the keys of word_vocab[('wo','r','d')]->freq.
    Ids start at 0 and follow the order in which tokens are first seen.
    returns:
    Encoder encoder['wo'] -> id
    Decoder decoder[ id ] -> 'wo'
    '''
    bpe_encoder = {}
    for symbols in word_vocab:
        for token in symbols:
            # The next free id always equals the number of tokens registered so far.
            bpe_encoder.setdefault(token, len(bpe_encoder))

    # The decoder is the exact inverse of the encoder.
    bpe_decoder = {index: token for token, index in bpe_encoder.items()}
    return bpe_encoder, bpe_decoder



In [None]:
#Get data file
# Notebook shell commands: download the tiny-shakespeare text into the current
# directory, then move it into sample_data/.
# NOTE(review): `mv` assumes sample_data/ already exists - confirm outside Colab.
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
!mv input.txt sample_data/

In [None]:
# Location of the training text read by construct_word_vocab in the cells below.
file_path = "sample_data/input.txt"

In [None]:
#Testing:
# Build the word-frequency table from the data file, starting from an empty counter.
word_vocab = defaultdict(int)
word_vocab = construct_word_vocab(word_vocab, file_path)
# Sanity check: the empty tuple should never be used as a vocabulary key.
print("is empty string a key in word_vocab:",() in word_vocab)
# Histogram of adjacent symbol pairs and the most frequent pair in it.
pairs = get_byte_pair_hist(word_vocab)
best = max(pairs, key=pairs.get)
# Exercise a single merge with a hand-picked pair (not the most frequent one).
word_vocab_out = merge_pair(('f','i'), word_vocab)

print("word vocab -> ",word_vocab)
print("byte pair hist -> ",pairs)
print("best pair -> ",best)
print("merge f,i-> ",word_vocab_out)





In [None]:
# initialize word_vocab with every lower-case letter and digit (frequency 1 each),
# so each of these single characters is present in the vocabulary from the start
initial_vocab = list(string.ascii_lowercase)
initial_vocab.extend(list(string.digits))
# Convert each key to a 1-tuple. The trailing comma is required: (str(val)) is just
# a parenthesised string, which would not match the tuple keys used everywhere else.
initial_vocab = [(str(val),) for val in initial_vocab]
word_vocab = defaultdict(int,zip(initial_vocab, [1]*len(initial_vocab)))

#construct from a file
word_vocab = construct_word_vocab(word_vocab, file_path)
# learn 200 merges, then number the resulting tokens
merge_ranks, word_vocab = byte_pair_encoding(word_vocab, 200)
bpe_encoder, bpe_decoder = get_bpe_encoder_decoder_map(word_vocab)

print("number of unique words: ",len(word_vocab))
print("number of tokens: ", len(bpe_encoder))
print("bpe encoder map-> ", bpe_encoder)
print("bpe decoder map-> ", bpe_decoder)
print("merges -> ", merge_ranks)


In [None]:
# Per-word memo for bpe_tokenize: word -> (merge_ranks, bpe_encoder, token ids).
# The two dicts are stored with the ids so an entry is reused only for the very
# same merge table and encoder objects it was computed with.
cache={}
def bpe_tokenize(input:str, merge_ranks:dict, bpe_encoder:dict)->list:
    '''
    Tokenizes a whitespace-separated string into a flat list of token ids.

    Each word is split into characters plus the end-of-word marker, then the
    known pair with the lowest merge rank is merged repeatedly until none applies.

    input: text to tokenize.
    merge_ranks: merge_ranks[('w','o')] -> rank, lower ranks are merged first.
    bpe_encoder: bpe_encoder['wo'] -> id.
    returns: token ids of all words, in order.
    raises: Exception if a resulting token is missing from bpe_encoder.
    '''
    tokens = []
    for word in input.split():
        hit = cache.get(word)
        if hit is not None and hit[0] is merge_ranks and hit[1] is bpe_encoder:
            # Reuse the ids but keep going: the remaining words still need tokenizing.
            tokens.extend(hit[2])
            continue

        word_tuple = word_to_charachter_tuple(word)
        while len(word_tuple) > 1: # cannot get a pair from fewer than two elements
            pairs = get_pairs(word_tuple)
            bigram = min(pairs, key = lambda pair: merge_ranks.get(pair, float('inf'))) # find the next lowest rank bigram that can be merged
            if bigram not in merge_ranks: break # no more bigrams are eligible to be merged
            word_tuple = replace_pair(word_tuple, bigram)

        word_ids = []
        for token in word_tuple:
            if token not in bpe_encoder: raise Exception("unknown token: "+ token)
            word_ids.append(bpe_encoder[token])

        cache[word] = (merge_ranks, bpe_encoder, word_ids)
        tokens.extend(word_ids)

    return tokens


In [None]:
#Test
line = "I am tokenizing"
tokens = bpe_tokenize(line, merge_ranks, bpe_encoder)
print("Tokens ids -> ",tokens)
print("Tokens -> ",[bpe_decoder[key] for key in tokens])
reconstruction = [bpe_decoder[key] for key in tokens]
reconstruction = "".join(reconstruction)
print("Reconstruction-> ",reconstruction.replace('<\w>', ' '))

## Positional Encoding
##### There are many position encoding methods. We will discuss:


*   Sinusoidal Position Encoding ([Explanation](https://timodenk.com/blog/linear-relationships-in-the-transformers-positional-encoding/))
*   Relative Position Encoding ([Visualization](https://www.lesswrong.com/posts/qvWP3aBDBaqXvPNhS/gpt-2-s-positional-embedding-matrix-is-a-helix))




In [None]:
#Sinusoidal position encoding
import torch

def positional_embedding(n_pos:int = 32, d_model:int = 64)->torch.Tensor:
    """
    Builds the sinusoidal position encoding matrix.

    For position p and column c the angle is p / 10000^(2*(c//2)/d_model);
    even columns hold sin(angle) and odd columns hold cos(angle).

    n_pos: number of positions (rows).
    d_model: encoding width (columns).
    returns: float tensor of shape (n_pos, d_model).
    """
    position = torch.arange(n_pos).unsqueeze(1)   # (n_pos, 1)
    i = torch.arange(d_model).unsqueeze(0)        # (1, d_model)
    # Columns 2k and 2k+1 share the same frequency, hence the i // 2.
    deno = 1/ torch.pow(10000, (2 * (i // 2) / d_model))
    angles = position*deno                        # broadcast to (n_pos, d_model)
    # Write into a separate tensor so both halves are computed from the raw angles.
    pos_mat = torch.empty_like(angles)
    pos_mat[:,0::2] = torch.sin(angles[:,0::2])
    pos_mat[:,1::2] = torch.cos(angles[:,1::2])
    return pos_mat

# Smoke test: with the default arguments (n_pos=32, d_model=64) this prints the matrix size.
print(positional_embedding().size())


## Transformer

In [None]:
#Code taken from https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
import math

import torch
import torch.nn as nn
from torch.nn import functional as F
class GELU(nn.Module):
    """
    Tanh approximation of the Gaussian Error Linear Unit, the form used in the
    Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """
    def forward(self, x):
        # 0.5 * x * (1 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) ))
        cubic = x + 0.044715 * torch.pow(x, 3.0)
        gate = torch.tanh(math.sqrt(2.0 / math.pi) * cubic)
        return 0.5 * x * (1.0 + gate)

class Causal_Self_Attention(nn.Module):
    """
    Multi-head masked (causal) self-attention followed by an output projection.
    Written out explicitly instead of using torch.nn.MultiheadAttention so that
    every step of the computation is visible.

    config must provide n_embd, n_head (a divisor of n_embd), attn_pdrop,
    resid_pdrop and block_size (the longest sequence the mask covers).
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # one linear layer yields query, key and value for all heads at once
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # projection applied to the concatenated head outputs
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # dropout on the attention weights and on the block output
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        # lower-triangular matrix of ones: position t may look at positions <= t only
        size = config.block_size
        lower_triangle = torch.tril(torch.ones(size, size)).view(1, 1, size, size)
        self.register_buffer("bias", lower_triangle)
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        # x: (batch, sequence length, n_embd)
        batch, seq_len, width = x.size()
        head_dim = width // self.n_head

        def to_heads(t):
            # (B, T, C) -> (B, nh, T, hs): heads become a second batch dimension
            return t.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        q, k, v = to_heads(q), to_heads(k), to_heads(v)

        # scaled dot-product scores: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
        # positions to the right of the query get -inf, i.e. zero weight after softmax
        future = self.bias[:, :, :seq_len, :seq_len] == 0
        scores = scores.masked_fill(future, float('-inf'))
        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        # weighted sum of values: (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        mixed = weights @ v
        # put the heads back side by side: (B, T, C)
        mixed = mixed.transpose(1, 2).contiguous().view(batch, seq_len, width)

        return self.resid_dropout(self.c_proj(mixed))

class Transformer_Block(nn.Module):
    """
    A pre-norm Transformer block: causal self-attention and a 4x-wide MLP,
    each applied to a layer-normalised input and added back to the residual stream.

    config must provide n_embd and resid_pdrop, plus whatever
    Causal_Self_Attention reads from it.
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = Causal_Self_Attention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = nn.ModuleDict(dict(
            c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd),
            c_proj  = nn.Linear(4 * config.n_embd, config.n_embd),
            act     = GELU(),
            dropout = nn.Dropout(config.resid_pdrop),
        ))

    def mlpf(self, x):
        """MLP forward: expand, activate, project back, then dropout."""
        # A method (not a lambda stored on the instance) looks up self.mlp at call
        # time, so copies of the block use their own weights and the module stays
        # picklable.
        m = self.mlp
        return m.dropout(m.c_proj(m.act(m.c_fc(x))))

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlpf(self.ln_2(x))
        return x



## Network Setup

In [None]:
# Bare attribute container for the hyperparameters set below.
class Config(object): pass

config = Config()

#Model parameters
# The model constructor reads config.model_type and requires exactly one of
# "preset name" or "explicit n_layer/n_head/n_embd"; the sizes are given
# explicitly here, so the preset name is None.
config.model_type = None
config.n_layer = 6
config.n_head = 2
config.n_embd =  32
# these options must be filled in externally
config.vocab_size = len(bpe_encoder)
# must be replaced by the maximum sequence length before a model is built:
# the model constructor asserts that it is not None
config.block_size = None
# dropout hyperparameters
config.embd_pdrop = 0.1
config.resid_pdrop = 0.1
config.attn_pdrop = 0.1


    def __init__(self, config):
        """
        Builds the model: token and position embeddings, a stack of
        Transformer_Block layers, a final LayerNorm and the output head.

        config must set vocab_size and block_size, and exactly one of
        model_type (a preset name) or the trio n_layer / n_head / n_embd.
        """
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.block_size = config.block_size

        has_preset = config.model_type is not None
        has_sizes = all([config.n_layer is not None, config.n_head is not None, config.n_embd is not None])
        assert has_preset ^ has_sizes # exactly one of these (XOR)
        if has_preset:
            # model_type -> detailed sizes; names follow the huggingface naming conventions
            presets = {
                # GPT-1
                'openai-gpt':   dict(n_layer=12, n_head=12, n_embd=768),  # 117M params
                # GPT-2 configs
                'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
                'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
                'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
                'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
                # Gophers
                'gopher-44m':   dict(n_layer=8, n_head=16, n_embd=512),
                # (there are a number more...)
                # tiny made-up models
                'gpt-mini':     dict(n_layer=6, n_head=6, n_embd=192),
                'gpt-micro':    dict(n_layer=4, n_head=4, n_embd=128),
                'gpt-nano':     dict(n_layer=3, n_head=3, n_embd=48),
            }
            # NOTE(review): merge_from_dict is defined outside this view; presumably
            # it copies these entries onto config - confirm.
            config.merge_from_dict(presets[config.model_type])

        self.transformer = nn.ModuleDict({
            "wte": nn.Embedding(config.vocab_size, config.n_embd),   # token embeddings
            "wpe": nn.Embedding(config.block_size, config.n_embd),   # position embeddings
            "drop": nn.Dropout(config.embd_pdrop),
            "h": nn.ModuleList([Transformer_Block(config) for _ in range(config.n_layer)]),
            "ln_f": nn.LayerNorm(config.n_embd),
        })
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # default init for every submodule, then a scaled init for the residual
        # projections, per GPT-2 paper
        self.apply(self._init_weights)
        for name, param in self.named_parameters():
            if not name.endswith('c_proj.weight'):
                continue
            torch.nn.init.normal_(param, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters (the decoder parameters in lm_head are not counted)
        n_params = sum(weight.numel() for weight in self.transformer.parameters())
        print("number of parameters: %.2fM" % (n_params/1e6,))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)


## Load Data

# Train