# Transformers!

Another architecture extensively used in NLP...A transformer can be viewed as a stack of encoders and decoders. I will be using the same dataset (English to Hindi Translation) but now I will be using a transformer to do so.

More articles to read for a better view:

http://jalammar.github.io/illustrated-transformer/

In [1]:
# for file I/O
import pandas as pd
import numpy as np
import os
import time


# for dataset and dataloader...
from collections import Counter
from torch.utils.data import Dataset,DataLoader,Subset
from torch.utils.data.dataset import random_split

# for model creation...
import torch
import torch.nn as nn
import torch.nn.functional as F


# for model training...
import torch.optim as optim
from tqdm import tqdm,tqdm_notebook

tqdm.pandas()


In [2]:
#DATA_PATH = os.path.join(os.getcwd(),"data","english_to_hindi.txt")
DATA_PATH = "D:\\PROJECTS\\Github\\nlp-basics\\data\\english_to_hindi.txt"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
def readFile(path,chkNa=True):
    """
        Load data from a text file. The file must have Lang1(delimiter)Lang2 in each row.
        Eg : "Hello Hallo" or "Hello Ola" {here, delimiter was space} 
    """
    try:
        df = pd.read_csv(path,header=None,sep="\t",names=["EN","HI"])
        if chkNa:
            print(df.isna().sum())
        return df
    except FileNotFoundError:
        print(f"{path} does not specify a text file.")    
    except OSError:
        print(f"{path} does not exist")

#checking to make sure...
df = readFile(DATA_PATH)
df.head()

EN    0
HI    0
dtype: int64


Unnamed: 0,EN,HI
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।


In [4]:
def clean(txt):
    unwanted = "~|\\/_।.?,*@#$%^&(){}[]=+\"-'"
    for char in unwanted:
        txt = txt.replace(char,' ')
    return txt

def tokenize(txt):
    txt = clean(txt) 
    tokens = txt.split()
    return tokens

for i in range(2008,2010):
    #print(df["HI"][i])
    print(tokenize(df["HI"][i]))
    print(tokenize(df["EN"][i]))

['मुझे', 'टिकटें', 'कहाँ', 'से', 'लेनीं', 'होंगीं']
['Where', 'should', 'I', 'pick', 'the', 'tickets', 'up']
['तुम', 'आज', 'सुबह', 'यहाँ', 'क्यों', 'आए']
['Why', 'did', 'you', 'come', 'here', 'this', 'morning']


In [5]:
class EngHinData(Dataset):    
    def __init__(self,path,maxVocabSize=500):
        """
            Read a text file from path and generate the input and target sequences
            Also generate english and hindi vocabulary with a max size.
            The most commonly occuring words are chosen.
        """
        self.maxVocabSize = maxVocabSize
        
        df = readFile(path,chkNa=False)
        self.df = self.tokenizeDf(df)
        
        #Generate a vocabulary for both languages...
        enVocab = self.mostFreqTokens(self.df.ENTokenized.tolist())
        hiVocab = self.mostFreqTokens(self.df.HITokenized.tolist())
        
        #Replace rare tokens with "<UNK>"
        self.replaceRareTokens(self.df)
        #Impute zero length targets...
        self.findZeroTargets()
        #Remove all datarows with >20% unknowns...
        self.df = self.removeHighUnk(self.df)
        
        # Create char maps and reverse char maps
        self.enEncoder,self.enDecoder = self.generateMaps(enVocab,rev=True)
        self.hiEncoder,self.hiDecoder = self.generateMaps(hiVocab,rev=True)
        
        # Add <BEG> and <END> to all tokens...
        self.appendExtras(self.df)
        
        # change tokens to indices...
        self.token2idx(self.df)
        
        # Drop all columns except num...
        self.df.drop(["level_0","index","EN","HI","ENTokenized","HITokenized"],axis=1,inplace=True)
        self.df.reset_index(inplace=True)
        
        
    
    def __getitem__(self,i):
        return self.df.ENNum[i],self.df.HINum[i]
    
    def __len__(self):
        return self.df.shape[0]
    
    
    def token2idx(self,df):
        df["ENNum"] = df.ENTokenized.apply(lambda tokenList: [self.enEncoder[token] for token in tokenList])
        df["HINum"] = df.HITokenized.apply(lambda tokenList: [self.hiEncoder[token] for token in tokenList])
    
    
    def appender(self,tokenList):
        tokenList.insert(0,"<BEG>")
        tokenList.append("<END>")
        return tokenList
    
        
    def appendExtras(self,df):
        """
            Adds <BEG> and <END> at the start and end of each tokenList
        """
        
        
        df.ENTokenized.apply(self.appender)
        df.HITokenized.apply(self.appender)
        
        
    
    def generateMaps(self,vocab,rev=False):
        """
            Generates a dictionary {char : idx}
            If rev is set to True, a reverse map will also be generated {idx : char}
        """
        extras = ["<PAD>","<BEG>","<END>","<UNK>"]    
        charMap = {char : idx for idx,char in enumerate(vocab)}
        for extra in extras:
            charMap[extra] = len(charMap)
        
        if not rev:
            return charMap
        else:
            revCharMap = {idx : char for char,idx in charMap.items()}
            return charMap,revCharMap 
        
    
    def tokenizeDf(self,df):
        df["ENTokenized"] = df.EN.apply(tokenize)
        df["HITokenized"] = df.HI.apply(tokenize)
        return df
    
    def replaceRareTokens(self,df):
        commonInputs = self.mostFreqTokens(df.ENTokenized.tolist())
        commonTargets = self.mostFreqTokens(df.HITokenized.tolist())
        
        df.loc[:, 'ENTokenized'] = df.ENTokenized.apply(
            lambda tokens: [token if token in commonInputs 
                            else "<UNK>" for token in tokens]
        )
        df.loc[:, 'HITokenized'] = df.HITokenized.apply(
            lambda tokens: [token if token in commonTargets
                            else "<UNK>" for token in tokens]
        )
        
    
    def mostFreqTokens(self,sequence):
        allTokens = [word for sent in sequence for word in sent]
        common_tokens = set(list(zip(*Counter(allTokens).most_common(self.maxVocabSize - 4)))[0])
        return common_tokens
    
    def removeHighUnk(self, df, threshold=0.8):
        """Remove sequences with mostly <UNK>."""
        calculate_ratio = (
            lambda tokens: sum(1 for token in tokens if token != '<UNK>')/ len(tokens) > threshold
        )
        
        df = df[df.ENTokenized.apply(calculate_ratio)]
        df = df[df.HITokenized.apply(calculate_ratio)]
        df.reset_index(inplace=True)
        return df
    
        
    def findZeroTargets(self):
        badVals = []
        for i,val in enumerate(self.df.HITokenized.values):
            if len(val)==0:
                badVals.append(i)
        
        print(f"Found {len(badVals)} bad values...Imputing them...")
        self.df.drop(badVals,axis=0,inplace=True)
        self.df.reset_index(inplace=True)

In [6]:
ds = EngHinData(DATA_PATH,10000)

Found 2 bad values...Imputing them...


In [7]:
train_size = int(0.99 * len(ds))
test_size = len(ds) - train_size
train_ds, test_ds = torch.utils.data.random_split(ds, [train_size, test_size])

In [8]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

device = 'cuda' if torch.cuda.is_available() else 'cpu'

def collate(batch):
    
    inputs = [torch.LongTensor(item[0]) for item in batch]
    targets = [torch.LongTensor(item[1]) for item in batch]
    
    
    # Pad sequencse so that they are all the same length (within one minibatch)
    padded_inputs = pad_sequence(inputs, padding_value=ds.enEncoder["<PAD>"], batch_first=True)
    padded_targets = pad_sequence(targets, padding_value=ds.hiEncoder["<PAD>"], batch_first=True)
    
    
    # Sort by length for CUDA optimizations
    lengths = torch.LongTensor([len(x) for x in inputs])
    lengths, permutation = lengths.sort(dim=0, descending=True)

    return padded_inputs[permutation].to(device), padded_targets[permutation].to(device), lengths.to(device)


batchSize = 512
train_dl = DataLoader(train_ds, batch_size=batchSize, collate_fn=collate)
test_dl = DataLoader(test_ds, batch_size=1, collate_fn=collate)

In [9]:
class Embedder(nn.Module):
    
    def __init__(self,vocabSize,embDims):
        super(Embedder,self).__init__()
        self.embDims = embDims
        self.emb = nn.Embedding(vocabSize,embDims)
    
    def forward(self,inputs):
        return self.emb(inputs).cuda()
        

In [10]:
import math
from torch.autograd import Variable

class PositionEmbedding(nn.Module):
    
    def __init__(self,seqLen,embDims,dropout=0.1):
        super(PositionEmbedding,self).__init__()
        
        self.dropout = nn.Dropout(dropout)
        
        pe = torch.zeros(seqLen, embDims)
        
        for pos in range(seqLen):
            for i in range(0, embDims, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/embDims)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/embDims)))
        
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
        
    def forward(self,inputs):
        
        # inputs(torch.Tensor): (batchSize x seqLen x embDims)
    
        seqLen = inputs.size(1)
        pe = Variable(self.pe[:,:seqLen],requires_grad=False).cuda()
        inputs = inputs + pe
        return self.dropout(inputs).cuda()        

In [11]:
def createMask(inputs,targets,opt):
    inputMask = (inputs != opt[0]).unsqueeze(-2)

    if targets is not None:
        targetMask = (targets != opt[1]).unsqueeze(-2)
        size = targets.size(1)
        peekMask = getPeekMask(size,opt).cuda()
        targetMask = targetMask & peekMask
    
    else:
        targetMask = None

    return inputMask,targetMask

def getPeekMask(size,opt):

    # Get upper triangle of ones...
    peekMask = np.triu(np.ones((1,size,size)),k=1).astype('uint8')
    # create the peek mask...
    peekMask = Variable(torch.from_numpy(peekMask) == 0).cuda()
    return peekMask



## Multi-head Attention layer

Its a way of trying to find the word in a source language most important in current context for the target language.

For eg:

        **En**                **De**
The cat ate the mouse.          Die Katze aß die Maus.

Here, if the machine was predicting the 2nd word (Katze), the word to pay "attention" to in the source sentence would be "Cat".
For more complicated sentences, one word can have multiple such words to pay attention to. This "selection" of words is done by the Attention layer. When we use multiple such "selectors" each trainable according to the data, we get a multi-head attention layer.

A "single head" Attention layer uses three values to achieve this:
1. K or keys
2. V or values
3. Q or queries


Q is the embedded input recieved, K and V are learned parameters. We take inner product of K and Q, apply softmax and multiply the value to V to get the attention vector...



In [12]:
class MultiHeadAttention(nn.Module):
    """
        A multi-head attention layer as shown in the 2017 paper "Attention is all you need"
    """
    def __init__(self,nHeads,embDims,dropout=0.2):
        """
            The layer is basically a [batchSize x nHeads x seqLen x (embDims/nHeads)] tensor
        """
        super(MultiHeadAttention,self).__init__()

        self.embDims = embDims
        self.splitDims = embDims//nHeads
        self.nHeads = nHeads

        self.queries = nn.Linear(embDims,embDims)
        self.keys = nn.Linear(embDims,embDims)
        self.values = nn.Linear(embDims,embDims)

        self.dropout = nn.Dropout(dropout)
        self.finalFC = nn.Linear(embDims,embDims)

    
    def forward(self,q,k,v,mask=None):
        """ 
            Define the forward pass...
        """
        batchSize = queries.size(0)

        k = self.keys(k).view(batchSize,-1,self.nHeads,self.splitDims)
        q = self.queries(q).view(batchSize,-1,self.nHeads,self.splitDims)
        v = self.keys(v).view(batchSize,-1,self.nHeads,self.splitDims)

        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)

        score = self.attention(q,k,v,mask)
        out = score.transpose(1,2).contiguous().view(batchSize,-1,self.embDims)
        return self.finalFC(out)


    def attention(q,k,v,mask):
        score = torch.matmul(q,k.transpose(-2,-1)) / math.sqrt(self.splitDims)
        # If the mask is present...
        if mask is not None:
            mask.unsqueeze(1)
            score = score.masked_fill(mask==0, -1e9)
        
        score = F.softmax(score,dim=-1)
        score = self.dropout(score)
        return torch.matmul(score,v)


In [13]:
class FeedFwd(nn.Module):
    def __init__(self,embDims,ffSize=1024,dropout=0.1):
        super(FeedFwd,self).__init__()

        self.fc1 = nn.Linear(embDims,ffSize)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(ffSize,embDims)

    def forward(self,inputs):
        tmp = F.relu(self.fc1(inputs))        
        return self.fc2(self.dropout(tmp))


In [14]:
class Normalizer(nn.Module):

    def __init__(self,size,epsilon=1e-6):
        super(Normalizer,self).__init__()
        self.size = size
        self.a = nn.Parameter(torch.ones(size))
        self.b = nn.Parameter(torch.zeros(size))
        self.epsilon = epsilon

    def forward(self,inputs):
        # find out the mean and std dev...
        mu = x.mean(dim=-1,keepdim=True)
        sigma = x.std(dim=-1,keepdim=True)

        # Add two learnable parameters to the Z score...
        return self.a * (x - mu)/(sigma+self.epsilon) + self.b

In [15]:
class Encoder(nn.Module):

    def __init__(self,embDims,nHeads,dropout=0.1):
        super(Encoder,self).__init__()


        self.norm1 = Normalizer(embDims)
        self.norm2 = Normalizer(embDims)

        self.MHA = MultiHeadAttention(nHeads,embDims,dropout)
        self.fc = FeedFwd(embDims)
        
        self.drop = nn.Dropout(dropout)
    
    def forward(self,inputs,mask):
        # First normalize the data...
        inputNorm1 = self.norm1(inputs)
        # Send this to attention layer...
        attn = self.MHA(inputNorm,inputNorm,inputNorm,mask)
        #Add a shortcut path...
        inputs = inputs + attn
        #Normalize the data again...
        inputNorm2 = self.norm2(inputs)
        # Repeat with feed fwd layer...
        return (inputs + self.drop(self.fc(inputNorm2)))

In [16]:
class Decoder(nn.Module):

    def __init__(self,nHeads,embDims,dropout=0.1):
        super(Decoder,self).__init__()
        self.norm1 = Normalizer(embDims)
        self.norm2 = Normalizer(embDims)
        self.norm3 = Normalizer(embDims)


        self.MHA1 = MultiHeadAttention(nHeads,embDims,dropout)
        self.MHA2 = MultiHeadAttention(nHeads,embDims,dropout)
        self.fc = FeedFwd(embDims)
        
        self.drop = nn.Dropout(dropout)

    def forward(self,inputs,encOutputs,srcMask,trgMask):
        # Normalize the data...
        inputNorm1 = self.norm1(inputs)
        # pass it to first MHA...
        inputs = inputs + self.MHA1(inputNorm1,inputNorm1,inputNorm1,trgMask)
        inputNorm2 = self.norm2(inputs)

        #pass it to MHA that takes encOutput...
        inputs = inputs + self.MHA2(inputNorm2,encOutputs,encOutputs,srcMask)
        inputNorm3 = self.norm3(inputs)

        #pass it through the feew forward layer... 
        return (inputs + self.drop(self.fc(inputNorm3)))
        

In [17]:
import copy
def cloneLayer(module,N):
    """
        Creates N deep copies of a "module" layer 
    """
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

In [18]:
class EncoderStack(nn.Module):
    def __init__(self,vocabSize,seqLen,embDims,nHeads,stackSize):
        super(EncoderStack,self).__init__()
        self.stackSize = stackSize

        # The embedding layers...
        self.emb = Embedder(vocabSize,embDims)
        self.pe = PositionEmbedding(seqLen,embDims)

        # The stack of encoders...
        self.layers = cloneLayer(Encoder(embDims,nHeads),stackSize)

        # A final normalizer...
        self.norm = Normalizer(embDims)

    
    def forward(self,inputs,mask):
        
        inputs = self.emb(inputs)
        inputs = self.pe(inputs)
        
        for layer in self.layers:
            inputs = layer(inputs,mask)

        return self.norm(inputs)


In [19]:
class DecoderStack(nn.Module):
    def __init__(self,seqLen,vocabSize,embDims,nHeads,stackSize):
        super(DecoderStack,self).__init__()
        self.stackSize = stackSize

        self.emb = Embedder(vocabSize,embDims)
        self.pe = PositionEmbedding(seqLen,embDims)

        self.layers = cloneLayer(Decoder(embDims,nHeads),stackSize)

        self.norm = Normalizer(embDims)
    
    def forward(self,inputs,encOutputs,srcMask,trgMask):

        inputs = self.emb(inputs)
        inputs = self.pe(inputs)

        for layer in self.layers:
            inputs = layer(inputs,encOutputs,srcMask,trgMask)
        
        return self.norm(inputs)

## The transformer....finally!

After hours of solving problems, fixing tensor sizes and scouring through SO for the solutions, finally its here :')

In [20]:
class Transformer(nn.Module):
    def __init__(self,srcVocab,trgVocab,seqLen,embDims,nHeads,stackSize):
        super(Transformer,self).__init__()

        self.encoder = EncoderStack(srcVocab,seqLen,embDims,nHeads,stackSize)
        self.decoder = DecoderStack(trgVocab,seqLen,embDims,nHeads,stackSize)
        self.out = nn.Linear(embDims,trgVocab)

    
    def forward(self,inputs,targets,srcMask,trgMask):
        self.encOutputs = self.encoder(inputs,srcMask)
        self.decOutputs = self.decoder(inputs,encOutputs,srcMask,trgMask)
        outputs = self.out(decOutputs)
        return outputs


In [21]:
embDims = 300
nHeads = 8
stackSize = 6

x,y,l = next(iter(train_dl))

seqLen = x.size(1)
srcVocab = ds.maxVocabSize
trgVocab = ds.maxVocabSize

myModel = Transformer(srcVocab,trgVocab,seqLen,embDims,nHeads,stackSize)
for p in myModel.parameters():
    if p.dim() > 1:
        nn.init.kaiming_normal_(p)

optim = torch.optim.Adam(myModel.parameters(),lr=0.001)

In [22]:
def trainModel(model,train_dl,optimizer,epochs):
    model.train()
    start = time.time()
    temp = start
    totalLoss = []
    progressBar = tqdm_notebook(train_dl)
    opt = [ds.enEncoder["<PAD>"],ds.hiEncoder["<PAD>"]]


    for epoch in range(epochs):
        epochLoss = 0
        for x,y,l in progressBar:
            optimizer.zero_grad()

            x.transpose(0,1)
            y.transpose(0,1)
            yIn = y[:,:-1]
            y = y[:,1:].contiguous().view(-1)

            srcMask,trgMask = createMask(x,yIn,opt)

            preds = model(x,yIn,srcMask,trgMask)
            loss = F.cross_entropy(preds.view(-1,preds.size(-1)),y,ignore_index=trg_pad)
            loss.backward()
            progressBar.set_description(f"Loss : {loss.item():.3f}")
            optimizer.step()
            epochLoss += loss.item()
        
        totalLoss.append(epochLoss)


trainModel(myModel,train_dl,optim,5)


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

RuntimeError: Expected object of backend CPU but got backend CUDA for argument #3 'index'