In [1]:
'''This jupyter notebook contains:
1. Data preprocessing
2. Transformer Encoder
3. Transformer Decoder
4. Training
5. Evaluation'''

'This jupyter notebook contains:\n1. Data preprocessing\n2. Transformer Encoder\n3. Transformer Decoder\n4. Training\n5. Evaluation'

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import sys
import random
from torch.utils.data import TensorDataset
from tqdm import tqdm
import copy
import math
import os
import time

In [3]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
'''function to load the data'''
def load_data(path,language_names):
    df=pd.read_csv(path,header=None)
    df.columns=language_names
    return df

In [5]:
'''Here,basically,the input is given to the encoder in English language and is transliterated to Hindi
by the decoder'''
path_train="hin_train.csv"
language_names = ['English','transliteration_in_hindi']
df_train=load_data(path_train,language_names)
print(df_train.shape)
df_train

(51200, 2)


Unnamed: 0,English,transliteration_in_hindi
0,shastragaar,शस्त्रागार
1,bindhya,बिन्द्या
2,kirankant,किरणकांत
3,yagyopaveet,यज्ञोपवीत
4,ratania,रटानिया
...,...,...
51195,toned,टोंड
51196,mutanaazaa,मुतनाज़ा
51197,asahmaton,असहमतों
51198,sulgaayin,सुलगायीं


In [6]:
path_test="hin_test.csv"
path_validation="hin_valid.csv"
df_validation=load_data(path_validation,language_names)
print('Validation data:',df_validation.shape)
df_test=load_data(path_test,language_names)
print('Test data:', df_test.shape)

Validation data: (4096, 2)
Test data: (4096, 2)


In [7]:
'''Function for acquiring all the characters of the given data'''
def split_words(x):
    x=np.array(x)
    alpha=['_','\t','\n',' '] #pad token, start of word, end of word and unknown tokens respectively
    b=[]
    for i in range(x.shape[0]):
        a=list(x[i])
        for j in range(len(a)):
            if a[j] not in b:
                b.append(a[j])
    b=sorted(b)
    alpha=alpha+b
    return alpha

In [8]:
'''All the english characters are stored into the list english_vocab and all the hindi characters are
stored into the list hindi_vocab'''
english_vocab=split_words(df_train['English'])
hindi_vocab=split_words(df_train['transliteration_in_hindi'])
print('English vocabulary size:', len(english_vocab))
print('Hindi vocabulary size:', len(hindi_vocab))
print(english_vocab)
print(hindi_vocab)

English vocabulary size: 30
Hindi vocabulary size: 68
['_', '\t', '\n', ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['_', '\t', '\n', ' ', 'ँ', 'ं', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ऑ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'ळ', 'व', 'श', 'ष', 'स', 'ह', '़', 'ऽ', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॅ', 'े', 'ै', 'ॉ', 'ो', 'ौ', '्']


In [9]:
'''Functions to create the vocabulary dictionaries with their corresponding indices'''
def int_to_char(vocab):
    int2char={} #padding token, start of word, end of word token and unknown token
    for i in range(len(vocab)):
        int2char[i]=vocab[i]
    return int2char

In [10]:
int2char_eng=int_to_char(english_vocab)
print(int2char_eng)

{0: '_', 1: '\t', 2: '\n', 3: ' ', 4: 'a', 5: 'b', 6: 'c', 7: 'd', 8: 'e', 9: 'f', 10: 'g', 11: 'h', 12: 'i', 13: 'j', 14: 'k', 15: 'l', 16: 'm', 17: 'n', 18: 'o', 19: 'p', 20: 'q', 21: 'r', 22: 's', 23: 't', 24: 'u', 25: 'v', 26: 'w', 27: 'x', 28: 'y', 29: 'z'}


In [11]:
def char_to_int(int2char):
    char2int={ch:ii for ii,ch in int2char.items()}
    return char2int

In [12]:
char2int_eng=char_to_int(int2char_eng)
print(char2int_eng)

{'_': 0, '\t': 1, '\n': 2, ' ': 3, 'a': 4, 'b': 5, 'c': 6, 'd': 7, 'e': 8, 'f': 9, 'g': 10, 'h': 11, 'i': 12, 'j': 13, 'k': 14, 'l': 15, 'm': 16, 'n': 17, 'o': 18, 'p': 19, 'q': 20, 'r': 21, 's': 22, 't': 23, 'u': 24, 'v': 25, 'w': 26, 'x': 27, 'y': 28, 'z': 29}


In [13]:
int2char_hin=int_to_char(hindi_vocab)
print(int2char_hin)

{0: '_', 1: '\t', 2: '\n', 3: ' ', 4: 'ँ', 5: 'ं', 6: 'ः', 7: 'अ', 8: 'आ', 9: 'इ', 10: 'ई', 11: 'उ', 12: 'ऊ', 13: 'ऋ', 14: 'ए', 15: 'ऐ', 16: 'ऑ', 17: 'ओ', 18: 'औ', 19: 'क', 20: 'ख', 21: 'ग', 22: 'घ', 23: 'ङ', 24: 'च', 25: 'छ', 26: 'ज', 27: 'झ', 28: 'ञ', 29: 'ट', 30: 'ठ', 31: 'ड', 32: 'ढ', 33: 'ण', 34: 'त', 35: 'थ', 36: 'द', 37: 'ध', 38: 'न', 39: 'प', 40: 'फ', 41: 'ब', 42: 'भ', 43: 'म', 44: 'य', 45: 'र', 46: 'ल', 47: 'ळ', 48: 'व', 49: 'श', 50: 'ष', 51: 'स', 52: 'ह', 53: '़', 54: 'ऽ', 55: 'ा', 56: 'ि', 57: 'ी', 58: 'ु', 59: 'ू', 60: 'ृ', 61: 'ॅ', 62: 'े', 63: 'ै', 64: 'ॉ', 65: 'ो', 66: 'ौ', 67: '्'}


In [14]:
char2int_hin=char_to_int(int2char_hin)
print(char2int_hin)

{'_': 0, '\t': 1, '\n': 2, ' ': 3, 'ँ': 4, 'ं': 5, 'ः': 6, 'अ': 7, 'आ': 8, 'इ': 9, 'ई': 10, 'उ': 11, 'ऊ': 12, 'ऋ': 13, 'ए': 14, 'ऐ': 15, 'ऑ': 16, 'ओ': 17, 'औ': 18, 'क': 19, 'ख': 20, 'ग': 21, 'घ': 22, 'ङ': 23, 'च': 24, 'छ': 25, 'ज': 26, 'झ': 27, 'ञ': 28, 'ट': 29, 'ठ': 30, 'ड': 31, 'ढ': 32, 'ण': 33, 'त': 34, 'थ': 35, 'द': 36, 'ध': 37, 'न': 38, 'प': 39, 'फ': 40, 'ब': 41, 'भ': 42, 'म': 43, 'य': 44, 'र': 45, 'ल': 46, 'ळ': 47, 'व': 48, 'श': 49, 'ष': 50, 'स': 51, 'ह': 52, '़': 53, 'ऽ': 54, 'ा': 55, 'ि': 56, 'ी': 57, 'ु': 58, 'ू': 59, 'ृ': 60, 'ॅ': 61, 'े': 62, 'ै': 63, 'ॉ': 64, 'ो': 65, 'ौ': 66, '्': 67}


In [15]:
'''Finding the maximum sequence length'''
length_eng=[len(i) for i in df_train['English']]
length_hin=[len(i) for i in df_train['transliteration_in_hindi']]

In [16]:
length_eng_max=max(length_eng)+2 #we have to account for the start and end token
print(f'The maximum sequence length of English words is {length_eng_max}')
length_hin_max=max(length_hin)+2
print(f'The maximum sequence length of transliterated words is {length_hin_max}')

The maximum sequence length of English words is 26
The maximum sequence length of transliterated words is 22


In [17]:
def process_data(df,english_vocab=english_vocab,hindi_vocab=hindi_vocab,
                 length_eng_max=length_eng_max,length_hin_max=length_hin_max,char2int_eng=char2int_eng
                 ,char2int_hin=char2int_hin):
    
    '''removing words of length more than max length'''
    df['English'] = df['English'].str.lower()
    df['transliteration_in_hindi'] = df['transliteration_in_hindi'].str.lower()
    df = df[df['English'].apply(len) <= length_eng_max-2]
    df = df[df['transliteration_in_hindi'].apply(len) <= length_hin_max-2]
    
    '''Adding start and end of word tokens'''
    y_og = df['transliteration_in_hindi'].values #The data type of y_og will be numpy array
    x_og = df['English'].values
    x = '\t'+x_og+'\n'
    y = '\t'+y_og+'\n'
    y_do=y_og+'\n' #This is for the decoder output
    unknown=3
    pad=0
    pad_char='_'
    unknown_char=' '
    start=1
    end=2
    
    enc_input_data=torch.zeros(len(x),length_eng_max)
    dec_input_data=torch.zeros(len(y),length_hin_max)
    dec_output_data=torch.zeros(len(y),length_hin_max)
    for i, (xx,yy) in enumerate(zip(x,y)):
        for j,char in enumerate(xx):
            enc_input_data[i,j]=char2int_eng[char]
            
        #pad character is zero so no need of assigning it again
        for j,char in enumerate(yy):
            if char in hindi_vocab:
                dec_input_data[i,j]=char2int_hin[char]
            else:
                dec_input_data[i,j]=char2int_hin[unknown_char] #There are chances that unknown char would come in the test data
    
    for i, (xx,yy) in enumerate(zip(x,y_do)):
        for j,char in enumerate(yy):
            if char in hindi_vocab:
                dec_output_data[i,j]=char2int_hin[char]
            else:
                dec_input_data[i,j]=char2int_hin[unknown_char]
                
    return enc_input_data,dec_input_data,dec_output_data

In [18]:
enc_input_data,dec_input_data,dec_output_data=process_data(df_train)

In [19]:
print(enc_input_data.shape)
print(dec_input_data.shape)
print(dec_output_data.shape)

torch.Size([51200, 26])
torch.Size([51200, 22])
torch.Size([51200, 22])


In [20]:
len(english_vocab)

30

In [21]:
def one_hot_encoding(df,english_vocab=english_vocab,hindi_vocab=hindi_vocab,
                 length_eng_max=length_eng_max,length_hin_max=length_hin_max,char2int_eng=char2int_eng
                 ,char2int_hin=char2int_hin):
    
    
    '''removing words of length more than max length'''
    df = df[df['English'].apply(len) <= length_eng_max-2]
    df = df[df['transliteration_in_hindi'].apply(len) <= length_hin_max-2]
    '''Adding start and end of word tokens'''
    y = df['transliteration_in_hindi'].values
    x= df['English'].values
    x = '\t'+x+'\n'
    y = '\t'+y+'\n'
    
    unknown=3
    pad=0
    pad_char='_'
    unknown_char=' '
    start=1
    end=2
    num_english_tokens = len(english_vocab)
    num_hindi_tokens = len(hindi_vocab)
    
    encoder_input_data = np.zeros(
    (len(df['English']), length_eng_max, num_english_tokens), dtype="float32")
    decoder_input_data = np.zeros(
    (len(df['transliteration_in_hindi']), length_hin_max, num_hindi_tokens), dtype="float32")
    decoder_output_data = np.zeros(
    (len(df['transliteration_in_hindi']), length_hin_max, num_hindi_tokens), dtype="float32")
   
    for i , (input_text,target_text) in enumerate(zip(x,y)):
        for t,char in enumerate(input_text):
            encoder_input_data[i,t,char2int_eng[char]]=1
        encoder_input_data[i,t+1:,char2int_eng[pad_char]]=1
    
        for t,char in enumerate(target_text):
            if char in hindi_vocab:
                decoder_input_data[i,t,char2int_hin[char]]=1
            else:
                decoder_input_data[i,t,char2int_hin[unknown_char]]=1
        decoder_input_data[i,t+1:,char2int_hin[pad_char]]=1
    
        '''decoder target data is one step ahead of decoder input data by one timestep
        and doesnot includes start token'''
        for t,char in enumerate(target_text):
            if t>0:
                if char in hindi_vocab:
                    decoder_output_data[i,t-1,char2int_hin[char]]=1
                else:
                    decoder_output_data[i,t-1,char2int_hin[unknown_char]]=1
                
        decoder_output_data[i,t:,char2int_hin[pad_char]]=1
    
    return torch.tensor(encoder_input_data),torch.tensor(decoder_input_data),torch.tensor(decoder_output_data)
    
    

In [22]:
encoder_input_data,decoder_input_data,decoder_output_data=one_hot_encoding(df_train)

In [23]:
print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_output_data.shape)

torch.Size([51200, 26, 30])
torch.Size([51200, 22, 68])
torch.Size([51200, 22, 68])


In [24]:
enc_input_data_test,dec_input_data_test,dec_output_data_test=process_data(df_test)
enc_input_data_val,dec_input_data_val,dec_output_data_val=process_data(df_validation)
encoder_input_data_test,decoder_input_data_test,decoder_output_data_test=one_hot_encoding(df_test)
encoder_input_data_val,decoder_input_data_val,decoder_output_data_val=one_hot_encoding(df_validation)

In [25]:
print(encoder_input_data_val.shape)
print(decoder_input_data_val.shape)
print(decoder_output_data_val.shape)

torch.Size([4096, 26, 30])
torch.Size([4096, 22, 68])
torch.Size([4096, 22, 68])


In [26]:
print(torch.argmax(decoder_input_data,dim=-1))
print(dec_input_data)

tensor([[ 1, 49, 51,  ...,  0,  0,  0],
        [ 1, 41, 56,  ...,  0,  0,  0],
        [ 1, 19, 56,  ...,  0,  0,  0],
        ...,
        [ 1,  7, 51,  ...,  0,  0,  0],
        [ 1, 51, 58,  ...,  0,  0,  0],
        [ 1,  7,  5,  ...,  0,  0,  0]])
tensor([[ 1., 49., 51.,  ...,  0.,  0.,  0.],
        [ 1., 41., 56.,  ...,  0.,  0.,  0.],
        [ 1., 19., 56.,  ...,  0.,  0.,  0.],
        ...,
        [ 1.,  7., 51.,  ...,  0.,  0.,  0.],
        [ 1., 51., 58.,  ...,  0.,  0.,  0.],
        [ 1.,  7.,  5.,  ...,  0.,  0.,  0.]])


In [27]:
enc_input_data=enc_input_data.long()
dec_input_data=dec_input_data.long()
enc_input_data_test=enc_input_data_test.long()
dec_input_data_test=dec_input_data_test.long()
enc_input_data_val=enc_input_data_val.long()
dec_input_data_val=dec_input_data_val.long()
encoder_input_data=encoder_input_data.long()
decoder_input_data=decoder_input_data.long()
decoder_output_data=decoder_output_data.long()

In [28]:
dec_input_data_val

tensor([[ 1, 26, 44,  ...,  0,  0,  0],
        [ 1, 41, 26,  ...,  0,  0,  0],
        [ 1, 51,  5,  ...,  0,  0,  0],
        ...,
        [ 1, 14, 19,  ...,  0,  0,  0],
        [ 1, 41, 67,  ...,  0,  0,  0],
        [ 1, 21, 65,  ...,  0,  0,  0]])

In [29]:
dec_output_data_val

tensor([[26., 44., 51.,  ...,  0.,  0.,  0.],
        [41., 26., 55.,  ...,  0.,  0.,  0.],
        [51.,  5., 22.,  ...,  0.,  0.,  0.],
        ...,
        [14., 19., 55.,  ...,  0.,  0.,  0.],
        [41., 67., 46.,  ...,  0.,  0.,  0.],
        [21., 65., 48.,  ...,  0.,  0.,  0.]])

In [30]:
'''
Here, the implementation is based on the paper: Attention is all you need
The original base attention transformer has the following structure:

Main dimension of model-embeddings (d_model): 512
Number of attention heads: 8
Number of encoder layers: 6
Number of decoder layers: 6
Hidden dimension of feed-forward layers: 2048
Dropout probability: 0.1
'''

'\nHere, the implementation is based on the paper: Attention is all you need\nThe original base attention transformer has the following structure:\n\nMain dimension of model-embeddings (d_model): 512\nNumber of attention heads: 8\nNumber of encoder layers: 6\nNumber of decoder layers: 6\nHidden dimension of feed-forward layers: 2048\nDropout probability: 0.1\n'

In [82]:
def mask_generator(x, heads, seq_len_dec = None, type = 'encoder_selfattention'):

    '''
    Here, the output will be of the format (bs, heads, seq_len, seq_len), which can be used for masking in the multihead attention block
    before applying softmax function
    types : Encoder self attention(bs, seq_len, enc_seqlen, enc_seqlen)
            Decoder self attention(bs, seq_len, dec_seqlen, dec_seqlen)
            Decoder cross attention(bs, seq_len, dec_seqlen, enc_seqlen)
    '''
    
    if type == 'encoder_selfattention':
        batch_size = x.shape[0]
        seq_len = x.shape[1]
        pad_idx = 0
        mask = (x == pad_idx)
        '''
        Expand the mask for attention (batch_size, num_heads, seq_len, seq_len); after doing the operation q.kT the shape will be this
        '''
        mask = mask.unsqueeze(1).unsqueeze(2).expand(batch_size, heads, seq_len, seq_len).detach().to(device)
        
        
    elif type == 'decoder_selfattention':
        '''Here, this masking is used in decoder to avoid the present tokens seeing the future ones'''
        batch_size = x.shape[0]
        seq_len = x.shape[1]
        mask = torch.triu(torch.ones(seq_len,seq_len)*float('-inf'), diagonal = 1).bool().expand(batch_size, heads, seq_len, seq_len).detach().to(device)

    elif type  == 'decoder_crossattention':
        '''Here, we use the padding_mask of encoder and the sequence length should be that of encoder
        output : (batch size, heads, decoder sequence length, encoder sequence langth'''
        batch_size = x.shape[0]
        seq_len_enc = x.shape[1]
        pad_idx = 0
        mask = (x == pad_idx)
        mask = mask.unsqueeze(1).unsqueeze(2).expand(batch_size, heads, seq_len_dec, seq_len_enc).detach().to(device)
        
    return mask


In [83]:
def positional_embedding(seq_length, d_model):
    '''
    
    shape : (seq_len, d_model)
    Gradient won't flow through positional embedding
    '''
    pe = torch.zeros((seq_length, d_model), requires_grad = False)
    for pos in range(seq_length):
        for i in range(0, d_model, 2):
            pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
            pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
    pe = pe.unsqueeze(0)

    return pe


In [84]:
class multiheadattention(nn.Module):
    
    def __init__(self,d_model, h, dropout):
        
        super(multiheadattention, self).__init__()
        self.d_model = d_model
        self.heads = h
        self.d_k = d_model//h
        self.q = nn.Linear(d_model,d_model, bias = False)
        self.k = nn.Linear(d_model, d_model, bias = False)
        self.v = nn.Linear(d_model, d_model, bias = False)
        self.outputprojectionlayer = nn.Linear(d_model,d_model)
        self.dropout = nn.Dropout(dropout)


    def forward(self,w, mask):
        '''x: Input data after performing positional and word embeddings and adding both
           att_scores : output of same shape
           d_model shpuld be divisible by number of heads
        '''
        q = self.q(w[0])
        k = self.k(w[1])
        v = self.v(w[2])

        
        bs = q.shape[0]
        seq_len1 = w[0].shape[1]
        seq_len2 = w[1].shape[1]
        q = q.view(bs, seq_len1, self.heads, self.d_k).transpose(1, 2)
        k = k.view(bs, seq_len2, self.heads, self.d_k).transpose(1, 2)
        v = v.view(bs, seq_len2, self.heads, self.d_k).transpose(1, 2)
       
        

        att_scores = torch.matmul(q,k.transpose(-2,-1))/math.sqrt(self.d_k)
      
        
        if mask is not None:
            att_scores = att_scores.masked_fill(mask,float('-inf'))
       
        att_scores = F.softmax(att_scores, dim = -1)
        att_scores = self.dropout(att_scores)
        att = torch.matmul(att_scores, v)
        
      
        
        #Concatenating the heads to form the shape of (bs, seq_len, d_model)
        att = att.transpose(1, 2).contiguous().view(bs, seq_len1, self.d_model)
        
        output = self.outputprojectionlayer(att)
        
        return output
        
        
        

In [85]:
class pointwiseffnn(nn.Module):

    def __init__(self, d_model, d_hidden):
        super(pointwiseffnn, self).__init__()
        self.d_hidden = d_hidden
        self.d_model = d_model
        self.linearlayer1 = nn.Linear(d_model, d_hidden)
        self.linearlayer2 = nn.Linear(d_hidden, d_model)
        self.relu = nn.ReLU()
        

    def forward(self, x):
        '''
        This is the pointwise feedforward layer after attention block, which consists of 2 linear layers, a relu
        '''
        x = self.linearlayer1(x)
        x = self.relu(x)
        x = self.linearlayer2(x)
        return x

In [86]:
class Encoderlayer(nn.Module):

    def __init__(self, d_model, h, d_hidden, dropout):
        '''
        d_model : Embedding dimension(512)
        h : Number of heads(8)
        N : number of attention layers(6)
        d_k : d_model/h
        seq : Maximum sequence length of English(26)
        vocab_size : Number of English characters in the corpus(30)
        epsilon : used for layer normalization (10e-5)
        '''
        super(Encoderlayer, self).__init__()
        self.d_model = d_model
        self.heads = h
        self.d_k = d_model//h
        self.d_hidden = d_hidden
        self.mha = multiheadattention(d_model, h, dropout)
        self.layernorm1 = nn.LayerNorm(self.d_model)
        self.pffnn = pointwiseffnn(d_model, d_hidden)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layernorm2 = nn.LayerNorm(self.d_model)


    def forward(self, x, sam):
            
        x_mha = self.mha([x, x, x], sam)
        x_mha = self.dropout1(x_mha)
        x_out1 = self.layernorm1(x + x_mha)
        
        x_pffnn = self.pffnn(x_out1)
        x_pffnn = self.dropout2(x_pffnn)
        x_out2 = self.layernorm2(x_out1 + x_pffnn)

        return x_out2


In [87]:
'''This function can be used to create multiple encoder blocks with different weights, here we need 6 encoder blocks'''
def multipleblocks(block, N):
    
    return nn.ModuleList([copy.deepcopy(block) for i in range(N)])
        

In [88]:
class Encoder(nn.Module):

    def __init__(self, d_model, h, vocab_size_encoder, N, d_hidden, dropout):
        '''
        d_model : Embedding dimension(512)
        h : Number of heads(8)
        N : number of attention layers(6)
        d_k : d_model/h
        seq : Maximum sequence length of English(26)
        vocab_size : Number of English characters in the corpus(30)
        epsilon : used for layer normalization (10e-5)
        pe_encoder : Positional embedding of Encoder input
        
        '''
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.heads = h
        self.d_k = d_model//h
        self.vocab_size_encoder = vocab_size_encoder
        self.N = N
        self.d_hidden = d_hidden
        self.embedding = nn.Embedding(vocab_size_encoder, d_model, padding_idx = 0)
        self.layers = multipleblocks(Encoderlayer(d_model, h, d_hidden, dropout), N)
        
        assert self.d_model % self.heads == 0


    def forward(self, x, sam, pe):
        '''
        x : Encoder input embedded
        sam : self attention padding mask
        '''
        x_wv = self.embedding(x)
        x = x_wv + pe

        for i in range(self.N):
            x = self.layers[i](x, sam)
        return x



In [89]:
class Decoderlayer(nn.Module):

    def __init__(self, d_model, h, d_hidden, dropout):
       
        '''
        d_model : Embedding dimension(512)
        h : Number of heads(8)
        N : number of attention layers(6)
        d_k : d_model/h
        seq : Maximum sequence length of English(26)
        vocab_size : Number of English characters in the corpus(30)
        epsilon : used for layer normalization (10e-5)
        '''
        super(Decoderlayer, self).__init__()
        self.d_model = d_model
        self.heads = h
        self.d_hidden = d_hidden
        self.mha_self = multiheadattention(d_model, h, dropout)
        self.mha_cross = multiheadattention(d_model, h, dropout)
        self.layernorm1 = nn.LayerNorm(self.d_model)
        self.pffnn = pointwiseffnn(d_model, d_hidden)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.layernorm2 = nn.LayerNorm(self.d_model)
        self.layernorm3 = nn.LayerNorm(self.d_model)


    def forward(self, x, y, sam, cam):
        '''
        x : embedded decoder input
        y : encoder output
        sam : Self Attention mask
        cam : cross attention mask
        
        '''
            
        x_mha_self = self.mha_self([x, x, x], sam)
        x_mha_self = self.dropout1(x_mha_self)
        x_out1 = self.layernorm1(x + x_mha_self)
        #print('after self',x_out1.shape)
        
        x_mha_cross = self.mha_cross([x, y, y], cam)
        x_mha_cross = self.dropout2(x_mha_cross)
        x_out2 = self.layernorm2(x_out1 + x_mha_cross)
        
        x_pffnn = self.pffnn(x_out2)
        x_pffnn = self.dropout2(x_pffnn)
        x_out3 = self.layernorm3(x_out2 + x_pffnn)

        return x_out3

In [90]:
class Decoder(nn.Module):

    def __init__(self, d_model, h, vocab_size_decoder, N, d_hidden, dropout):
        '''
        d_model : Embedding dimension(512)
        h : Number of heads(8)
        N : number of attention layers(6)
        d_k : d_model/h
        seq : Maximum sequence length of English(26)
        vocab_size : Number of English characters in the corpus(30)
        epsilon : used for layer normalization (10e-5)
        pe_decoder : Positional embedding of Decoder input
        '''
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.heads = h
        self.d_k = d_model//h
        self.vocab_size_decoder = vocab_size_decoder
        self.N = N
        self.d_hidden = d_hidden
        self.embedding = nn.Embedding(vocab_size_decoder, d_model, padding_idx = 0)
        self.layers = multipleblocks(Decoderlayer(d_model, h, d_hidden, dropout), N)
        
        assert self.d_model % self.heads == 0


    def forward(self, x, y, sam, cam, pe):
        '''
        x : decoder input
        y : encoder output
        sam : Self Attention Mask
        cam : Cross Attention Mask
        pe : positional embedding of decoder tokens
        
        '''

        x_wv = self.embedding(x)
        x = x_wv + pe
        for i in range(self.N):
            x = self.layers[i](x, y, sam, cam)
        return x



In [91]:
class Transformer(nn.Module):

    def __init__(self, d_model, heads, vocab_size_encoder, vocab_size_decoder, N, d_hidden, dropout):

        super(Transformer, self).__init__()
        self.d_model = d_model
        self.heads = heads
        self.d_k = d_model//heads
        self.N = N
        self.d_hidden = d_hidden
        self.dropout = dropout
        self.vocab_size_encoder = vocab_size_encoder
        self.vocab_size_decoder = vocab_size_decoder
        self.encoder = Encoder(d_model, heads, vocab_size_encoder, N, d_hidden, dropout)
        self.decoder = Decoder(d_model, heads, vocab_size_decoder, N, d_hidden, dropout)

        self.linear = nn.Linear(d_model, vocab_size_decoder)


    def forward(self, x, y, encoder_sam, decoder_sam, decoder_cam, enc_pe, dec_pe):
        '''
        
        x : encoder input data
        y : decoder input data
        encoding : encoder output data
        decoding : decoder output data
        No need to find the softmax/probability values as it will be already taken care by Cross Entropy loss function.
        '''
       
        encoding = self.encoder(x, encoder_sam, enc_pe)
        decoding = self.decoder(y, encoding, decoder_sam, decoder_cam, dec_pe)

        logits = self.linear(decoding)

        return logits


In [92]:
def save_model(model, optimizer, epoch, filepath):
    """
    Save the model state, optimizer state, epoch, and training history to a file.

    Args:
        model (nn.Module): The PyTorch model to be saved.
        optimizer (torch.optim.Optimizer): The optimizer used in training.
        epoch (int): The current epoch number.
        loss_history (list): List of loss values per epoch.
        accuracy_history (list): List of accuracy values per epoch.
        char_history (list): List of character accuracy values per epoch.
        filepath (str): The path to the file where the state will be saved.
    """
    state = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(state, filepath)
    print(f"Model saved to {filepath}")

In [93]:
def load_model(model, optimizer, filepath):
    """
    Load the model state, optimizer state, epoch, and training history from a file.

    Args:
        model (nn.Module): The PyTorch model to be loaded.
        optimizer (torch.optim.Optimizer): The optimizer used in training.
        filepath (str): The path to the file from where the state will be loaded.

    Returns:
        int: The epoch number from which to resume training.
        list: List of loss values per epoch.
        list: List of accuracy values per epoch.
        list: List of character accuracy values per epoch.
    """
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    print(f"Model loaded from {filepath}")
    return epoch, model



In [249]:
'''
Set the value of hyper parameters before training
'''
d_model = 512
d_hidden = 2048
vocab_size_encoder = 30
vocab_size_decoder = 68
N = 6
heads = 8
dropout = 0.1
seq_len_encoder = 26
seq_len_decoder = 22
learning_rate = 0.0001
epsilon = 1e-8
betas = (0.9, 0.98)
epochs = 10

In [250]:
'''
Creating an instance of transformer, optimizer, loss function
'''
model = Transformer(d_model, heads, vocab_size_encoder, vocab_size_decoder, N, d_hidden, dropout)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, betas=betas, eps=epsilon, weight_decay=0.005)
loss = nn.CrossEntropyLoss()

In [251]:
'''
Assigning the transformer object into multiple GPUs
'''
model = model.to(device)
# model = nn.DataParallel(model)
print(model)

Transformer(
  (encoder): Encoder(
    (embedding): Embedding(30, 512, padding_idx=0)
    (layers): ModuleList(
      (0): Encoderlayer(
        (mha): multiheadattention(
          (q): Linear(in_features=512, out_features=512, bias=False)
          (k): Linear(in_features=512, out_features=512, bias=False)
          (v): Linear(in_features=512, out_features=512, bias=False)
          (outputprojectionlayer): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (layernorm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (pffnn): pointwiseffnn(
          (linearlayer1): Linear(in_features=512, out_features=2048, bias=True)
          (linearlayer2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
        )
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (layernorm2): LayerNorm((512,), eps=1e-05, elementwise_affine

In [252]:
def data_loader(x,y,z,batch_size,device=device):
    
    x=x.to(device)
    y=y.to(device)
    z=z.to(device)
    combined=TensorDataset(x,y,z)
    loader=DataLoader(combined,batch_size=batch_size,shuffle=False,drop_last=True)#required in test data
    return loader

In [253]:
'''
Creating data loader
'''
batch_size = 8
train_loader = data_loader(enc_input_data, dec_input_data, dec_output_data, batch_size)
val_loader = data_loader(enc_input_data_val, dec_input_data_val, dec_output_data_val, batch_size)
for i, (x, y, z) in enumerate(train_loader):
    print(x.shape)
    print(y.shape)
    print(z.shape)
    break

torch.Size([8, 26])
torch.Size([8, 22])
torch.Size([8, 22])


In [254]:
def calculate_word_accuracy(dec_predicted_data, dec_output_data):
    
    with torch.no_grad():
        match = (dec_predicted_data == dec_output_data).all(dim=1)
        true_words = match.sum().item()
        batch_size = dec_predicted_data.shape[0]
    
    accuracy = (true_words / batch_size) * 100 
    return accuracy  #Averaged over batch

In [255]:
def calculate_char_accuracy(decoder_predicted_data, decoder_output_data):
   
    batch_size, seq_length = decoder_predicted_data.shape
    
    with torch.no_grad():
        correct_count = (decoder_predicted_data == decoder_output_data).sum().item() 
        return (correct_count / (seq_length * batch_size))*100 #Averaged over batch

In [256]:
enc_pe = positional_embedding(seq_len_encoder, d_model).to(device)
dec_pe = positional_embedding(seq_len_decoder, d_model).to(device)
print(enc_pe.shape)
print(dec_pe.shape)

torch.Size([1, 26, 512])
torch.Size([1, 22, 512])


In [257]:
def test(model, test_loader, heads, batch_size, d_model, enc_pe, dec_pe, total_samples, device):
    '''
    Perform decoder predictions autoregressively
    '''
    model.eval()
    word_acc_test = 0.0
    char_acc_test = 0.0
    predictions = torch.zeros((total_samples, dec_pe.shape[1]+1), device=device)
    predictions[:, 0] = 1.0  # Assuming 1.0 is the index for <SOS>

    word_acc= 0.0
    char_acc = 0.0

    for i, (x, y, z) in enumerate(tqdm(test_loader)):
        x, y, z = x.to(device), y.to(device), z.to(device)
        esm = mask_generator(x, heads, type='encoder_selfattention').to(device)
        encoding = model.encoder(x, esm, enc_pe).to(device)

        for j in range(1, predictions.shape[1]):
            
            dcm = mask_generator(x, heads, j, type='decoder_crossattention').to(device)
            inp = predictions[i * batch_size:(i + 1) * batch_size, :j].to(device)
            preds = model.decoder(inp.long(), encoding, None, None, dec_pe[:, :j])

            preds = model.linear(preds)  # Assuming the linear layer is the output layer
            last_token = preds[:, -1, :]  # Take the last token's predictions
            last_token = F.softmax(last_token, dim=-1)
            last_token = torch.argmax(last_token, dim=-1)
            predictions[i * batch_size:(i + 1) * batch_size, j] = last_token
        
        
        word_acc_val = calculate_word_accuracy(predictions[i*batch_size:(i+1)*batch_size, 1:], z)
        char_acc_val = calculate_char_accuracy(predictions[i*batch_size:(i+1)*batch_size, 1:], z)
        word_acc += word_acc_val
        char_acc += char_acc_val
    word_acc = word_acc/ len(test_loader)
    char_acc = char_acc/ len(test_loader)
    return predictions[:,1:], word_acc, char_acc
            

In [258]:
def train(model, optimizer, criterion, train_loader, val_loader, epochs, heads, batch_size, d_model, 
          enc_pe, dec_pe, device):

    if os.path.exists('model_checkpoint.pth'):
        _, model = load_model(model, optimizer, 'model_checkpoint.pth')
        
    loss_history = []
    train_word_acc_history = []
    train_char_acc_history = []
    val_word_acc_history = []
    val_char_acc_history = []

    for epoch in range(epochs):
        
        model.train()
        epoch_loss = 0.0
        epoch_word_acc_train = 0.0
        epoch_char_acc_train = 0.0
        
        for i, (x, y, z) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")):
            x, y, z = x.to(device), y.to(device), z.to(device)

            esm = mask_generator(x, heads, type='encoder_selfattention').to(device)
            dsm = mask_generator(y, heads, type='decoder_selfattention').to(device)
            dcm = mask_generator(x, heads, y.shape[1], type='decoder_crossattention').to(device)

            preds = model(x, y, esm, dsm, dcm, enc_pe, dec_pe)
           
            optimizer.zero_grad()
            loss = criterion(preds.contiguous().view(-1, preds.shape[-1]), z.contiguous().view(-1).long())
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            
            preds = torch.argmax(F.softmax(preds, dim=-1), dim=-1)

            word_acc_train = calculate_word_accuracy(preds, z)
            char_acc_train = calculate_char_accuracy(preds, z)
         
            epoch_word_acc_train += word_acc_train
            epoch_char_acc_train += char_acc_train

        loss_history.append(epoch_loss / len(train_loader)) #Average batch loss
        train_word_acc_history.append(epoch_word_acc_train / len(train_loader))
        train_char_acc_history.append(epoch_char_acc_train / len(train_loader))

        '''Validation'''
        
        model.eval()
        with torch.no_grad():
            
            predictions, epoch_word_acc_val, epoch_char_acc_val = test(model, val_loader, heads, batch_size, 
                                                               d_model, enc_pe, dec_pe, total_samples = 4096, device = device)

        val_word_acc_history.append(epoch_word_acc_val)
        val_char_acc_history.append(epoch_char_acc_val)
            
        print(f"Epoch: {epoch+1}, Loss: {epoch_loss / (i + 1):.4f}, Train Word Accuracy: {epoch_word_acc_train / (i + 1):.4f}, Train Char Accuracy: {epoch_char_acc_train / (i + 1):.4f}, Val Word Accuracy: {epoch_word_acc_val:.4f}, Val Char Accuracy: {epoch_char_acc_val:.4f}")
        
        save_model(model, optimizer, epoch, 'model_checkpoint.pth')
    
    return loss_history, train_word_acc_history, train_char_acc_history, val_word_acc_history, val_char_acc_history, predictions, preds
            

In [259]:
 loss_history, train_word_acc_history, train_char_acc_history, val_word_acc_history, val_char_acc_history, test_predictions, preds = train(model, 
                optimizer, loss, train_loader, val_loader, epochs, heads, batch_size, d_model, enc_pe, dec_pe, device = device)

Epoch 1/10: 100%|███████████████████████████████████████████████████████████████████| 6400/6400 [03:55<00:00, 27.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 512/512 [00:48<00:00, 10.45it/s]


Epoch: 1, Loss: 0.3241, Train Word Accuracy: 15.4004, Train Char Accuracy: 89.6737, Val Word Accuracy: 9.0576, Val Char Accuracy: 76.1819
Model saved to model_checkpoint.pth


Epoch 2/10: 100%|███████████████████████████████████████████████████████████████████| 6400/6400 [03:50<00:00, 27.74it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 512/512 [00:50<00:00, 10.05it/s]


Epoch: 2, Loss: 0.1775, Train Word Accuracy: 26.7422, Train Char Accuracy: 93.8968, Val Word Accuracy: 9.9854, Val Char Accuracy: 79.5521
Model saved to model_checkpoint.pth


Epoch 3/10: 100%|███████████████████████████████████████████████████████████████████| 6400/6400 [03:49<00:00, 27.86it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 512/512 [00:48<00:00, 10.63it/s]


Epoch: 3, Loss: 0.1546, Train Word Accuracy: 30.8516, Train Char Accuracy: 94.6597, Val Word Accuracy: 15.9424, Val Char Accuracy: 81.5063
Model saved to model_checkpoint.pth


Epoch 4/10: 100%|███████████████████████████████████████████████████████████████████| 6400/6400 [03:47<00:00, 28.17it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 512/512 [00:48<00:00, 10.49it/s]


Epoch: 4, Loss: 0.1428, Train Word Accuracy: 33.8770, Train Char Accuracy: 95.0737, Val Word Accuracy: 13.9893, Val Char Accuracy: 79.5443
Model saved to model_checkpoint.pth


Epoch 5/10: 100%|███████████████████████████████████████████████████████████████████| 6400/6400 [03:50<00:00, 27.78it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 512/512 [00:51<00:00,  9.93it/s]


Epoch: 5, Loss: 0.1343, Train Word Accuracy: 35.8438, Train Char Accuracy: 95.3427, Val Word Accuracy: 16.3574, Val Char Accuracy: 81.3310
Model saved to model_checkpoint.pth


Epoch 6/10: 100%|███████████████████████████████████████████████████████████████████| 6400/6400 [03:49<00:00, 27.83it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 512/512 [00:49<00:00, 10.38it/s]


Epoch: 6, Loss: 0.1287, Train Word Accuracy: 37.5020, Train Char Accuracy: 95.5445, Val Word Accuracy: 16.3574, Val Char Accuracy: 82.0634
Model saved to model_checkpoint.pth


Epoch 7/10: 100%|███████████████████████████████████████████████████████████████████| 6400/6400 [03:50<00:00, 27.79it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 512/512 [00:49<00:00, 10.34it/s]


Epoch: 7, Loss: 0.1235, Train Word Accuracy: 38.9805, Train Char Accuracy: 95.7157, Val Word Accuracy: 17.2852, Val Char Accuracy: 81.8703
Model saved to model_checkpoint.pth


Epoch 8/10: 100%|███████████████████████████████████████████████████████████████████| 6400/6400 [03:50<00:00, 27.81it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 512/512 [00:49<00:00, 10.41it/s]


Epoch: 8, Loss: 0.1194, Train Word Accuracy: 39.9648, Train Char Accuracy: 95.8481, Val Word Accuracy: 13.4766, Val Char Accuracy: 81.3354
Model saved to model_checkpoint.pth


Epoch 9/10: 100%|███████████████████████████████████████████████████████████████████| 6400/6400 [03:50<00:00, 27.76it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 512/512 [00:51<00:00, 10.00it/s]


Epoch: 9, Loss: 0.1156, Train Word Accuracy: 41.5312, Train Char Accuracy: 95.9930, Val Word Accuracy: 11.8408, Val Char Accuracy: 81.1668
Model saved to model_checkpoint.pth


Epoch 10/10: 100%|██████████████████████████████████████████████████████████████████| 6400/6400 [03:51<00:00, 27.59it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 512/512 [00:52<00:00,  9.71it/s]


Epoch: 10, Loss: 0.1124, Train Word Accuracy: 42.3457, Train Char Accuracy: 96.0977, Val Word Accuracy: 15.1855, Val Char Accuracy: 82.9790
Model saved to model_checkpoint.pth


In [219]:
test_predictions[0:5]

tensor([[26., 63., 51., 55., 48., 55., 46.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [41., 26., 55., 10.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [51.,  5., 22., 30., 38., 55.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [52., 55., 10., 48., 55., 38.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [38., 57., 46., 21., 56., 31., 53., 57.,  2.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], device='cuda:0')

In [220]:
dec_output_data[:5]

tensor([[49., 51., 67., 34., 67., 45., 55., 21., 55., 45.,  2.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [41., 56., 38., 67., 36., 67., 44., 55.,  2.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [19., 56., 45., 33., 19., 55.,  5., 34.,  2.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [44., 26., 67., 28., 65., 39., 48., 57., 34.,  2.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [45., 29., 55., 38., 56., 44., 55.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [None]:
dec_output_data.shape

In [164]:
  _, model = load_model(model, optimizer, 'model_checkpoint.pth')

Model loaded from model_checkpoint.pth


In [179]:
predictions, epoch_word_acc_val, epoch_char_acc_val = test(model, val_loader, heads, batch_size, 
                                                               d_model, enc_pe, dec_pe, total_samples = 4096, device = device)

100%|█████████████████████████████████████████████████████████████████████████████████| 512/512 [00:42<00:00, 12.07it/s]


In [180]:
epoch_word_acc_val

20.80078125

In [181]:
epoch_char_acc_val

80.26899857954547