In [1]:
'''This jupyter notebook contains:
1. Data preprocessing
2. Transformer Encoder
3. Transformer Decoder
4. Training
5. Evaluation'''

'This jupyter notebook contains:\n1. Data preprocessing\n2. Transformer Encoder\n3. Transformer Decoder\n4. Training\n5. Evaluation'

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import sys
import random
from torch.utils.data import TensorDataset
from tqdm import tqdm
import copy
import math
import time

In [3]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
'''function to load the data'''
def load_data(path,language_names):
    df=pd.read_csv(path,header=None)
    df.columns=language_names
    return df

In [5]:
'''Here,basically,the input is given to the encoder in English language and is transliterated to Hindi
by the decoder'''
path_train="hin_train.csv"
language_names = ['English','transliteration_in_hindi']
df_train=load_data(path_train,language_names)
print(df_train.shape)
df_train

(51200, 2)


Unnamed: 0,English,transliteration_in_hindi
0,shastragaar,शस्त्रागार
1,bindhya,बिन्द्या
2,kirankant,किरणकांत
3,yagyopaveet,यज्ञोपवीत
4,ratania,रटानिया
...,...,...
51195,toned,टोंड
51196,mutanaazaa,मुतनाज़ा
51197,asahmaton,असहमतों
51198,sulgaayin,सुलगायीं


In [6]:
path_test="hin_test.csv"
path_validation="hin_valid.csv"
df_validation=load_data(path_validation,language_names)
print('Validation data:',df_validation.shape)
df_test=load_data(path_test,language_names)
print('Test data:', df_test.shape)

Validation data: (4096, 2)
Test data: (4096, 2)


In [7]:
'''Function for acquiring all the characters of the given data'''
def split_words(x):
    x=np.array(x)
    alpha=['_','\t','\n',' '] #pad token, start of word, end of word and unknown tokens respectively
    b=[]
    for i in range(x.shape[0]):
        a=list(x[i])
        for j in range(len(a)):
            if a[j] not in b:
                b.append(a[j])
    b=sorted(b)
    alpha=alpha+b
    return alpha

In [8]:
'''All the english characters are stored into the list english_vocab and all the hindi characters are
stored into the list hindi_vocab'''
english_vocab=split_words(df_train['English'])
hindi_vocab=split_words(df_train['transliteration_in_hindi'])
print('English vocabulary size:', len(english_vocab))
print('Hindi vocabulary size:', len(hindi_vocab))
print(english_vocab)
print(hindi_vocab)

English vocabulary size: 30
Hindi vocabulary size: 68
['_', '\t', '\n', ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['_', '\t', '\n', ' ', 'ँ', 'ं', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ऑ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'ळ', 'व', 'श', 'ष', 'स', 'ह', '़', 'ऽ', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॅ', 'े', 'ै', 'ॉ', 'ो', 'ौ', '्']


In [9]:
'''Functions to create the vocabulary dictionaries with their corresponding indices'''
def int_to_char(vocab):
    int2char={} #padding token, start of word, end of word token and unknown token
    for i in range(len(vocab)):
        int2char[i]=vocab[i]
    return int2char

In [10]:
int2char_eng=int_to_char(english_vocab)
print(int2char_eng)

{0: '_', 1: '\t', 2: '\n', 3: ' ', 4: 'a', 5: 'b', 6: 'c', 7: 'd', 8: 'e', 9: 'f', 10: 'g', 11: 'h', 12: 'i', 13: 'j', 14: 'k', 15: 'l', 16: 'm', 17: 'n', 18: 'o', 19: 'p', 20: 'q', 21: 'r', 22: 's', 23: 't', 24: 'u', 25: 'v', 26: 'w', 27: 'x', 28: 'y', 29: 'z'}


In [11]:
def char_to_int(int2char):
    char2int={ch:ii for ii,ch in int2char.items()}
    return char2int

In [12]:
char2int_eng=char_to_int(int2char_eng)
print(char2int_eng)

{'_': 0, '\t': 1, '\n': 2, ' ': 3, 'a': 4, 'b': 5, 'c': 6, 'd': 7, 'e': 8, 'f': 9, 'g': 10, 'h': 11, 'i': 12, 'j': 13, 'k': 14, 'l': 15, 'm': 16, 'n': 17, 'o': 18, 'p': 19, 'q': 20, 'r': 21, 's': 22, 't': 23, 'u': 24, 'v': 25, 'w': 26, 'x': 27, 'y': 28, 'z': 29}


In [13]:
int2char_hin=int_to_char(hindi_vocab)
print(int2char_hin)

{0: '_', 1: '\t', 2: '\n', 3: ' ', 4: 'ँ', 5: 'ं', 6: 'ः', 7: 'अ', 8: 'आ', 9: 'इ', 10: 'ई', 11: 'उ', 12: 'ऊ', 13: 'ऋ', 14: 'ए', 15: 'ऐ', 16: 'ऑ', 17: 'ओ', 18: 'औ', 19: 'क', 20: 'ख', 21: 'ग', 22: 'घ', 23: 'ङ', 24: 'च', 25: 'छ', 26: 'ज', 27: 'झ', 28: 'ञ', 29: 'ट', 30: 'ठ', 31: 'ड', 32: 'ढ', 33: 'ण', 34: 'त', 35: 'थ', 36: 'द', 37: 'ध', 38: 'न', 39: 'प', 40: 'फ', 41: 'ब', 42: 'भ', 43: 'म', 44: 'य', 45: 'र', 46: 'ल', 47: 'ळ', 48: 'व', 49: 'श', 50: 'ष', 51: 'स', 52: 'ह', 53: '़', 54: 'ऽ', 55: 'ा', 56: 'ि', 57: 'ी', 58: 'ु', 59: 'ू', 60: 'ृ', 61: 'ॅ', 62: 'े', 63: 'ै', 64: 'ॉ', 65: 'ो', 66: 'ौ', 67: '्'}


In [14]:
char2int_hin=char_to_int(int2char_hin)
print(char2int_hin)

{'_': 0, '\t': 1, '\n': 2, ' ': 3, 'ँ': 4, 'ं': 5, 'ः': 6, 'अ': 7, 'आ': 8, 'इ': 9, 'ई': 10, 'उ': 11, 'ऊ': 12, 'ऋ': 13, 'ए': 14, 'ऐ': 15, 'ऑ': 16, 'ओ': 17, 'औ': 18, 'क': 19, 'ख': 20, 'ग': 21, 'घ': 22, 'ङ': 23, 'च': 24, 'छ': 25, 'ज': 26, 'झ': 27, 'ञ': 28, 'ट': 29, 'ठ': 30, 'ड': 31, 'ढ': 32, 'ण': 33, 'त': 34, 'थ': 35, 'द': 36, 'ध': 37, 'न': 38, 'प': 39, 'फ': 40, 'ब': 41, 'भ': 42, 'म': 43, 'य': 44, 'र': 45, 'ल': 46, 'ळ': 47, 'व': 48, 'श': 49, 'ष': 50, 'स': 51, 'ह': 52, '़': 53, 'ऽ': 54, 'ा': 55, 'ि': 56, 'ी': 57, 'ु': 58, 'ू': 59, 'ृ': 60, 'ॅ': 61, 'े': 62, 'ै': 63, 'ॉ': 64, 'ो': 65, 'ौ': 66, '्': 67}


In [15]:
'''Finding the maximum sequence length'''
length_eng=[len(i) for i in df_train['English']]
length_hin=[len(i) for i in df_train['transliteration_in_hindi']]

In [16]:
length_eng_max=max(length_eng)+2 #we have to account for the start and end token
print(f'The maximum sequence length of English words is {length_eng_max}')
length_hin_max=max(length_hin)+2
print(f'The maximum sequence length of transliterated words is {length_hin_max}')

The maximum sequence length of English words is 26
The maximum sequence length of transliterated words is 22


In [17]:
def process_data(df,english_vocab=english_vocab,hindi_vocab=hindi_vocab,
                 length_eng_max=length_eng_max,length_hin_max=length_hin_max,char2int_eng=char2int_eng
                 ,char2int_hin=char2int_hin):
    
    '''removing words of length more than max length'''
    df['English'] = df['English'].str.lower()
    df['transliteration_in_hindi'] = df['transliteration_in_hindi'].str.lower()
    df = df[df['English'].apply(len) <= length_eng_max-2]
    df = df[df['transliteration_in_hindi'].apply(len) <= length_hin_max-2]
    
    '''Adding start and end of word tokens'''
    y_og = df['transliteration_in_hindi'].values #The data type of y_og will be numpy array
    x_og = df['English'].values
    x = '\t'+x_og+'\n'
    y = '\t'+y_og+'\n'
    y_do=y_og+'\n' #This is for the decoder output
    unknown=3
    pad=0
    pad_char='_'
    unknown_char=' '
    start=1
    end=2
    
    enc_input_data=torch.zeros(len(x),length_eng_max)
    dec_input_data=torch.zeros(len(y),length_hin_max)
    dec_output_data=torch.zeros(len(y),length_hin_max)
    for i, (xx,yy) in enumerate(zip(x,y)):
        for j,char in enumerate(xx):
            enc_input_data[i,j]=char2int_eng[char]
            
        #pad character is zero so no need of assigning it again
        for j,char in enumerate(yy):
            if char in hindi_vocab:
                dec_input_data[i,j]=char2int_hin[char]
            else:
                dec_input_data[i,j]=char2int_hin[unknown_char] #There are chances that unknown char would come in the test data
    
    for i, (xx,yy) in enumerate(zip(x,y_do)):
        for j,char in enumerate(yy):
            if char in hindi_vocab:
                dec_output_data[i,j]=char2int_hin[char]
            else:
                dec_input_data[i,j]=char2int_hin[unknown_char]
                
    return enc_input_data,dec_input_data,dec_output_data

In [18]:
enc_input_data,dec_input_data,dec_output_data=process_data(df_train)

In [19]:
print(enc_input_data.shape)
print(dec_input_data.shape)
print(dec_output_data.shape)

torch.Size([51200, 26])
torch.Size([51200, 22])
torch.Size([51200, 22])


In [20]:
len(english_vocab)

30

In [21]:
def one_hot_encoding(df,english_vocab=english_vocab,hindi_vocab=hindi_vocab,
                 length_eng_max=length_eng_max,length_hin_max=length_hin_max,char2int_eng=char2int_eng
                 ,char2int_hin=char2int_hin):
    
    
    '''removing words of length more than max length'''
    df = df[df['English'].apply(len) <= length_eng_max-2]
    df = df[df['transliteration_in_hindi'].apply(len) <= length_hin_max-2]
    '''Adding start and end of word tokens'''
    y = df['transliteration_in_hindi'].values
    x= df['English'].values
    x = '\t'+x+'\n'
    y = '\t'+y+'\n'
    
    unknown=3
    pad=0
    pad_char='_'
    unknown_char=' '
    start=1
    end=2
    num_english_tokens = len(english_vocab)
    num_hindi_tokens = len(hindi_vocab)
    
    encoder_input_data = np.zeros(
    (len(df['English']), length_eng_max, num_english_tokens), dtype="float32")
    decoder_input_data = np.zeros(
    (len(df['transliteration_in_hindi']), length_hin_max, num_hindi_tokens), dtype="float32")
    decoder_output_data = np.zeros(
    (len(df['transliteration_in_hindi']), length_hin_max, num_hindi_tokens), dtype="float32")
   
    for i , (input_text,target_text) in enumerate(zip(x,y)):
        for t,char in enumerate(input_text):
            encoder_input_data[i,t,char2int_eng[char]]=1
        encoder_input_data[i,t+1:,char2int_eng[pad_char]]=1
    
        for t,char in enumerate(target_text):
            if char in hindi_vocab:
                decoder_input_data[i,t,char2int_hin[char]]=1
            else:
                decoder_input_data[i,t,char2int_hin[unknown_char]]=1
        decoder_input_data[i,t+1:,char2int_hin[pad_char]]=1
    
        '''decoder target data is one step ahead of decoder input data by one timestep
        and doesnot includes start token'''
        for t,char in enumerate(target_text):
            if t>0:
                if char in hindi_vocab:
                    decoder_output_data[i,t-1,char2int_hin[char]]=1
                else:
                    decoder_output_data[i,t-1,char2int_hin[unknown_char]]=1
                
        decoder_output_data[i,t:,char2int_hin[pad_char]]=1
    
    return torch.tensor(encoder_input_data),torch.tensor(decoder_input_data),torch.tensor(decoder_output_data)
    
    

In [22]:
encoder_input_data,decoder_input_data,decoder_output_data=one_hot_encoding(df_train)

In [23]:
print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_output_data.shape)

torch.Size([51200, 26, 30])
torch.Size([51200, 22, 68])
torch.Size([51200, 22, 68])


In [24]:
enc_input_data_test,dec_input_data_test,dec_output_data_test=process_data(df_test)
enc_input_data_val,dec_input_data_val,dec_output_data_val=process_data(df_validation)
encoder_input_data_test,decoder_input_data_test,decoder_output_data_test=one_hot_encoding(df_test)
encoder_input_data_val,decoder_input_data_val,decoder_output_data_val=one_hot_encoding(df_validation)

In [25]:
print(encoder_input_data_val.shape)
print(decoder_input_data_val.shape)
print(decoder_output_data_val.shape)

torch.Size([4096, 26, 30])
torch.Size([4096, 22, 68])
torch.Size([4096, 22, 68])


In [26]:
print(torch.argmax(decoder_input_data,dim=-1))
print(dec_input_data)

tensor([[ 1, 49, 51,  ...,  0,  0,  0],
        [ 1, 41, 56,  ...,  0,  0,  0],
        [ 1, 19, 56,  ...,  0,  0,  0],
        ...,
        [ 1,  7, 51,  ...,  0,  0,  0],
        [ 1, 51, 58,  ...,  0,  0,  0],
        [ 1,  7,  5,  ...,  0,  0,  0]])
tensor([[ 1., 49., 51.,  ...,  0.,  0.,  0.],
        [ 1., 41., 56.,  ...,  0.,  0.,  0.],
        [ 1., 19., 56.,  ...,  0.,  0.,  0.],
        ...,
        [ 1.,  7., 51.,  ...,  0.,  0.,  0.],
        [ 1., 51., 58.,  ...,  0.,  0.,  0.],
        [ 1.,  7.,  5.,  ...,  0.,  0.,  0.]])


In [27]:
enc_input_data=enc_input_data.long()
dec_input_data=dec_input_data.long()
enc_input_data_test=enc_input_data_test.long()
dec_input_data_test=dec_input_data_test.long()
enc_input_data_val=enc_input_data_val.long()
dec_input_data_val=dec_input_data_val.long()
encoder_input_data=encoder_input_data.long()
decoder_input_data=decoder_input_data.long()
decoder_output_data=decoder_output_data.long()

In [28]:
def data_loader(x,y,z,batch_size,device=device):
    
    x=x.to(device)
    y=y.to(device)
    z=z.to(device)
    combined=TensorDataset(x,y,z)
    loader=DataLoader(combined,batch_size=batch_size,shuffle=False,drop_last=True)#required in test data
    return loader

In [29]:
'''
Here, the implementation is based on the paper: Attention is all you need
The original base attention transformer has the following structure:

Main dimension of model-embeddings (d_model): 512
Number of attention heads: 8
Number of encoder layers: 6
Number of decoder layers: 6
Hidden dimension of feed-forward layers: 2048
Dropout probability: 0.1
'''

'\nHere, the implementation is based on the paper: Attention is all you need\nThe original base attention transformer has the following structure:\n\nMain dimension of model-embeddings (d_model): 512\nNumber of attention heads: 8\nNumber of encoder layers: 6\nNumber of decoder layers: 6\nHidden dimension of feed-forward layers: 2048\nDropout probability: 0.1\n'

In [30]:
def mask_generator(x, heads, seq_len_dec = None, type = 'encoder_selfattention'):

    '''
    Here, the output will be of the format (bs, heads, seq_len, seq_len), which can be used for masking in the multihead attention block
    before applying softmax function
    types : Encoder self attention(bs, seq_len, enc_seqlen, enc_seqlen)
            Decoder self attention(bs, seq_len, dec_seqlen, dec_seqlen)
            Decoder cross attention(bs, seq_len, dec_seqlen, enc_seqlen)
    '''
    
    if type == 'encoder_selfattention':
        batch_size = x.shape[0]
        seq_len = x.shape[1]
        pad_idx = 0
        mask = (x == pad_idx)
        '''
        Expand the mask for attention (batch_size, num_heads, seq_len, seq_len); after doing the operation q.kT the shape will be this
        '''
        mask = mask.unsqueeze(1).unsqueeze(2).expand(batch_size, heads, seq_len, seq_len).detach().to(device)
        
    elif type == 'decoder_selfattention':
        '''Here, this masking is used in decoder to avoid the present tokens seeing the future ones'''
        batch_size = x.shape[0]
        seq_len = x.shape[1]
        mask = torch.triu(torch.ones(seq_len,seq_len)*float('-inf'), diagonal = 1).bool().expand(batch_size, heads, seq_len, seq_len).detach().to(device)

    elif type  == 'decoder_crossattention':
        '''Here, we use the padding_mask of encoder and the sequence length should be that of encoder
        output : (batch size, heads, decoder sequence length, encoder sequence langth'''
        batch_size = x.shape[0]
        seq_len_enc = x.shape[1]
        pad_idx = 0
        mask = (x == pad_idx)
        mask = mask.unsqueeze(1).unsqueeze(2).expand(batch_size, heads, seq_len_dec, seq_len_enc).detach().to(device)
        
    return mask


In [31]:
def positional_embedding(batch_size, seq_length, d_model, device):
    '''
    x : (batch_size, seq_length)
    embed : (batch_size, seq_length, d_model)
    Gradient won't flow through positional embedding
    '''
    embed = torch.zeros((batch_size, seq_length, d_model), device=device, requires_grad=False)
    
    pos = torch.arange(seq_length, dtype=torch.float, device=device).unsqueeze(1)
    div = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float, device=device) * (-math.log(10000.0) / d_model))
    
    embed[:, :, 0::2] = torch.sin(pos * div)
    embed[:, :, 1::2] = torch.cos(pos * div)

    return embed



In [32]:
class multiheadattention(nn.Module):
    
    def __init__(self,d_model, h, dropout):
        
        super(multiheadattention, self).__init__()
        self.d_model = d_model
        self.heads = h
        self.d_k = d_model//h
        self.q = nn.Linear(d_model,d_model, bias = False)
        self.k = nn.Linear(d_model, d_model, bias = False)
        self.v = nn.Linear(d_model, d_model, bias = False)
        self.outputprojectionlayer = nn.Linear(d_model,d_model)
        self.dropout = nn.Dropout(dropout)


    def forward(self,w, mask):
        '''x: Input data after performing positional and word embeddings and adding both
           att_scores : output of same shape
           d_model shpuld be divisible by number of heads
        '''
        q = self.q(w[0])
        k = self.k(w[1])
        v = self.v(w[2])

        bs = q.shape[0]
        seq_len1 = w[0].shape[1]
        seq_len2 = w[1].shape[1]
        q = q.view(bs, seq_len1, self.heads, self.d_k).transpose(1, 2)
        k = k.view(bs, seq_len2, self.heads, self.d_k).transpose(1, 2)
        v = v.view(bs, seq_len2, self.heads, self.d_k).transpose(1, 2)

        att_scores = torch.matmul(q,k.transpose(-2,-1))/math.sqrt(self.d_k)
        
        if mask is not None:
            att_scores = att_scores.masked_fill(mask,float('-inf'))
            
        att_scores = F.softmax(att_scores, dim = -1)
        #att_scores = self.dropout(att_scores)
        att = torch.matmul(att_scores, v)
        
        #Concatenating the heads to form the shape of (bs, seq_len, d_model)
        att = att.transpose(1, 2).contiguous().view(bs, seq_len1, self.d_model)
        
        output = self.outputprojectionlayer(att)
        
        return output
        
        
        

In [33]:
class pointwiseffnn(nn.Module):

    def __init__(self, d_model, d_hidden):
        super(pointwiseffnn, self).__init__()
        self.d_hidden = d_hidden
        self.d_model = d_model
        self.linearlayer1 = nn.Linear(d_model, d_hidden)
        self.linearlayer2 = nn.Linear(d_hidden, d_model)
        self.relu = nn.ReLU()
        

    def forward(self, x):
        '''
        This is the pointwise feedforward layer after attention block, which consists of 2 linear layers, a relu
        '''
        x = self.linearlayer1(x)
        x = self.relu(x)
        x = self.linearlayer2(x)
        return x

In [34]:
class Encoderlayer(nn.Module):

    def __init__(self, d_model, h, d_hidden, dropout):
        '''
        d_model : Embedding dimension(512)
        h : Number of heads(8)
        N : number of attention layers(6)
        d_k : d_model/h
        seq : Maximum sequence length of English(26)
        vocab_size : Number of English characters in the corpus(30)
        epsilon : used for layer normalization (10e-5)
        '''
        super(Encoderlayer, self).__init__()
        self.d_model = d_model
        self.heads = h
        self.d_k = d_model//h
        self.d_hidden = d_hidden
        self.mha = multiheadattention(d_model, h, dropout)
        self.layernorm1 = nn.LayerNorm(self.d_model)
        self.pffnn = pointwiseffnn(d_model, d_hidden)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layernorm2 = nn.LayerNorm(self.d_model)


    def forward(self, x, sam):
            
        x_mha = self.mha([x, x, x], sam)
        x_mha = self.dropout1(x_mha)
        x_out1 = self.layernorm1(x + x_mha)
        
        x_pffnn = self.pffnn(x_out1)
        x_pffnn = self.dropout2(x_pffnn)
        x_out2 = self.layernorm2(x_out1 + x_pffnn)

        return x_out2


In [35]:
'''This function can be used to create multiple encoder blocks with different weights, here we need 6 encoder blocks'''
def multipleblocks(block, N):
    
    return nn.ModuleList([copy.deepcopy(block) for i in range(N)])
        

In [36]:
class Encoder(nn.Module):

    def __init__(self, d_model, h, vocab_size_encoder, N, d_hidden, dropout):
        '''
        d_model : Embedding dimension(512)
        h : Number of heads(8)
        N : number of attention layers(6)
        d_k : d_model/h
        seq : Maximum sequence length of English(26)
        vocab_size : Number of English characters in the corpus(30)
        epsilon : used for layer normalization (10e-5)
        pe_encoder : Positional embedding of Encoder input
        
        '''
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.heads = h
        self.d_k = d_model//h
        self.vocab_size_encoder = vocab_size_encoder
        self.N = N
        self.d_hidden = d_hidden
        self.embedding = nn.Embedding(vocab_size_encoder, d_model, padding_idx = 0)
        self.layers = multipleblocks(Encoderlayer(d_model, h, d_hidden, dropout), N)
        
        assert self.d_model % self.heads == 0


    def forward(self, x, sam, pe):
        '''
        x : Encoder input embedded
        sam : self attention padding mask
        '''

        x_wv = self.embedding(x)
        x = x_wv + pe
        for i in range(self.N):
            x = self.layers[i](x, sam)
        return x



In [37]:
class Decoderlayer(nn.Module):

    def __init__(self, d_model, h, d_hidden, dropout):
       
        '''
        d_model : Embedding dimension(512)
        h : Number of heads(8)
        N : number of attention layers(6)
        d_k : d_model/h
        seq : Maximum sequence length of English(26)
        vocab_size : Number of English characters in the corpus(30)
        epsilon : used for layer normalization (10e-5)
        '''
        super(Decoderlayer, self).__init__()
        self.d_model = d_model
        self.heads = h
        self.d_hidden = d_hidden
        self.mha_self = multiheadattention(d_model, h, dropout)
        self.mha_cross = multiheadattention(d_model, h, dropout)
        self.layernorm1 = nn.LayerNorm(self.d_model)
        self.pffnn = pointwiseffnn(d_model, d_hidden)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.layernorm2 = nn.LayerNorm(self.d_model)
        self.layernorm3 = nn.LayerNorm(self.d_model)


    def forward(self, x, y, sam, cam):
        '''
        x : embedded decoder input
        y : encoder output
        sam : Self Attention mask
        cam : cross attention mask
        
        '''
            
        x_mha_self = self.mha_self([x, x, x], sam)
        x_mha_self = self.dropout1(x_mha_self)
        x_out1 = self.layernorm1(x + x_mha_self)
        
        x_mha_cross = self.mha_cross([x, y, y], cam)
        x_mha_cross = self.dropout2(x_mha_self)
        x_out2 = self.layernorm2(x_out1 + x_mha_cross)
        
        x_pffnn = self.pffnn(x_out2)
        x_pffnn = self.dropout2(x_pffnn)
        x_out3 = self.layernorm3(x_out2 + x_pffnn)

        return x_out3

In [38]:
class Decoder(nn.Module):

    def __init__(self, d_model, h, vocab_size_decoder, N, d_hidden, dropout):
        '''
        d_model : Embedding dimension(512)
        h : Number of heads(8)
        N : number of attention layers(6)
        d_k : d_model/h
        seq : Maximum sequence length of English(26)
        vocab_size : Number of English characters in the corpus(30)
        epsilon : used for layer normalization (10e-5)
        pe_decoder : Positional embedding of Decoder input
        '''
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.heads = h
        self.d_k = d_model//h
        self.vocab_size_decoder = vocab_size_decoder
        self.N = N
        self.d_hidden = d_hidden
        self.embedding = nn.Embedding(vocab_size_decoder, d_model, padding_idx = 0)
        self.layers = multipleblocks(Decoderlayer(d_model, h, d_hidden, dropout), N)
        
        assert self.d_model % self.heads == 0


    def forward(self, x, y, sam, cam, pe):
        '''
        x : decoder input
        y : encoder output
        sam : Self Attention Mask
        cam : Cross Attention Mask
        pe : positional embedding of decoder tokens
        
        '''

        x_wv = self.embedding(x)
        x = x_wv + pe
        for i in range(self.N):
            x = self.layers[i](x, y, sam, cam)
        return x



In [39]:
class Transformer(nn.Module):

    def __init__(self, d_model, heads, vocab_size_encoder, vocab_size_decoder, N, d_hidden, dropout):

        super(Transformer, self).__init__()
        self.d_model = d_model
        self.heads = heads
        self.d_k = d_model//heads
        self.N = N
        self.d_hidden = d_hidden
        self.dropout = dropout
        self.vocab_size_encoder = vocab_size_encoder
        self.vocab_size_decoder = vocab_size_decoder
        self.encoder = Encoder(d_model, heads, vocab_size_encoder, N, d_hidden, dropout)
        self.decoder = Decoder(d_model, heads, vocab_size_decoder, N, d_hidden, dropout)

        self.linear = nn.Linear(d_model, vocab_size_decoder)


    def forward(self, x, y, encoder_sam, decoder_sam, decoder_cam, enc_pe, dec_pe):
        '''
        x : encoder input data
        y : decoder input data
        encoding : encoder output data
        decoding : decoder output data
        No need to find the softmax/probability values as it will be already taken care by Cross Entropy loss function.
        '''

        encoding = self.encoder(x, encoder_sam, enc_pe)
        decoding = self.decoder(y, encoding, decoder_sam, decoder_cam, dec_pe)

        logits = self.linear(decoding)

        return logits


In [40]:
'''
Set the value of hyper parameters before training
'''
d_model = 512
d_hidden = 2048
vocab_size_encoder = 30
vocab_size_decoder = 68
N = 6
heads = 8
dropout = 0.1
seq_len_encoder = 26
seq_len_decoder = 22
learning_rate = 0.0001
epsilon = 1e-9
betas = (0.9, 0.98)
epochs = 100

In [54]:
'''
Creating an instance of transformer, optimizer, loss function
'''
model = Transformer(d_model, heads, vocab_size_encoder, vocab_size_decoder, N, d_hidden, dropout)

optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate, betas = betas, eps = epsilon)
loss = nn.CrossEntropyLoss()

In [55]:
'''
Assigning the transformer object into multiple GPUs
'''
model = model.to(device)
model = nn.DataParallel(model)
print(model)

DataParallel(
  (module): Transformer(
    (encoder): Encoder(
      (embedding): Embedding(30, 512, padding_idx=0)
      (layers): ModuleList(
        (0): Encoderlayer(
          (mha): multiheadattention(
            (q): Linear(in_features=512, out_features=512, bias=False)
            (k): Linear(in_features=512, out_features=512, bias=False)
            (v): Linear(in_features=512, out_features=512, bias=False)
            (outputprojectionlayer): Linear(in_features=512, out_features=512, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (layernorm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (pffnn): pointwiseffnn(
            (linearlayer1): Linear(in_features=512, out_features=2048, bias=True)
            (linearlayer2): Linear(in_features=2048, out_features=512, bias=True)
            (relu): ReLU()
          )
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
     

In [56]:
'''
Creating data loader
'''
batch_size = 8
train_loader = data_loader(enc_input_data, dec_input_data, dec_output_data, batch_size)
for i, (x, y, z) in enumerate(train_loader):
    print(x.shape)
    print(y.shape)
    print(z.shape)
    break

torch.Size([8, 26])
torch.Size([8, 22])
torch.Size([8, 22])


In [57]:
def calculate_word_accuracy(dec_predicted_data, dec_output_data):
    
    with torch.no_grad():
        match = (dec_predicted_data == dec_output_data).all(dim=1)
        true_words = match.sum().item()
        batch_size = dec_predicted_data.shape[0]
    
    accuracy = (true_words / batch_size) * 100 #Averaged over batch
    return accuracy


In [58]:
def calculate_char_accuracy(decoder_predicted_data, decoder_output_data):
   
    batch_size, seq_length = decoder_predicted_data.shape
    
    with torch.no_grad():
        correct_count = (decoder_predicted_data == decoder_output_data).sum().item() #Averaged over batch
        return (correct_count / (seq_length * batch_size))*100

In [59]:
enc_pe = positional_embedding(batch_size, seq_len_encoder, d_model, x.device)
dec_pe = positional_embedding(batch_size, seq_len_decoder, d_model, y.device)
print(enc_pe.shape)
print(dec_pe.shape)

torch.Size([8, 26, 512])
torch.Size([8, 22, 512])


In [60]:
def train(model, optimizer, criterion, train_loader, epochs, heads, batch_size, d_model, 
                                                                     enc_pe, dec_pe, device = device):

    model.train()
    loss_history = []
    accuracy_history = []
    char_history = []
    for epoch in range(epochs):
        
        epoch_loss = 0.0
        epoch_acc = 0.0
        epoch_char_acc = 0.0
        for i, (x, y, z) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")):
            '''
            enc_pe : Positional embedding of encoder input(no grad flow)
            dec_pe : Positional embedding of decoder input(no grad flow)
            esm : Encoder self attention mask
            dsm : Decoder self attention mask
            dcm : Decoder cross attention mask
            '''
            esm = mask_generator(x, heads, type = 'encoder_selfattention').to(device)
            dsm = mask_generator(y, heads, type = 'decoder_selfattention').to(device)
            dcm = mask_generator(x, heads, y.shape[1], type = 'decoder_crossattention').to(device)
            
            preds = model(x, y, esm, dsm, dcm, enc_pe, dec_pe)
                        
            optimizer.zero_grad()
            loss = criterion(preds.contiguous().view(-1, preds.shape[-1]), z.contiguous().view(-1).long())
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            word_acc = calculate_word_accuracy(torch.argmax(preds, dim = -1), z)
            char_acc = calculate_char_accuracy(torch.argmax(preds, dim = -1), z)
            epoch_acc += word_acc
            epoch_char_acc += char_acc

        loss_history.append(epoch_loss)
        accuracy_history.append(epoch_acc / (i + 1))
        char_history.append(epoch_char_acc / (i + 1))
        print(f"Epoch: {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc / (i + 1):.4f}, Char Accuracy: {epoch_char_acc / (i + 1):.4f}")

    return loss_history, accuracy_history, char_history


In [61]:
loss_history, accuracy_history, char_history = train(model, optimizer, loss, train_loader, epochs, heads, batch_size, d_model, 
                                                                     enc_pe, dec_pe, device = device)

Epoch 1/100: 100%|██████████████████████████████████████████████████████████████████| 6400/6400 [15:53<00:00,  6.71it/s]


Epoch: 1, Loss: 6949.8813, Accuracy: 0.0000, Char Accuracy: 69.9651


Epoch 2/100: 100%|██████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.72it/s]


Epoch: 2, Loss: 6472.0356, Accuracy: 0.0000, Char Accuracy: 71.5804


Epoch 3/100: 100%|██████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 3, Loss: 6280.2604, Accuracy: 0.0000, Char Accuracy: 72.2752


Epoch 4/100: 100%|██████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 4, Loss: 6135.7575, Accuracy: 0.0000, Char Accuracy: 72.8248


Epoch 5/100: 100%|██████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 5, Loss: 6013.5035, Accuracy: 0.0000, Char Accuracy: 73.2665


Epoch 6/100: 100%|██████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 6, Loss: 5906.7635, Accuracy: 0.0000, Char Accuracy: 73.6626


Epoch 7/100: 100%|██████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.73it/s]


Epoch: 7, Loss: 5809.7378, Accuracy: 0.0000, Char Accuracy: 74.0323


Epoch 8/100: 100%|██████████████████████████████████████████████████████████████████| 6400/6400 [15:52<00:00,  6.72it/s]


Epoch: 8, Loss: 5710.7725, Accuracy: 0.0000, Char Accuracy: 74.3765


Epoch 9/100: 100%|██████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 9, Loss: 5622.5738, Accuracy: 0.0000, Char Accuracy: 74.7444


Epoch 10/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 10, Loss: 5535.1205, Accuracy: 0.0000, Char Accuracy: 75.0439


Epoch 11/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 11, Loss: 5450.0069, Accuracy: 0.0000, Char Accuracy: 75.3602


Epoch 12/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 12, Loss: 5366.2394, Accuracy: 0.0000, Char Accuracy: 75.6794


Epoch 13/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.73it/s]


Epoch: 13, Loss: 5284.1572, Accuracy: 0.0020, Char Accuracy: 75.9838


Epoch 14/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 14, Loss: 5207.0513, Accuracy: 0.0000, Char Accuracy: 76.2984


Epoch 15/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.73it/s]


Epoch: 15, Loss: 5132.7220, Accuracy: 0.0000, Char Accuracy: 76.5768


Epoch 16/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 16, Loss: 5062.5210, Accuracy: 0.0000, Char Accuracy: 76.8574


Epoch 17/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 17, Loss: 4991.9237, Accuracy: 0.0000, Char Accuracy: 77.1204


Epoch 18/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 18, Loss: 4919.9772, Accuracy: 0.0000, Char Accuracy: 77.3952


Epoch 19/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 19, Loss: 4858.4123, Accuracy: 0.0000, Char Accuracy: 77.6330


Epoch 20/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.73it/s]


Epoch: 20, Loss: 4792.7070, Accuracy: 0.0000, Char Accuracy: 77.8886


Epoch 21/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 21, Loss: 4737.2415, Accuracy: 0.0000, Char Accuracy: 78.1461


Epoch 22/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:52<00:00,  6.72it/s]


Epoch: 22, Loss: 4678.4045, Accuracy: 0.0000, Char Accuracy: 78.3794


Epoch 23/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.73it/s]


Epoch: 23, Loss: 4624.5405, Accuracy: 0.0020, Char Accuracy: 78.5968


Epoch 24/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.73it/s]


Epoch: 24, Loss: 4576.1431, Accuracy: 0.0020, Char Accuracy: 78.7800


Epoch 25/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.73it/s]


Epoch: 25, Loss: 4526.6017, Accuracy: 0.0000, Char Accuracy: 78.9727


Epoch 26/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.73it/s]


Epoch: 26, Loss: 4479.4443, Accuracy: 0.0000, Char Accuracy: 79.1709


Epoch 27/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.73it/s]


Epoch: 27, Loss: 4434.8785, Accuracy: 0.0000, Char Accuracy: 79.3714


Epoch 28/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.74it/s]


Epoch: 28, Loss: 4399.7177, Accuracy: 0.0000, Char Accuracy: 79.5410


Epoch 29/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.73it/s]


Epoch: 29, Loss: 4362.7630, Accuracy: 0.0039, Char Accuracy: 79.6817


Epoch 30/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.73it/s]


Epoch: 30, Loss: 4322.9040, Accuracy: 0.0020, Char Accuracy: 79.8509


Epoch 31/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.74it/s]


Epoch: 31, Loss: 4291.1073, Accuracy: 0.0020, Char Accuracy: 79.9961


Epoch 32/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.74it/s]


Epoch: 32, Loss: 4258.3948, Accuracy: 0.0000, Char Accuracy: 80.1585


Epoch 33/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.74it/s]


Epoch: 33, Loss: 4222.8467, Accuracy: 0.0020, Char Accuracy: 80.2931


Epoch 34/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.73it/s]


Epoch: 34, Loss: 4195.3809, Accuracy: 0.0020, Char Accuracy: 80.4075


Epoch 35/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.73it/s]


Epoch: 35, Loss: 4166.6188, Accuracy: 0.0039, Char Accuracy: 80.5471


Epoch 36/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:50<00:00,  6.73it/s]


Epoch: 36, Loss: 4139.2062, Accuracy: 0.0000, Char Accuracy: 80.6665


Epoch 37/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:49<00:00,  6.74it/s]


Epoch: 37, Loss: 4112.8675, Accuracy: 0.0020, Char Accuracy: 80.7900


Epoch 38/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:51<00:00,  6.72it/s]


Epoch: 38, Loss: 4090.1601, Accuracy: 0.0020, Char Accuracy: 80.9039


Epoch 39/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:54<00:00,  6.71it/s]


Epoch: 39, Loss: 4066.2669, Accuracy: 0.0020, Char Accuracy: 80.9935


Epoch 40/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:53<00:00,  6.71it/s]


Epoch: 40, Loss: 4048.4396, Accuracy: 0.0020, Char Accuracy: 81.0712


Epoch 41/100: 100%|█████████████████████████████████████████████████████████████████| 6400/6400 [15:54<00:00,  6.71it/s]


Epoch: 41, Loss: 4021.8795, Accuracy: 0.0020, Char Accuracy: 81.2072


Epoch 42/100:   5%|███▏                                                              | 308/6400 [00:46<15:14,  6.66it/s]


KeyboardInterrupt: 

In [63]:
def save_model(model, optimizer, epoch, loss_history, accuracy_history, char_history, filepath):
    """
    Save the model state, optimizer state, epoch, and training history to a file.

    Args:
        model (nn.Module): The PyTorch model to be saved.
        optimizer (torch.optim.Optimizer): The optimizer used in training.
        epoch (int): The current epoch number.
        loss_history (list): List of loss values per epoch.
        accuracy_history (list): List of accuracy values per epoch.
        char_history (list): List of character accuracy values per epoch.
        filepath (str): The path to the file where the state will be saved.
    """
    state = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss_history': loss_history,
        'accuracy_history': accuracy_history,
        'char_history': char_history
    }
    torch.save(state, filepath)
    print(f"Model saved to {filepath}")


In [64]:
#save_model(model, optimizer, 41, loss_history, accuracy_history, char_history, 'model_checkpoint.pth')
save_model(model, optimizer, 41, 'model_checkpoint.pth')

NameError: name 'loss_history' is not defined

In [None]:
def load_model(model, optimizer, filepath):
    """
    Load the model state, optimizer state, epoch, and training history from a file.

    Args:
        model (nn.Module): The PyTorch model to be loaded.
        optimizer (torch.optim.Optimizer): The optimizer used in training.
        filepath (str): The path to the file from where the state will be loaded.

    Returns:
        int: The epoch number from which to resume training.
        list: List of loss values per epoch.
        list: List of accuracy values per epoch.
        list: List of character accuracy values per epoch.
    """
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss_history = checkpoint['loss_history']
    accuracy_history = checkpoint['accuracy_history']
    char_history = checkpoint['char_history']
    print(f"Model loaded from {filepath}")
    return epoch, loss_history, accuracy_history, char_history



In [None]:
epoch, loss_history, accuracy_history, char_history = load_model(model, optimizer, 'model_checkpoint.pth')