Basic code for machine translation using Seq2Seq (No Attention Mechanism Used). Can be made more computationally efficient by masking padded tokens. Other possibilities include: Better Initialization, Better Architecture, Better Training.
1. Prepare Data
    1.1 Read data
    1.2 Create normalized pairs (create + normalize: convert Unicode to ASCII, remove non-letter characters, trim) (a list of lists; each inner list is one pair)
    1.3 Filter pairs
    1.4 Build vocab (write the Vocab class, create a Vocab object for each language, and build each vocab)
2. Define Encoder and Decoder
3. Prepare Data and DataLoader
4. Training
5. Evaluation

In [13]:
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset, DataLoader, random_split

from io import open
import unicodedata
import re

In [14]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device(type='cuda' if torch.cuda.is_available() else 'cpu', index=0)

In [15]:
#unicode 2 ascii, remove non-letter characters, trim
def normalizeString(s):
    """Strip accents from *s* and reduce it to letters plus "!" and "?".

    The string is decomposed with NFD so that accents become separate
    combining marks (Unicode category "Mn"), which are dropped. A space is
    then inserted in front of every ".", "!" and "?", and each run of
    characters other than a-z, A-Z, "!" and "?" (which includes ".") is
    collapsed to a single space. Surrounding whitespace is removed from
    the returned string.
    """
    decomposed = unicodedata.normalize('NFD', s)
    without_marks = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )
    # Detach sentence punctuation from the word in front of it.
    spaced = re.sub(r"([.!?])", r" \1", without_marks)
    # Everything that is not a letter, "!" or "?" becomes one space.
    cleaned = re.sub(r"[^a-zA-Z!?]+", r" ", spaced)
    return cleaned.strip()

#create list of pairs (list of lists) (no filtering)
def createNormalizedPairs():
    """Build the unfiltered list of normalized sentence pairs.

    Reads the module-level ``data`` list. Each entry is split on a tab
    into exactly two sentences (any other number of fields raises
    ``ValueError`` from the unpacking); both sentences are lower-cased,
    stripped and passed through ``normalizeString``.

    Returns a list of two-element lists, in the same order as ``data``.
    """
    def _clean(text):
        return normalizeString(text.lower().strip())

    normalized = []
    for line in data:
        first, second = line.split('\t')
        normalized.append([_clean(first), _clean(second)])
    return normalized

#filter pairs
max_length = 10


def filterPairs(initpairs):
    """Keep only short pairs whose first sentence starts with a known prefix.

    A pair is kept when both of its sentences have fewer than
    ``max_length`` space-separated words and its first sentence
    (lower-cased) starts with one of the English prefixes below.

    Prints the number of surviving pairs and returns them as a list of
    the original pair objects, in input order.
    """
    eng_prefixes = (
        "i am ", "i m ",
        "he is", "he s ",
        "she is", "she s ",
        "you are", "you re ",
        "we are", "we re ",
        "they are", "they re "
    )

    def _keep(pair):
        # Same evaluation order as a short-circuiting `and` chain.
        if len(pair[0].split(" ")) >= max_length:
            return False
        if len(pair[1].split(" ")) >= max_length:
            return False
        return pair[0].lower().startswith(eng_prefixes)

    pairs = [pair for pair in initpairs if _keep(pair)]

    print("Number of pairs after filtering:", len(pairs))
    return pairs

In [16]:
class Vocab:
    """Word <-> index vocabulary for one language.

    Index 0 is reserved for 'SOS' and index 1 for 'EOS'; new words receive
    consecutive indices starting at 2.

    Attributes (set in ``__init__``):
        name: label given by the caller (e.g. 'eng', 'fre').
        word2index: dict mapping word -> integer index.
        index2word: dict mapping integer index -> word.
        word2count: dict mapping word -> number of times it was seen by
            ``buildVocab``.
        nwords: number of entries in ``word2index`` (next free index).
    """

    def __init__(self, name):
        self.name=name
        self.word2index={'SOS':0, 'EOS':1}
        self.index2word={0:'SOS', 1:'EOS'}
        self.word2count={}
        self.nwords=2

    def buildVocab(self,s):
        """Add every space-separated word of sentence *s* to the vocabulary.

        Unseen words are assigned the next free index; every occurrence
        (new or not) increments the word's count.
        """
        for word in s.split(" "):
            if word not in self.word2index:
                self.word2index[word]=self.nwords
                self.index2word[self.nwords]=word
                self.nwords+=1
            # 'SOS'/'EOS' are pre-registered in word2index but have no entry
            # in word2count, so a plain `+= 1` would raise KeyError if a
            # sentence contained one of them; .get() starts them at zero.
            self.word2count[word]=self.word2count.get(word, 0)+1

In [17]:
class Encoder(nn.Module):
    """Sequence encoder: embedding -> dropout -> single-layer LSTM.

    Args:
        input_size: number of rows in the embedding table (the script
            passes the source vocabulary size).
        embed_size: dimensionality of each embedding vector.
        hidden_size: size of the LSTM hidden and cell states.
        dropout_p: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, embed_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embed_size)
        self.dropout = nn.Dropout(p=dropout_p)
        # batch_first=True: the LSTM treats dimension 0 of its input as the batch.
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, batch_first=True)

    def forward(self, x):
        """Encode a tensor of token ids.

        Returns ``(outputs, (hidden, cell))`` exactly as produced by the
        LSTM: the per-step outputs and the final hidden/cell state pair.
        """
        embedded = self.dropout(self.embedding(x))
        outputs, state = self.lstm(embedded)
        return outputs, state


In [18]:
import torch
import torch.nn as nn

class Decoder(nn.Module):
    """Sequence decoder: embedding -> ReLU -> LSTM -> linear -> log-softmax.

    Args:
        output_size: number of rows in the embedding table and number of
            output classes (the script passes the target vocabulary size).
        embed_size: dimensionality of each embedding vector.
        hidden_size: size of the LSTM hidden and cell states.
    """

    def __init__(self, output_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=embed_size)
        self.relu = nn.ReLU()
        # batch_first=True: the LSTM treats dimension 0 of its input as the batch.
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x, prev_hidden):
        """Run the decoder over token ids *x* starting from *prev_hidden*.

        *prev_hidden* is the ``(hidden, cell)`` pair handed to the LSTM.
        Returns the log-probabilities over the output classes for every
        step, together with the updated ``(hidden, cell)`` pair.
        """
        activated = self.relu(self.embedding(x))
        lstm_out, state = self.lstm(activated, prev_hidden)
        log_probs = self.log_softmax(self.linear(lstm_out))
        return log_probs, state


In [19]:
def get_input_ids(sentence,langobj):
    """Convert *sentence* into a 1-D tensor of vocabulary indices.

    Words are split on single spaces and looked up in
    ``langobj.word2index``. An 'EOS' index is always appended. For every
    vocabulary whose ``name`` is not 'fre', an 'SOS' index is additionally
    placed at the front (translation-direction sensitive: 'fre' is the
    source side in this script).
    """
    lookup = langobj.word2index
    token_ids = [lookup[word] for word in sentence.split(" ")]
    token_ids.append(lookup['EOS'])
    if langobj.name != 'fre': #translation-direction sensitive
        token_ids = [lookup['SOS']] + token_ids
    return torch.tensor(token_ids)

In [20]:
class CustomDataset(Dataset):
    """Dataset over the module-level ``pairs`` list.

    Each item is a ``(source_ids, target_ids)`` tuple of zero-initialised
    int64 tensors with fixed lengths ``max_length + 1`` and
    ``max_length + 2``; the leading entries hold the ids produced by
    ``get_input_ids`` and the remainder stays 0.
    """

    def __init__(self):
        super().__init__()

    def __len__(self):
        # `length` is a module-level name set by the driver script.
        return length

    def __getitem__(self,idx):
        # translation-direction sensitive: element 0 of a pair is the
        # target sentence (looked up in `eng`), element 1 is the source
        # sentence (looked up in `fre`).
        target_text = pairs[idx][0]
        source_text = pairs[idx][1]

        source_buf = torch.zeros(max_length+1, dtype=torch.int64)
        target_buf = torch.zeros(max_length+2, dtype=torch.int64)

        # Source gets words + EOS; target gets SOS + words + EOS.
        n_source = len(source_text.split(" ")) + 1
        n_target = len(target_text.split(" ")) + 2
        source_buf[:n_source] = get_input_ids(source_text, fre)
        target_buf[:n_target] = get_input_ids(target_text, eng)

        return source_buf, target_buf

In [21]:
def train_one_epoch():
    """Run one pass over ``train_dataloader`` and return the mean batch loss.

    Uses the module-level ``encoder``, ``decoder``, ``loss_fn``, ``opte``,
    ``optd`` and ``device``. The decoder is fed the ground-truth target
    tokens (all but the last position) and is scored against the targets
    shifted by one position.
    """
    encoder.train()
    decoder.train()
    running_loss = 0

    for src_batch, tgt_batch in train_dataloader:
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)

        # The encoder's final state initialises the decoder.
        _, enc_state = encoder(src_batch)
        log_probs, _ = decoder(tgt_batch[:, 0:-1], enc_state)

        # Flatten predictions and shifted targets so the loss sees one
        # row per target position.
        flat_log_probs = log_probs.view(-1, log_probs.shape[-1])
        targets = tgt_batch[:, 1:].reshape(-1)

        loss = loss_fn(flat_log_probs, targets)
        running_loss += loss.item()

        opte.zero_grad()
        optd.zero_grad()

        loss.backward()

        opte.step()
        optd.step()

    return running_loss / len(train_dataloader)

In [22]:
def ids2Sentence(ids,vocab):
    """Turn a tensor of token ids back into a space-terminated sentence.

    Ids equal to 0 are skipped. Every other id is mapped through
    ``vocab.index2word`` and followed by a single space; conversion stops
    right after the first id equal to 1 (whose word is still included).
    """
    words = []
    for token in ids.squeeze():
        if token == 0:
            continue
        words.append(vocab.index2word[token.item()])
        if token == 1:
            break
    return "".join(word + " " for word in words)

In [23]:
#eval loop (written assuming batch_size=1)
def eval_one_epoch():
    """Greedy-decode every item of ``test_dataloader`` and return the mean loss.

    Written for a test loader with batch_size=1 (the ``.item()`` calls
    below require a single element). For each item the source, ground-truth
    and predicted sentences are printed. Uses the module-level ``encoder``,
    ``decoder``, ``loss_fn``, ``device``, ``eng``, ``fre`` and ``max_length``.
    """
    encoder.eval()
    decoder.eval()
    track_loss=0
    with torch.no_grad():
        for i, (s_ids,t_ids) in enumerate(test_dataloader):
            s_ids=s_ids.to(device)
            t_ids=t_ids.to(device)
            encoder_outputs, encoder_hidden=encoder(s_ids)
            # The encoder's returned state is handed to the decoder unchanged.
            decoder_hidden=encoder_hidden #n_dim=3
            # First target column is the start token fed to the decoder.
            input_ids=t_ids[:,0]
            yhats=[]
            pred_sentence=""
            # Decode one token at a time, for at most max_length steps.
            for j in range(1,max_length+1): #j starts from 1
                probs, decoder_hidden = decoder(input_ids.unsqueeze(1),decoder_hidden)
                yhats.append(probs)
                # Greedy choice: index of the single highest-scoring class.
                _,input_ids=torch.topk(probs,1,dim=-1)
                input_ids=input_ids.squeeze(1,2) #still a tensor
                word=eng.index2word[input_ids.item()] #batch_size=1
                pred_sentence+=word + " "
                # Index 1 is 'EOS': stop once it has been emitted.
                if input_ids.item() == 1: #batch_size=1
                    break

            src_sentence=ids2Sentence(s_ids,fre) #translation-direction sensitive
            gt_sentence=ids2Sentence(t_ids,eng) #translation-direction sensitive

            print("-----------------------------------")
            print("Source Sentence:",src_sentence)
            print("GT Sentence:",gt_sentence)
            print("Predicted Sentence:",pred_sentence)

            yhats_cat=torch.cat(yhats,dim=1)
            yhats_reshaped=yhats_cat.view(-1,yhats_cat.shape[-1])
            # j still holds the last decoding step, i.e. the number of
            # predictions made, so the target slice has the same length.
            gt=t_ids[:,1:j+1]
            gt=gt.view(-1)
#             gt --> ground truth


            loss=loss_fn(yhats_reshaped,gt)
            track_loss+=loss.item()


        print("-----------------------------------")
        return track_loss/len(test_dataloader)

In [24]:
#read data
# Each line of the file is one tab-separated sentence pair. The context
# manager closes the file handle, and the explicit UTF-8 encoding keeps the
# accented French text independent of the platform's default encoding.
with open("/kaggle/input/eng-fra/eng-fra.txt", encoding="utf-8") as data_file:
    data=data_file.read().strip().split('\n')
print("Total number of pairs:",len(data))

#create pairs (create + normalize)
initpairs=createNormalizedPairs() #list of lists. Each inner list is a pair

# Task 1
print("S1 and S2")
print(initpairs[300])


#filter pairs
pairs=filterPairs(initpairs)
length=len(pairs) # read by CustomDataset.__len__
print(pairs[300])

#create Vocab objects for each language
eng=Vocab('eng')
fre=Vocab('fre')

#build the vocab
# pair[0] is the English sentence, pair[1] the French one
for pair in pairs:
    eng.buildVocab(pair[0])
    fre.buildVocab(pair[1])

#print vocab size
print("English Vocab Length:",eng.nwords)
print("French Vocab Length:",fre.nwords)

# 99% / 1% random train/test split of the filtered pairs
dataset=CustomDataset()
train_dataset,test_dataset=random_split(dataset,[0.99,0.01])

print(train_dataset[300])

batch_size=32
# NOTE(review): shuffle=False means every epoch visits the training batches
# in the same order -- confirm this is intended.
train_dataloader=DataLoader(dataset=train_dataset,batch_size=batch_size, shuffle=False)
# eval_one_epoch is written for batch_size=1
test_dataloader=DataLoader(dataset=test_dataset,batch_size=1, shuffle=False)


embed_size=100
hidden_size=128

# French is the source language (encoder), English the target (decoder)
encoder=Encoder(fre.nwords,embed_size,hidden_size).to(device) #translation-direction sensitive
decoder=Decoder(eng.nwords,embed_size,hidden_size).to(device) #translation-direction sensitive

# NLLLoss pairs with the decoder's LogSoftmax output. No ignore_index is
# set, so zero-padded target positions also contribute to the loss.
loss_fn=nn.NLLLoss().to(device)
lr=0.001
# separate optimizers for encoder and decoder, same learning rate
opte=optim.Adam(params=encoder.parameters(), lr=lr)
optd=optim.Adam(params=decoder.parameters(), lr=lr)

n_epochs=100

# training: prints the mean batch loss after every epoch
for e in range(n_epochs):
    print("Epoch=",e+1, " Loss=", train_one_epoch(), sep="")

# evaluation: a single pass over the test split
for e in range(1):
    print("Epoch=",e+1, " Loss=", eval_one_epoch(), sep="")

Total number of pairs: 135842
S1 and S2
['i m ugly', 'je suis laid']
Number of pairs after filtering: 11445
['i m all set', 'je suis tout a fait prete']
English Vocab Length: 2991
French Vocab Length: 4601
(tensor([   2,  296,    7,  245,  289, 1149,  498,   10, 3953,    1,    0]), tensor([   0,    2,    3,  146,   43,  157, 2465,  290,    1,    0,    0,    0]))
Epoch=1 Loss=2.552940502636869
Epoch=2 Loss=1.7787637223660107
Epoch=3 Loss=1.5518498612121796
Epoch=4 Loss=1.3953741412767222
Epoch=5 Loss=1.2754920759671171
Epoch=6 Loss=1.1752160000129486
Epoch=7 Loss=1.0883595975352005
Epoch=8 Loss=1.0100452740427475
Epoch=9 Loss=0.9394859766456443
Epoch=10 Loss=0.8762586755651823
Epoch=11 Loss=0.8176159575791426
Epoch=12 Loss=0.7607350636535967
Epoch=13 Loss=0.7101277927277794
Epoch=14 Loss=0.6606718645968908
Epoch=15 Loss=0.6164507577956563
Epoch=16 Loss=0.574978108011501
Epoch=17 Loss=0.537022390499921
Epoch=18 Loss=0.5003775261657338
Epoch=19 Loss=0.4677928564833923
Epoch=20 Loss=0.4370