<a href="https://colab.research.google.com/github/arghyatiger/nlp-projects/blob/master/Seq2Seq_Model_NMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Seq2Seq Machine Translation Model**




**Loading the Data**
---



In [8]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
f_eng_train = open('./gdrive/My Drive/enghin/train.en', 'r')
f_hin_train = open('./gdrive/My Drive/enghin/train.hi', 'r')
f_eng_dev = open('./gdrive/My Drive/enghin/dev.en', 'r')
f_hin_dev = open('./gdrive/My Drive/enghin/dev.hi', 'r')
f_eng_test = open('./gdrive/My Drive/enghin/test.en', 'r')
f_hin_test = open('./gdrive/My Drive/enghin/test.hi', 'r')
# f_eng_train = open('./enghin/train.en', 'r')
# f_hin_train = open('./enghin/train.hi', 'r')
# f_eng_dev = open('./enghin/dev.en', 'r')
# f_hin_dev = open('./enghin/dev.hi', 'r')
# f_eng_test = open('./enghin/test.en', 'r')
# f_hin_test = open('./enghin/test.hi', 'r')
eng_train = f_eng_train.readlines()
hin_train = f_hin_train.readlines()
eng_dev = f_eng_dev.readlines()
hin_dev = f_hin_dev.readlines()
eng_test = f_eng_test.readlines()
hin_test = f_hin_test.readlines()
eng_train = eng_train[:1024]
hin_train = hin_train[:1024]
eng_test = eng_test[:2]
hin_test = hin_test[:2]


**Insights about the data**
---



1. ***DATASET SIZES:***

> No of Training sentences: 49398

> No of Dev sentences: 401

> No of Test sentences : 200

2. ***DATA SAMPLES:***

> Training:

>>ENG: The treatment of cataract is possible through surgery only .

>>HIN: मोतियाबिंद का उपचार केवल शल्य-चिकित्सा द्वारा ही सम्भव है ।

>Dev:

>>ENG: but you will also be safe from eye diseases .

>>HIN: बल्कि आप नेत्ररोगों से भी बचे रहेंगे ।

>Test:

>>ENG: Fresh breath and shining teeth enhance your personality .

>>HIN: ताजा साँसें और चमचमाते दाँत आपके व्यक्तित्व को निखारते हैं ।

**Pre-Processing Of Data**
---



**Loading Pre-Trained Word Embeddings**
---


In [0]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.data import Field, BucketIterator, TabularDataset
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator
from torchtext.vocab import Vectors


In [0]:
eng_vectors = Vectors(name='./wiki-news-300d-1M.vec', cache='./')

In [0]:
hin_vectors = Vectors(name='./cc.hi.300.vec', cache='./')

In [0]:
import pandas as pd



train_data = {'English' : [line for line in eng_train], 'Hindi': [line for line in hin_train]}
test_data = {'English' : [line for line in eng_test], 'Hindi': [line for line in hin_test]}
val_data = {'English' : [line for line in eng_dev], 'Hindi': [line for line in hin_dev]}

df = pd.DataFrame(train_data, columns=["English", "Hindi"])
df_t = pd.DataFrame(test_data, columns=["English", "Hindi"])
df_val = pd.DataFrame(val_data, columns=["English", "Hindi"])

In [0]:
df.to_csv("train.csv")
df_t.to_csv("test.csv")
df_val.to_csv("val.csv")

In [0]:
def tokenize_english(text):
  text = text.replace('\n', ' ')
  return text.split()

def tokenize_hindi(text):
  text = text.replace('\n', ' ')
  return text.split()

In [0]:
BATCH_SIZE = 64
EN_TEXT = Field(tokenize = tokenize_english, init_token = '<sos>', eos_token = '<eos>', lower=True)
HI_TEXT = Field(tokenize = tokenize_hindi, init_token = '<sos>', eos_token = '<eos>')
data_fields = [('English', EN_TEXT), ('Hindi', HI_TEXT)]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
train, val, test = TabularDataset.splits(path='./', train='train.csv', validation='val.csv', test='test.csv', format='csv', fields=data_fields)

In [260]:
print(f"Number of training examples: {len(train.examples)}")
print(f"Number of validation examples: {len(val.examples)}")
print(f"Number of testing examples: {len(test.examples)}")

Number of training examples: 1025
Number of validation examples: 402
Number of testing examples: 3


In [0]:
EN_TEXT.build_vocab(train, vectors=eng_vectors)
HI_TEXT.build_vocab(train, vectors=hin_vectors)

In [0]:
train_iter, val_iter, test_iter = BucketIterator.splits((train, val, test), batch_size=BATCH_SIZE, sort_key=lambda x: len(x.English), device=device)

In [0]:
vocab_english = EN_TEXT.vocab
vocab_hindi = HI_TEXT.vocab

In [264]:
print(len(vocab_english))
print(len(vocab_hindi))

1028
3016


**Neural Network Model**
---



In [0]:
def create_emb_layer(vocab, inp_dim, emb_dim, non_trainable=False):
#     num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(len(vocab), emb_dim)
    emb_layer.weight.data.copy_(vocab.vectors)
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer

In [0]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = create_emb_layer(vocab_english, self.input_dim, self.emb_dim, True)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
#         print()
        embedded = self.dropout(self.embedding(src))        
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [0]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = create_emb_layer(vocab_hindi, self.output_dim, self.emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        
        self.out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        input = input.unsqueeze(0)
        
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        prediction = self.out(output.squeeze(0))
        
        return prediction, hidden, cell

In [0]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        
        batch_size = trg.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, max_len):
            
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            input = (trg[t] if teacher_force else top1)
        
        return outputs

**Training Of Model**
---



In [0]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.English
        trg = batch.Hindi
        
        print(i, src.shape, trg.shape)
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [228]:
def get_sent()

SyntaxError: ignored

In [0]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.English
            trg = batch.Hindi
            output = model(src, trg, 0)

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)
            
            print(output.shape, trg.shape)
            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [0]:
# def evaluate_results(model, source_sentence, target_sentence):
    
#     model.eval()
#     src = to_idx_source(source_sentence)
#     trg = to_idx_target(target_sentence)
#     output = model(src, trg, 0) #turn off teacher forcing
    
        
#     return epoch_loss / len(iterator)

In [0]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [232]:
INPUT_DIM = len(vocab_english)
OUTPUT_DIM = len(vocab_hindi)
print(INPUT_DIM, OUTPUT_DIM)

1028 3016


In [233]:
ENC_EMB_DIM = 300
DEC_EMB_DIM = 300
HID_DIM = 16
N_LAYERS = 1
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

optimizer = optim.Adam(model.parameters())
pad_idx = HI_TEXT.vocab.stoi['<PAD>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

  "num_layers={}".format(dropout, num_layers))


In [234]:
import os
import time 
import random
import math


N_EPOCHS = 10
CLIP = 1
SAVE_DIR = 'models'
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'model.pt')

best_valid_loss = float('inf')

if not os.path.isdir(f'{SAVE_DIR}'):
    os.makedirs(f'{SAVE_DIR}')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_iter, criterion)
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

0 torch.Size([3, 64]) torch.Size([50, 64])
1 torch.Size([3, 64]) torch.Size([49, 64])
2 torch.Size([3, 64]) torch.Size([50, 64])
3 torch.Size([3, 64]) torch.Size([38, 64])
4 torch.Size([3, 64]) torch.Size([45, 64])
5 torch.Size([3, 64]) torch.Size([44, 64])
6 torch.Size([3, 1]) torch.Size([20, 1])
7 torch.Size([3, 64]) torch.Size([50, 64])
8 torch.Size([3, 64]) torch.Size([44, 64])
9 torch.Size([3, 64]) torch.Size([47, 64])
10 torch.Size([3, 64]) torch.Size([47, 64])
11 torch.Size([3, 64]) torch.Size([40, 64])
12 torch.Size([3, 64]) torch.Size([40, 64])
13 torch.Size([3, 64]) torch.Size([48, 64])
14 torch.Size([3, 64]) torch.Size([45, 64])
15 torch.Size([3, 64]) torch.Size([62, 64])
16 torch.Size([3, 64]) torch.Size([38, 64])
torch.Size([2240, 3016]) torch.Size([2240])
torch.Size([2496, 3016]) torch.Size([2496])
torch.Size([2368, 3016]) torch.Size([2368])
torch.Size([3264, 3016]) torch.Size([3264])
torch.Size([2176, 3016]) torch.Size([2176])
torch.Size([2880, 3016]) torch.Size([2880])


In [226]:
model.load_state_dict(torch.load(MODEL_SAVE_PATH))

test_loss = evaluate(model, test_iter, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

torch.Size([30, 3016]) torch.Size([30])
| Test Loss: 4.695 | Test PPL: 109.375 |


In [186]:
len(eng_test)

200