In [1]:
import numpy  as np
import pandas as  pd
import torch 
import torch.nn as nn 
import torch.optim as optim
from  collections import Counter
import  warnings
import random 
from tqdm import tqdm
from torch.utils.data import DataLoader ,TensorDataset
import nltk 
from nltk.translate.bleu_score import sentence_bleu
warnings.filterwarnings('ignore')

In [2]:
DATA_PATH = "/home/bhavit/Desktop/Road_to_transformers/hind-english/hindi_english_parallel.csv"
language_data = pd.read_csv(DATA_PATH)

In [3]:
HINDI = language_data["hindi"][0:1000]
ENGLISH = language_data["english"][0:1000]

In [4]:
print(type(HINDI[0]))

<class 'str'>


In [5]:
def clean_tokenize(sentence ,start_token="<sos>",end_token="<eos>"):
    sentence = [f"{start_token} {s} {end_token}" for s in sentence]
    sentence = [s.strip()for s in  sentence if s.strip()]
    tokenize_sent = [s.split()for s in sentence]
    return tokenize_sent

In [6]:
HINDI = clean_tokenize(HINDI)
ENGLISH = clean_tokenize(ENGLISH)

In [7]:
print(ENGLISH[0:3])
print("\n\n")
print(HINDI[0:3])

[['<sos>', 'Give', 'your', 'application', 'an', 'accessibility', 'workout', '<eos>'], ['<sos>', 'Accerciser', 'Accessibility', 'Explorer', '<eos>'], ['<sos>', 'The', 'default', 'plugin', 'layout', 'for', 'the', 'bottom', 'panel', '<eos>']]



[['<sos>', 'अपने', 'अनुप्रयोग', 'को', 'पहुंचनीयता', 'व्यायाम', 'का', 'लाभ', 'दें', '<eos>'], ['<sos>', 'एक्सेर्साइसर', 'पहुंचनीयता', 'अन्वेषक', '<eos>'], ['<sos>', 'निचले', 'पटल', 'के', 'लिए', 'डिफोल्ट', 'प्लग-इन', 'खाका', '<eos>']]


will  create a  look up  table  for the language 


In [8]:
def build_vocab(tokenized_sent ,max_size = 1000 ,min_freq =2):
    counter = Counter([token for sentence in  tokenized_sent for token in  sentence])
    vocab = {"<pad>":0,"<sos>":1,"<eos>":2,"<unk>":3}

    for token ,freq in counter.most_common(max_size):
        if  freq >= min_freq and token not in vocab:
            vocab[token] = len(vocab)
          
    
    return vocab


In [9]:
hindi_vocab = build_vocab(HINDI)
english_vocab = build_vocab(ENGLISH)

In [10]:
print("HINDI VOCABLURY",hindi_vocab)
print("ENGLISH VOCABULARY",english_vocab)

HINDI VOCABLURY {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, 'को': 4, 'के': 5, 'है': 6, 'करें': 7, 's': 8, '(_': 9, 'नहीं': 10, 'का': 11, 'हाइलाइट': 12, 'लिए': 13, '%': 14, 'से': 15, 'गए': 16, 'अनुप्रयोग': 17, 'प्लग-इन': 18, 'दृश्य': 19, 'पहुंचनीयता': 20, 'की': 21, 'रंग': 22, 'और': 23, 'चुने': 24, 'कोई': 25, 'में': 26, 'सूची': 27, 'मान': 28, 'घटना': 29, 'स्थिति': 30, 'स्तंभ': 31, 'अंतराफलक': 32, 'पटल': 33, 'नाम': 34, 'डिफोल्ट': 35, 'कार्रवाई': 36, 'हैं': 37, 'किया': 38, 'गया': 39, 'सीमांत': 40, 'भराई': 41, 'विवरण': 42, 'फोकस': 43, 'है,': 44, 'ताजा': 45, 'एक्सेर्साइसर': 46, 'संपादित': 47, 'पसंद': 48, 'इस': 49, 'सभी': 50, 'नया': 51, 'समय': 52, 'बक्से': 53, 'मानिटर': 54, 'डेस्कटोप': 55, 'चुनी': 56, 'गई': 57, 'सारणी': 58, 'पंक्ति': 59, 'चयनीय': 60, 'शिशु': 61, 'सूची%': 62, 'd': 63, 'com)': 64, 'ऊपरी': 65, '(नोड)': 66, 'खाका': 67, 'शीर्षकः': 68, 'करता': 69, 'सूचना': 70, 'अन्वेषक': 71, 'पाठ': 72, 'ही': 73, 'एक': 74, 'दृश्य...': 75, 'उन': 76, 'प्लग-इनों': 77, 'जिन्हें': 78, 'रूप': 79, 'नि

extracting the indices based on the look up table

In [11]:
def sentence_to_indices(sentence, vocab):
    # print(sentence ,vocab)
    return [vocab.get(token, vocab["<unk>"]) for token in sentence]

# Convert tokenized sentences to indexed sequences
indexed_hindi = [sentence_to_indices(sentence, hindi_vocab) for sentence in HINDI]
indexed_english = [sentence_to_indices(sentence, english_vocab) for sentence in ENGLISH]

print("Indexed HINDI Sentences:", indexed_hindi)
print("Indexed ENGLISH Sentences:", indexed_english)

Indexed HINDI Sentences: [[1, 131, 17, 4, 20, 132, 11, 133, 134, 2], [1, 46, 20, 71, 2], [1, 373, 33, 5, 13, 35, 18, 67, 2], [1, 65, 33, 5, 13, 35, 18, 67, 2], [1, 76, 77, 21, 27, 78, 35, 79, 15, 80, 38, 39, 6, 2], [1, 121, 4, 12, 142, 2], [1, 81, 82, 66, 4, 143, 52, 12, 53, 21, 121, 2], [1, 40, 144, 5, 22, 4, 12, 7, 2], [1, 12, 83, 16, 40, 11, 22, 23, 145, 2], [1, 41, 5, 22, 4, 12, 7, 2], [1, 12, 38, 39, 41, 11, 22, 23, 146, 2], [1, 147, 148, 2], [1, 49, 52, 149, 150, 38, 39, 151, 152, 84, 153, 154, 26, 155, 7, 2], [1, 156, 85, 4, 157, 2], [1, 158, 2], [1, 125, 2], [1, 28, 2], [1, 159, 86, 2], [1, 49, 52, 24, 16, 87, 15, 160, 161, 5, 13, 88, 86, 2], [1, 29, 54, 2], [1, 162, 4, 54, 7, 9, 163, 2], [1, 89, 4, 164, 165, 166, 2], [1, 50, 2], [1, 24, 16, 17, 2], [1, 24, 16, 87, 2], [1, 167, 2], [1, 29, 54, 2], [1, 24, 16, 168, 23, 169, 15, 170, 171, 172, 173, 174, 175, 176, 6, 2], [1, 90, 177, 29, 4, 12, 178, 2], [1, 29, 179, 180, 181, 2], [1, 29, 182, 183, 2], [1, 25, 42, 10, 2], [1, 119, 

In [12]:
#  shuffling and splitting the data 
data = list(zip(indexed_hindi,indexed_english))
random.seed(42)
random.shuffle(data)
split_idx = int(0.8 *len(data))
print("SPLIT INDEX",split_idx)

train_data = data[:split_idx]
test_data = data[split_idx:]



SPLIT INDEX 800


In [13]:
train_hindi, train_english = zip(*train_data)
test_hindi, test_english = zip(*test_data)

In [14]:
train_hindi[0]

[1, 46, 2]

In [15]:
def pad_sentence(sentences,pad_token=0):
    print(sentences)
    max_len = max(len(sentence) for sentence in  sentences)
    return [sentence + [pad_token] *(max_len - len(sentence)) for sentence in sentences]


In [16]:
print(len(train_english),len(train_hindi))
print(len(test_english),len(test_hindi))

800 800
200 200


In [17]:
train_pad_english = pad_sentence(train_english)
train_pad_hindi = pad_sentence(train_hindi)
test_pad_english = pad_sentence(test_english)
test_pad_hindi = pad_sentence(test_hindi)

train_eng_tensor = torch.tensor(train_pad_english)
train_hind_tensor = torch.tensor(train_pad_hindi)
test_hindi_tensor = torch.tensor(test_pad_hindi)
test_eng_tensor = torch.tensor(test_pad_english)

([1, 43, 2], [1, 240, 2], [1, 212, 213, 8, 33, 36, 214, 2], [1, 246, 2], [1, 99, 2], [1, 403, 2], [1, 212, 213, 8, 33, 36, 214, 2], [1, 340, 341, 2], [1, 323, 324, 325, 15, 326, 2], [1, 9, 5, 270, 5, 81, 271, 5, 81, 2], [1, 31, 355, 2], [1, 413, 414, 2], [1, 134, 2], [1, 131, 4, 132, 65, 2], [1, 218, 2], [1, 76, 77, 2], [1, 349, 2], [1, 170, 69, 2], [1, 308, 2], [1, 306, 2], [1, 177, 2], [1, 100, 8, 6, 3, 363, 8, 6, 410, 411, 2], [1, 7, 59, 13, 8, 6, 18, 32, 2], [1, 195, 2], [1, 17, 66, 34, 143, 2], [1, 402, 92, 2], [1, 4, 64, 130, 2], [1, 321, 24, 322, 2], [1, 348, 2], [1, 31, 355, 2], [1, 153, 4, 68, 2], [1, 294, 295, 296, 297, 298, 299, 300, 2], [1, 7, 20, 11, 6, 50, 16, 2], [1, 74, 66, 75, 12, 2], [1, 250, 5, 78, 10, 251, 37, 73, 2], [1, 351, 2], [1, 17, 59, 13, 2], [1, 4, 277, 2], [1, 4, 176, 2], [1, 396, 2], [1, 100, 2], [1, 196, 2], [1, 4, 42, 2], [1, 4, 356, 2], [1, 28, 193, 194, 2], [1, 3, 363, 2], [1, 349, 2], [1, 312, 16, 2], [1, 17, 318, 2], [1, 240, 2], [1, 99, 2], [1, 321

In [18]:
# creating data loaders 
batch_size = 8
train_dataset = TensorDataset(train_hind_tensor,train_eng_tensor)
test_dataset = TensorDataset(test_hindi_tensor,test_eng_tensor)

train_dataset = DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
test_dataset = DataLoader(test_dataset,batch_size=batch_size,shuffle=False)

In [19]:
# Check sample batches
# for batch_idx, (hindi_batch, english_batch) in enumerate(train_data):
    # print(f"Train Batch {batch_idx + 1}")
    # print("HINDI batch:\n", hindi_batch)
    # print("ENGLISH batch:\n", english_batch)

# for batch_idx, (hindi_batch, english_batch) in enumerate(train_data):
    # print(f"Test Batch {batch_idx + 1}")
    # print("HINDI batch:\n", hindi_batch)
    # print("ENGLISH batch:\n", english_batch)

ENCODER ARCH

lstm excpets the input shape to  be [sequence_length,batch_size,embedding size]
The embedding input shape will  be  sequence length  and batch size by defualt in data loader its in shape of batchsize and sequence lenght so  we need to  transpose it 

The output from  the LSTM  wll  be of  shape [num_layers, batch_size, hidden_size]
This hidden state represents the final hidden state of the LSTM for each sequence in the batch after processing all time steps.

This final cell state will be passed to the next time step if the model continues generating or processing sequences.

HINDI  BATCH  BEFORE TRANSPOSE torch.Size([2, 31])
HINDI  BATCH  AFTER TRANSPOSE torch.Size([31, 2])
SHAPE  OF EMBEDDING  torch.Size([31, 2, 256])
Batch 1:
Hidden state shape: torch.Size([2, 2, 512])
Cell state shape: torch.Size([2, 2, 512])


In [20]:
class Encoder(nn.Module):
    def __init__(self,input_size,embedding_size,hidden_size,num_layers,dropout):
        super(Encoder,self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size,embedding_size)
        self.lstm = nn.LSTM(embedding_size,hidden_size,num_layers,dropout=dropout)


    def forward(self,x):
        embedding = self.embedding(x)
        embedding = self.dropout(embedding)
        outputs ,(hidden,cell) = self.lstm(embedding)


        return hidden,cell

Decoder 

In [21]:
class Decoder(nn.Module):
    def __init__ (self,input_size,embedding_size,hidden_size,output_size,num_layers,dropout):
        super(Decoder,self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size,embedding_size)
        self.lstm = nn.LSTM(embedding_size,hidden_size,num_layers,dropout=dropout)
        self.fc = nn.Linear(hidden_size,output_size)

    def forward(self,x,hidden,cell):
        x = x.unsqueeze(0) # Add sequence length dimension
        embedding = self.embedding(x)
        embedding = self.dropout(embedding)
        output , (hidden,cell) = self.lstm(embedding,(hidden,cell))
        prediction = self.fc(output)
        prediction = prediction.squeeze(0) # Remove sequence length dimension

        return prediction ,hidden,cell

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [23]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder ,decoder):
        super(Seq2Seq,self).__init__()
        self.encoder = encoder
        self.decoder = decoder 

    
    def forward(self, source ,target, teacher_force_ratio = 0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(english_vocab)

        outputs = torch.zeros(target_len,batch_size,target_vocab_size).to(device=device)

        hidden, cell = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs


In [None]:
input_size_encoder = len(hindi_vocab)
input_size_decoder = len(english_vocab)
output_size = len(english_vocab)
encoder_embedding_size = 256
decoder_embedding_size = 256
hidden_size = 1024
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5
learning_rate = 0.001
num_epochs = 150
LOSS = []



Training

In [25]:
encoder = Encoder(input_size=input_size_encoder,
                  embedding_size=encoder_embedding_size,
                  hidden_size=hidden_size,
                  num_layers=num_layers,
                  dropout=enc_dropout).to(device)

decoder = Decoder(input_size=input_size_decoder,
                  embedding_size=decoder_embedding_size,
                  hidden_size=hidden_size,
                  output_size=output_size,
                  num_layers=num_layers,
                  dropout=dec_dropout).to(device)



model =  Seq2Seq(encoder,decoder).to(device)
optimizer = optim.Adam(model.parameters(),lr = learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=english_vocab["<pad>"])

                  


In [None]:

for epoch in  range(num_epochs):
    print(f"EPOCH {epoch + 1}/{num_epochs}")
    model.train()
    epoch_loss = 0

    # use tqdm for progress tracking 
    loop = tqdm(enumerate(train_dataset),total=len(train_dataset),leave=True)
    for batch_idx , (source,target) in loop :
        source = source.to(device).transpose(0,1) # Transpose to sequence_length ,batch_size because LSTM expects that way
        target = target.to(device).transpose(0,1)

        # FORWARD PASS 
        optimizer.zero_grad()
        output = model(source,target)

        # Reshape for loss computation
        # Flatten output: [target_len, batch_size, target_vocab_size] -> [(target_len * batch_size), target_vocab_size]
        # Flatten target: [target_len, batch_size] -> [(target_len * batch_size)]

        output = output[1:].reshape(-1 ,output.shape[2]) # skip <sos> token
        target = target[1:].reshape(-1)

        loss = criterion(output,target)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) #  For exploding gradient problems in LSTM
        optimizer.step()

        epoch_loss += loss.item()

        loop.set_description(f"Epoch [{epoch + 1}/{num_epochs}]")
        loop.set_postfix(loss=loss.item())  

        total_loss = f"Epoch Loss: {epoch_loss / len(train_dataset):.4f}"
        LOSS.append(total_loss)
    print(total_loss)



EPOCH 1/150


Epoch [1/150]: 100%|██████████| 100/100 [00:05<00:00, 16.71it/s, loss=3.98]


Epoch Loss: 4.6152
EPOCH 2/150


Epoch [2/150]: 100%|██████████| 100/100 [00:05<00:00, 17.96it/s, loss=4.01]


Epoch Loss: 3.9882
EPOCH 3/150


Epoch [3/150]: 100%|██████████| 100/100 [00:05<00:00, 18.12it/s, loss=3.12]


Epoch Loss: 3.4540
EPOCH 4/150


Epoch [4/150]: 100%|██████████| 100/100 [00:05<00:00, 18.05it/s, loss=2.6]


Epoch Loss: 3.0565
EPOCH 5/150


Epoch [5/150]: 100%|██████████| 100/100 [00:05<00:00, 18.29it/s, loss=2.8]


Epoch Loss: 2.7814
EPOCH 6/150


Epoch [6/150]: 100%|██████████| 100/100 [00:05<00:00, 18.26it/s, loss=2]  


Epoch Loss: 2.5195
EPOCH 7/150


Epoch [7/150]: 100%|██████████| 100/100 [00:05<00:00, 18.39it/s, loss=2.19]


Epoch Loss: 2.3088
EPOCH 8/150


Epoch [8/150]: 100%|██████████| 100/100 [00:05<00:00, 18.45it/s, loss=1.08]


Epoch Loss: 2.0922
EPOCH 9/150


Epoch [9/150]: 100%|██████████| 100/100 [00:05<00:00, 18.53it/s, loss=2.09]


Epoch Loss: 1.8194
EPOCH 10/150


Epoch [10/150]: 100%|██████████| 100/100 [00:05<00:00, 18.52it/s, loss=1.7] 


Epoch Loss: 1.5632
EPOCH 11/150


Epoch [11/150]: 100%|██████████| 100/100 [00:05<00:00, 18.43it/s, loss=1.04]


Epoch Loss: 1.3622
EPOCH 12/150


Epoch [12/150]: 100%|██████████| 100/100 [00:05<00:00, 18.61it/s, loss=0.884]


Epoch Loss: 1.2631
EPOCH 13/150


Epoch [13/150]: 100%|██████████| 100/100 [00:05<00:00, 18.64it/s, loss=1.68]


Epoch Loss: 1.0297
EPOCH 14/150


Epoch [14/150]: 100%|██████████| 100/100 [00:05<00:00, 18.46it/s, loss=0.38]


Epoch Loss: 0.8659
EPOCH 15/150


Epoch [15/150]: 100%|██████████| 100/100 [00:05<00:00, 18.45it/s, loss=0.614]


Epoch Loss: 0.7211
EPOCH 16/150


Epoch [16/150]: 100%|██████████| 100/100 [00:05<00:00, 18.39it/s, loss=0.509]


Epoch Loss: 0.5779
EPOCH 17/150


Epoch [17/150]: 100%|██████████| 100/100 [00:05<00:00, 18.10it/s, loss=0.234]


Epoch Loss: 0.4707
EPOCH 18/150


Epoch [18/150]: 100%|██████████| 100/100 [00:05<00:00, 18.17it/s, loss=0.892]


Epoch Loss: 0.3789
EPOCH 19/150


Epoch [19/150]: 100%|██████████| 100/100 [00:05<00:00, 18.10it/s, loss=0.404]


Epoch Loss: 0.3379
EPOCH 20/150


Epoch [20/150]:  87%|████████▋ | 87/100 [00:04<00:00, 18.07it/s, loss=0.269] 


KeyboardInterrupt: 