In [1]:
import torch
import torch.nn as nn
import pandas as pd

# Dataset

In [3]:
df = pd.read_csv("Hindi_English_Truncated_Corpus.csv")
df = df.drop(columns=["source"])
df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [4]:
import re
from collections import Counter

In [5]:
dataset = list(zip(df["english_sentence"], df["hindi_sentence"]))
dataset[0]

('politicians do not have permission to do what needs to be done.',
 'राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .')

In [6]:
dataset = [(eng, hindi) for eng, hindi in dataset if isinstance(eng, str) and isinstance(hindi, str)]

## PreProcess

In [7]:
def tokenize(text):
    if not isinstance(text, str):
        return []
    text = text.lower()
    text = re.sub(r"[^a-záéíóúüñ¿¡\u0900-\u097F]+", " ", text)
    return text.strip().split()


def build_vocab(sentences, min_freq=2, specials=["<PAD>", "<SOS>", "<EOS>", "<UNK>"]):
    counter = Counter()
    for sent in sentences:
        counter.update(tokenize(sent))
    
    vocab = {tok: idx for idx, tok in enumerate(specials)}
    for word, freq in counter.items():
        if freq >= min_freq and word not in vocab:
            vocab[word] = len(vocab)
    return vocab 

In [8]:
src_sentences = [eng for eng, _ in dataset]
tgt_sentences = [hindi for _, hindi in dataset]

src_vocab = build_vocab(src_sentences)
tgt_vocab = build_vocab(tgt_sentences)

## Defining the Dataset Class

In [9]:
from torch.utils.data import Dataset

In [None]:
class TranslationDataset(Dataset):
    def __init__(self, dataset, src_vocab, tgt_vocab, src_max_len = 25, tgt_max_len = 25):
        self.dataset = dataset
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.src_max_len = src_max_len
        self.tgt_max_len = tgt_max_len

    def tokenize(self, text, is_target=False):
        if not isinstance(text, str):
            return []
        text = text.lower()
        if is_target:
            text = re.sub(r"[^a-záéíóúüñ¿¡\u0900-\u097F]+", " ", text)
        else:
            text = re.sub(r"[^a-záéíóúüñ¿¡]+", " ", text)
        return text.strip().split()

    def numericalize(self, text, vocab, max_len, add_sos_eos=False, is_target=False):
        tokens = self.tokenize(text, is_target=is_target)
        ids = [vocab.get(token, vocab["<UNK>"]) for token in tokens]

        if add_sos_eos:
            ids = [vocab["<SOS>"]] + ids + [vocab["<EOS>"]]

        ids = ids + [vocab["<PAD>"]] * (max_len - len(ids))
        return ids[:max_len]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        src_text, tgt_text = self.dataset[idx]
        src_ids = self.numericalize(src_text, self.src_vocab, self.src_max_len)
        tgt_ids = self.numericalize(tgt_text, self.tgt_vocab, self.tgt_max_len, add_sos_eos=True, is_target=True)
        return torch.tensor(src_ids), torch.tensor(tgt_ids)

# Defining the architefcture

## Encoder

In [11]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers = 1):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
    
    def forward(self, X):
        embedded_X = self.embeddings(X)
        outputs, (hidden, cell) = self.lstm(embedded_X)
        return outputs, (hidden, cell)

## Decoder


In [12]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, x, hidden, cell):
        x = x.unsqueeze(1)
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden, cell

## Seq2Seq forward pass

In [13]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, src, tgt):
        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        vocab_size = self.decoder.fc.out_features
        
        outputs = torch.zeros(batch_size, tgt_len, vocab_size).to(src.device)
            
        enc_outputs, (hidden, cell) = self.encoder(src)
        input_tok = tgt[:,0]
        
        for t in range(1, tgt_len):
            output, hidden, cell = self.decoder(input_tok, hidden, cell)
            outputs[:,t,:] = output
            input_tok = tgt[:,t]
        
        return outputs

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

embed_dim = 256
hidden_dim = 512
num_layers = 1
batch_size = 64
num_epochs = 30
learning_rate = 0.001
teacher_forcing_ratio = 0.5

In [15]:
max_len = 20
tdataset = TranslationDataset(dataset, src_vocab, tgt_vocab, src_max_len = max_len, tgt_max_len = max_len)
dataloader = DataLoader(tdataset, batch_size = batch_size, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
encoder = Encoder(len(src_vocab), embed_dim, hidden_dim, num_layers)
decoder = Decoder(len(tgt_vocab), embed_dim, hidden_dim, num_layers)
model = Seq2Seq(encoder, decoder).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab["<PAD>"])
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training

In [18]:
from tqdm import tqdm

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    
    for src_batch, tgt_batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)
        
        optimizer.zero_grad()
        output = model(src_batch, tgt_batch)
        
        output_dim = output.shape[-1]
        output = output[:, 1:, :].contiguous().view(-1, output_dim)
        tgt = tgt_batch[:,1:].reshape(-1)
        
        loss = criterion(output, tgt)
        loss.backward() 
        optimizer.step()
        
        epoch_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(dataloader):.4f}")

Epoch 1/30: 100%|██████████| 1994/1994 [10:38<00:00,  3.12it/s]


Epoch 1/30, Loss: 6.0284


Epoch 2/30: 100%|██████████| 1994/1994 [10:46<00:00,  3.08it/s]


Epoch 2/30, Loss: 4.5167


Epoch 3/30: 100%|██████████| 1994/1994 [10:38<00:00,  3.12it/s]


Epoch 3/30, Loss: 3.6125


Epoch 4/30: 100%|██████████| 1994/1994 [10:39<00:00,  3.12it/s]


Epoch 4/30, Loss: 3.0159


Epoch 5/30: 100%|██████████| 1994/1994 [11:11<00:00,  2.97it/s]


Epoch 5/30, Loss: 2.5755


Epoch 6/30: 100%|██████████| 1994/1994 [11:05<00:00,  3.00it/s]


Epoch 6/30, Loss: 2.2342


Epoch 7/30: 100%|██████████| 1994/1994 [11:14<00:00,  2.96it/s]


Epoch 7/30, Loss: 1.9599


Epoch 8/30: 100%|██████████| 1994/1994 [11:17<00:00,  2.94it/s]


Epoch 8/30, Loss: 1.7351


Epoch 9/30: 100%|██████████| 1994/1994 [11:17<00:00,  2.94it/s]


Epoch 9/30, Loss: 1.5457


Epoch 10/30: 100%|██████████| 1994/1994 [11:13<00:00,  2.96it/s]


Epoch 10/30, Loss: 1.3864


Epoch 11/30: 100%|██████████| 1994/1994 [11:14<00:00,  2.95it/s]


Epoch 11/30, Loss: 1.2507


Epoch 12/30: 100%|██████████| 1994/1994 [10:57<00:00,  3.03it/s]


Epoch 12/30, Loss: 1.1337


Epoch 13/30: 100%|██████████| 1994/1994 [10:57<00:00,  3.03it/s]


Epoch 13/30, Loss: 1.0339


Epoch 14/30: 100%|██████████| 1994/1994 [10:56<00:00,  3.04it/s]


Epoch 14/30, Loss: 0.9462


Epoch 15/30: 100%|██████████| 1994/1994 [10:57<00:00,  3.03it/s]


Epoch 15/30, Loss: 0.8723


Epoch 16/30: 100%|██████████| 1994/1994 [10:55<00:00,  3.04it/s]


Epoch 16/30, Loss: 0.8055


Epoch 17/30: 100%|██████████| 1994/1994 [10:56<00:00,  3.04it/s]


Epoch 17/30, Loss: 0.7503


Epoch 18/30: 100%|██████████| 1994/1994 [10:55<00:00,  3.04it/s]


Epoch 18/30, Loss: 0.7012


Epoch 19/30: 100%|██████████| 1994/1994 [10:56<00:00,  3.04it/s]


Epoch 19/30, Loss: 0.6578


Epoch 20/30: 100%|██████████| 1994/1994 [10:56<00:00,  3.04it/s]


Epoch 20/30, Loss: 0.6201


Epoch 21/30: 100%|██████████| 1994/1994 [10:55<00:00,  3.04it/s]


Epoch 21/30, Loss: 0.5876


Epoch 22/30: 100%|██████████| 1994/1994 [10:55<00:00,  3.04it/s]


Epoch 22/30, Loss: 0.5587


Epoch 23/30: 100%|██████████| 1994/1994 [10:55<00:00,  3.04it/s]


Epoch 23/30, Loss: 0.5327


Epoch 24/30: 100%|██████████| 1994/1994 [10:56<00:00,  3.04it/s]


Epoch 24/30, Loss: 0.5103


Epoch 25/30: 100%|██████████| 1994/1994 [10:58<00:00,  3.03it/s]


Epoch 25/30, Loss: 0.4907


Epoch 26/30: 100%|██████████| 1994/1994 [10:56<00:00,  3.04it/s]


Epoch 26/30, Loss: 0.4725


Epoch 27/30: 100%|██████████| 1994/1994 [10:57<00:00,  3.03it/s]


Epoch 27/30, Loss: 0.4561


Epoch 28/30: 100%|██████████| 1994/1994 [10:56<00:00,  3.04it/s]


Epoch 28/30, Loss: 0.4416


Epoch 29/30: 100%|██████████| 1994/1994 [10:56<00:00,  3.04it/s]


Epoch 29/30, Loss: 0.4309


Epoch 30/30: 100%|██████████| 1994/1994 [10:56<00:00,  3.04it/s]

Epoch 30/30, Loss: 0.4183





# BLEU score calculation

In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from tqdm import tqdm
import random
import torch

def calculate_bleu(model, dataset, src_vocab, tgt_vocab, max_len, sample_size=5, subset_size=500, device="cpu"):
    model.eval()
    candidates = []
    references = []

    inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}
    smooth_fn = SmoothingFunction().method1

    subset = random.sample(dataset, min(subset_size, len(dataset)))

    for eng, hindi in tqdm(subset, desc="Calculating BLEU"):
        src_tokens = eng.lower().split()
        src_ids = [src_vocab.get(t, src_vocab["<UNK>"]) for t in src_tokens]
        src_ids = src_ids[:max_len]
        src_ids = src_ids + [src_vocab["<PAD>"]] * (max_len - len(src_ids))
        src = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device)

        with torch.no_grad():
            encoder_outputs, (hidden, cell) = model.encoder(src)
            input_tok = torch.tensor([tgt_vocab["<SOS>"]], dtype=torch.long, device=device)

            result = []
            for _ in range(max_len):
                output, hidden, cell = model.decoder(input_tok, hidden, cell)

                next_tok = output.argmax(1).item()

                if next_tok in (tgt_vocab["<EOS>"], tgt_vocab["<PAD>"]):
                    break

                result.append(next_tok)
                input_tok = torch.tensor([next_tok], dtype=torch.long, device=device)


        pred_sentence = [inv_tgt_vocab[i] for i in result if i in inv_tgt_vocab]
        candidates.append(pred_sentence)
        ref_tokens = hindi.strip().split()

        references.append([ref_tokens])

    bleu_score = corpus_bleu(references, candidates, smoothing_function=smooth_fn)

    # Print sample predictions
    print(f"\nSample predictions ({sample_size}):\n")
    for i in range(min(sample_size, len(subset))):
        eng, hindi = subset[i]
        pred = " ".join(candidates[i])
        print(f"ENG: {eng}")
        print(f"REF: {hindi}")
        print(f"PRED: {pred}")
        print("-" * 40)

    return bleu_score


In [24]:
bleu = calculate_bleu(model, dataset, src_vocab, tgt_vocab, max_len, sample_size=500, device = device)
print("BLEU score:", bleu)  

Calculating BLEU: 100%|██████████| 500/500 [00:04<00:00, 120.37it/s]



Sample predictions (500):

ENG: The Chinese script is one of the oldest written script in the world and is still in use today and has been the source of many inventions.
REF: चीन की लिखित भाषा प्रणाली विश्व की सबसे पुरानी है जो आज तक उपयोग में लायी जा रही है और जो कई आविष्कारों का स्रोत भी है।
PRED: चीन की लिखित भाषा प्रणाली विश्व की सबसे पुरानी है जो आज तक उपयोग में लायी जा रही है लेखक
----------------------------------------
ENG: teachers and students, and I like this analogy.
REF: शिक्षक और छात्र, और मुझे यह उपमा पसंद है.
PRED: शिक्षक और अध्यापकों ने इसे मैं चुना ।
----------------------------------------
ENG: Historically it has been proved that every Kuran available on earth is the copy of original which is appeared in Hajarat Muhammad (Sall.).
REF: ऐतिहासिक रूप से यह सिद्ध हो चुका है कि इस धरती पर उपस्थित हर क़ुरान की प्रति वही मूल प्रति का प्रतिरूप है जो हज़रत मुहम्मद (सल्ल.) ‎पर अवतरित हुई थी।
PRED: ऐतिहासिक रूप से यह सिद्ध हो चुका है कि इस धरती पर उपस्थित हर क़ुरान की प्रति व

In [22]:
src_batch, tgt_batch = next(iter(dataloader))
print(src_batch.shape, tgt_batch.shape)
tgt_batch[0]

torch.Size([64, 20]) torch.Size([64, 20])


tensor([    1, 12198, 22170, 22171, 10919,  2117,     5,  2180,     2,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])