In [1]:
import torch
import torch.nn as nn
import pandas as pd

In [3]:
df = pd.read_csv("Hindi_English_Truncated_Corpus.csv")
df = df.drop(columns=["source"])
df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [4]:
import re
from collections import Counter

In [6]:
dataset = list(zip(df["english_sentence"], df["hindi_sentence"]))
dataset[0]

('politicians do not have permission to do what needs to be done.',
 'राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .')

In [7]:
dataset = [(eng, hindi) for eng, hindi in dataset if isinstance(eng, str) and isinstance(hindi, str)]

In [8]:
def tokenize(text):
    if not isinstance(text, str):
        return []
    text = text.lower()
    text = re.sub(r"[^a-záéíóúüñ¿¡\u0900-\u097F]+", " ", text)
    return text.strip().split()


def build_vocab(sentences, min_freq=2, specials=["<PAD>", "<SOS>", "<EOS>", "<UNK>"]):
    counter = Counter()
    for sent in sentences:
        counter.update(tokenize(sent))
    
    vocab = {tok: idx for idx, tok in enumerate(specials)}
    for word, freq in counter.items():
        if freq >= min_freq and word not in vocab:
            vocab[word] = len(vocab)
    return vocab 

In [9]:
src_sentences = [eng for eng, _ in dataset]
tgt_sentences = [hindi for _, hindi in dataset]

src_vocab = build_vocab(src_sentences)
tgt_vocab = build_vocab(tgt_sentences)

In [10]:
from torch.utils.data import Dataset

In [None]:
class TranslationDataset(Dataset):
    def __init__(self, dataset, src_vocab, tgt_vocab, src_max_len = 25, tgt_max_len = 25):
        self.dataset = dataset
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.src_max_len = src_max_len
        self.tgt_max_len = tgt_max_len

    def tokenize(self, text, is_target=False):
        if not isinstance(text, str):
            return []
        text = text.lower()
        if is_target:
            text = re.sub(r"[^a-záéíóúüñ¿¡\u0900-\u097F]+", " ", text)
        else:
            text = re.sub(r"[^a-záéíóúüñ¿¡]+", " ", text)
        return text.strip().split()

    def numericalize(self, text, vocab, max_len, add_sos_eos=False, is_target=False):
        tokens = self.tokenize(text, is_target=is_target)
        ids = [vocab.get(token, vocab["<UNK>"]) for token in tokens]

        if add_sos_eos:
            ids = [vocab["<SOS>"]] + ids + [vocab["<EOS>"]]

        ids = ids + [vocab["<PAD>"]] * (max_len - len(ids))
        return ids[:max_len]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        src_text, tgt_text = self.dataset[idx]
        src_ids = self.numericalize(src_text, self.src_vocab, self.src_max_len)
        tgt_ids = self.numericalize(tgt_text, self.tgt_vocab, self.tgt_max_len, add_sos_eos=True, is_target=True)
        return torch.tensor(src_ids), torch.tensor(tgt_ids)

In [12]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embeddings = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
    
    def forward(self, X):
        embedded_X = self.embeddings(X) 
        outputs, (hidden, cell) = self.lstm(embedded_X)
        return outputs, (hidden, cell)

In [13]:
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim*2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias = False)
    
    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.size(1)

        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)

        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2)))
        attention = self.v(energy).squeeze(2)

        return torch.softmax(attention, dim = 1)

In [None]:
class DecoderWithAttention(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim + hidden_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.attention = Attention(hidden_dim)
    
    def forward(self, x, hidden, cell, encoder_outputs):
        x = x.unsqueeze(1)
        embedded = self.embedding(x) 

        attn_weights = self.attention(hidden[-1], encoder_outputs)
        attn_weights = attn_weights.unsqueeze(1)

        weighted_sum = torch.bmm(attn_weights, encoder_outputs)

        lstm_input = torch.cat((embedded, weighted_sum), dim=2)
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))

        output = torch.cat((output.squeeze(1), weighted_sum.squeeze(1)), dim=1)
        prediction = self.fc(output)
        
        return prediction, hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, src, tgt):
        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        vocab_size = self.decoder.fc.out_features
        
        outputs = torch.zeros(batch_size, tgt_len, vocab_size).to(src.device)
        
        enc_outputs, (hidden, cell) = self.encoder(src)
        input_tok = tgt[:, 0]
        
        for t in range(1, tgt_len):
            output, hidden, cell = self.decoder(input_tok, hidden, cell, enc_outputs)
            outputs[:, t, :] = output
            input_tok = tgt[:, t]
        return outputs


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

embed_dim = 256
hidden_dim = 512
num_layers = 1
batch_size = 64
num_epochs = 30
learning_rate = 0.001
teacher_forcing_ratio = 0.5

In [17]:
max_len = 20
tdataset = TranslationDataset(dataset, src_vocab, tgt_vocab, src_max_len = max_len, tgt_max_len = max_len)
dataloader = DataLoader(tdataset, batch_size = batch_size, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
encoder = Encoder(len(src_vocab), embed_dim, hidden_dim, num_layers)
decoder = DecoderWithAttention(len(tgt_vocab), embed_dim, hidden_dim, num_layers)
model = Seq2Seq(encoder, decoder).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab["<PAD>"])
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [19]:
from tqdm import tqdm

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    
    for src_batch, tgt_batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)
        
        optimizer.zero_grad()
        output = model(src_batch, tgt_batch)
        
        output_dim = output.shape[-1]
        output = output[:, 1:, :].contiguous().view(-1, output_dim)
        tgt = tgt_batch[:,1:].reshape(-1)
        
        loss = criterion(output, tgt)
        loss.backward() 
        optimizer.step()
        
        epoch_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(dataloader):.4f}")

Epoch 1/30: 100%|██████████| 1994/1994 [14:14<00:00,  2.33it/s]


Epoch 1/30, Loss: 5.4137


Epoch 2/30: 100%|██████████| 1994/1994 [14:31<00:00,  2.29it/s]


Epoch 2/30, Loss: 3.4787


Epoch 3/30: 100%|██████████| 1994/1994 [14:27<00:00,  2.30it/s]


Epoch 3/30, Loss: 2.5139


Epoch 4/30: 100%|██████████| 1994/1994 [15:10<00:00,  2.19it/s]


Epoch 4/30, Loss: 1.9434


Epoch 5/30: 100%|██████████| 1994/1994 [15:08<00:00,  2.20it/s]


Epoch 5/30, Loss: 1.5706


Epoch 6/30: 100%|██████████| 1994/1994 [15:20<00:00,  2.17it/s]


Epoch 6/30, Loss: 1.2856


Epoch 7/30: 100%|██████████| 1994/1994 [15:16<00:00,  2.18it/s]


Epoch 7/30, Loss: 1.0650


Epoch 8/30: 100%|██████████| 1994/1994 [15:25<00:00,  2.15it/s]


Epoch 8/30, Loss: 0.8911


Epoch 9/30: 100%|██████████| 1994/1994 [14:51<00:00,  2.24it/s]


Epoch 9/30, Loss: 0.7555


Epoch 10/30: 100%|██████████| 1994/1994 [14:50<00:00,  2.24it/s]


Epoch 10/30, Loss: 0.6500


Epoch 11/30: 100%|██████████| 1994/1994 [14:50<00:00,  2.24it/s]


Epoch 11/30, Loss: 0.5671


Epoch 12/30: 100%|██████████| 1994/1994 [14:49<00:00,  2.24it/s]


Epoch 12/30, Loss: 0.5003


Epoch 13/30: 100%|██████████| 1994/1994 [14:49<00:00,  2.24it/s]


Epoch 13/30, Loss: 0.4493


Epoch 14/30: 100%|██████████| 1994/1994 [14:50<00:00,  2.24it/s]


Epoch 14/30, Loss: 0.4087


Epoch 15/30: 100%|██████████| 1994/1994 [14:49<00:00,  2.24it/s]


Epoch 15/30, Loss: 0.3778


Epoch 16/30: 100%|██████████| 1994/1994 [14:48<00:00,  2.24it/s]


Epoch 16/30, Loss: 0.3493


Epoch 17/30: 100%|██████████| 1994/1994 [14:47<00:00,  2.25it/s]


Epoch 17/30, Loss: 0.3290


Epoch 18/30: 100%|██████████| 1994/1994 [14:50<00:00,  2.24it/s]


Epoch 18/30, Loss: 0.3124


Epoch 19/30: 100%|██████████| 1994/1994 [14:49<00:00,  2.24it/s]


Epoch 19/30, Loss: 0.2995


Epoch 20/30: 100%|██████████| 1994/1994 [14:50<00:00,  2.24it/s]


Epoch 20/30, Loss: 0.2841


Epoch 21/30: 100%|██████████| 1994/1994 [14:50<00:00,  2.24it/s]


Epoch 21/30, Loss: 0.2744


Epoch 22/30: 100%|██████████| 1994/1994 [14:49<00:00,  2.24it/s]


Epoch 22/30, Loss: 0.2682


Epoch 23/30: 100%|██████████| 1994/1994 [09:52<00:00,  3.36it/s]


Epoch 23/30, Loss: 0.2588


Epoch 24/30: 100%|██████████| 1994/1994 [08:36<00:00,  3.86it/s]


Epoch 24/30, Loss: 0.2529


Epoch 25/30: 100%|██████████| 1994/1994 [08:38<00:00,  3.85it/s]


Epoch 25/30, Loss: 0.2482


Epoch 26/30: 100%|██████████| 1994/1994 [08:39<00:00,  3.84it/s]


Epoch 26/30, Loss: 0.2442


Epoch 27/30: 100%|██████████| 1994/1994 [08:38<00:00,  3.84it/s]


Epoch 27/30, Loss: 0.2388


Epoch 28/30: 100%|██████████| 1994/1994 [08:37<00:00,  3.85it/s]


Epoch 28/30, Loss: 0.2347


Epoch 29/30: 100%|██████████| 1994/1994 [08:36<00:00,  3.86it/s]


Epoch 29/30, Loss: 0.2312


Epoch 30/30: 100%|██████████| 1994/1994 [08:35<00:00,  3.86it/s]

Epoch 30/30, Loss: 0.2298





In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from tqdm import tqdm
import random
import torch

def calculate_bleu(model, dataset, src_vocab, tgt_vocab, max_len, sample_size=5, subset_size=500, device="cpu"):
    model.eval()
    candidates = []
    references = []

    inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}
    smooth_fn = SmoothingFunction().method1

    subset = random.sample(dataset, min(subset_size, len(dataset)))

    for eng, hindi in tqdm(subset, desc="Calculating BLEU"):
        src_tokens = eng.lower().split()
        src_ids = [src_vocab.get(t, src_vocab["<UNK>"]) for t in src_tokens]
        src_ids = src_ids[:max_len]
        src_ids = src_ids + [src_vocab["<PAD>"]] * (max_len - len(src_ids))
        src = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device)

        with torch.no_grad():
            encoder_outputs, (hidden, cell) = model.encoder(src)
            input_tok = torch.tensor([tgt_vocab["<SOS>"]], dtype=torch.long, device=device)

            result = []
            for _ in range(max_len):
                output, hidden, cell = model.decoder(input_tok, hidden, cell, encoder_outputs)

                next_tok = output.argmax(1).item()

                if next_tok in (tgt_vocab["<EOS>"], tgt_vocab["<PAD>"]):
                    break

                result.append(next_tok)
                input_tok = torch.tensor([next_tok], dtype=torch.long, device=device)

        pred_sentence = [inv_tgt_vocab[i] for i in result if i in inv_tgt_vocab]
        candidates.append(pred_sentence)

        ref_tokens = hindi.strip().split()
        references.append([ref_tokens])

    bleu_score = corpus_bleu(references, candidates, smoothing_function=smooth_fn)

    print(f"\nSample predictions ({sample_size}):\n")
    for i in range(min(sample_size, len(subset))):
        eng, hindi = subset[i]
        pred = " ".join(candidates[i])
        print(f"ENG: {eng}")
        print(f"REF: {hindi}")
        print(f"PRED: {pred}")
        print("-" * 40)

    return bleu_score


In [27]:
bleu = calculate_bleu(model, dataset, src_vocab, tgt_vocab, max_len, sample_size=500, device = device)
print("BLEU score:", bleu)  

Calculating BLEU: 100%|██████████| 500/500 [00:07<00:00, 62.81it/s]


Sample predictions (500):

ENG: is that feeling and reality get out of whack,
REF: जब अचानक अहसास और सच्चाई बाहर आते हैं
PRED: की ये अहसास और सच्चाई को महसूस करने लगा है
----------------------------------------
ENG: But the hunger for mother 's affection , never appeased in childhood , was to survive in the son as a recurring longing for feminine affection and care .
REF: मां के स्नेह की भूख , जो कि बालक रवि के बचपन में कभी मिट नहीं पाई , और जो स्त्रियों के प्रति अनुराग और स्नेह की आकांक्षा के रूप में बार बार प्रकट होती रही .
PRED: लेकिन माता के गर्भ में भूख की कमी के लिए उनकी आस्था <UNK> और कभी कोई मामूली नहीं <UNK> क्योंकि
----------------------------------------
ENG: This is clear from the cases found in the Akhabararrat -LRB- provincial -RRB- of Ajmer of the 46th year of Aurangzeb 's reign , discovered in the Deccan .
REF: यह बात दक्षिण से प्राप्त , औरंगजेब के शासन के 46वें वर्ष के अजमेर सूबे के अखबारात से पता चलती है .
PRED: यह साफ जानकारी <UNK> <UNK> के <UNK> के द्वारा जून की भू


