## Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


## Using device

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
#device='cpu'

Using device: cuda


## Load Custom CSV Dataset

In [3]:
from pathlib import Path
root=Path("data")
root.mkdir(exist_ok=True)
path= root / "hindi_english_parallel.csv"
data = pd.read_csv(path)

In [4]:
df=data.sample(5050,random_state=42)

In [5]:
df=df.dropna(axis=0)

In [6]:
df=df.drop_duplicates()

In [7]:
len(df)

5007

In [8]:
src_texts = df.iloc[:, 1].astype(str).str.lower().tolist()
tgt_texts = df.iloc[:, 0].astype(str).tolist()

In [9]:
tgt_texts[:10]

['बडे पैमाने पर सुनामी से प्रभावीत जापान में 4 दिनो बाद कोई अभी तक जिंदा होने की आशाएँ लुप्त हो रही थी। ',
 'वर्ग का पूर्णा क्या था? ',
 'मैं अपना काम कर चुका हूँ। ',
 'राष्ट्रीय मनः स्वास्थ्य कार्यक्रम',
 'क्रियावली',
 'मुद्रास्फीति की दर प्रत्यक्ष रूप से उपरली डिग्री में है। ',
 'वाहिका में रूधिर अवपंक रक्त प्रवाह में अवरोध का कारण है',
 'उपकरण',
 'हरकत-उल-जिहाद-ए-इस्लामी',
 'URL …']

In [10]:
src_texts[:10]

['4 days after the massive tsunami struck japan, hopes of finding anyone still alive were fading.',
 'what was completing the square?',
 'i have already done my work.',
 'national mental health programme',
 'menu',
 'the inflation rate is apparently in the ascending degree.',
 'a sludge of blood in the vessel causes absruction to blood flow.',
 'device',
 'harkat - ul - jihad al - islami',
 'url …']

## Add `<SOS>` and `<EOS>` to TARGET

In [11]:
# Add special tokens
src_texts = [f"<sos> {text} <eos>" for text in src_texts]
tgt_texts = [f"<sos> {text} <eos>" for text in tgt_texts]


In [12]:
src_texts[:10]

['<sos> 4 days after the massive tsunami struck japan, hopes of finding anyone still alive were fading. <eos>',
 '<sos> what was completing the square? <eos>',
 '<sos> i have already done my work. <eos>',
 '<sos> national mental health programme <eos>',
 '<sos> menu <eos>',
 '<sos> the inflation rate is apparently in the ascending degree. <eos>',
 '<sos> a sludge of blood in the vessel causes absruction to blood flow. <eos>',
 '<sos> device <eos>',
 '<sos> harkat - ul - jihad al - islami <eos>',
 '<sos> url … <eos>']

In [13]:
tgt_texts[:10]

['<sos> बडे पैमाने पर सुनामी से प्रभावीत जापान में 4 दिनो बाद कोई अभी तक जिंदा होने की आशाएँ लुप्त हो रही थी।  <eos>',
 '<sos> वर्ग का पूर्णा क्या था?  <eos>',
 '<sos> मैं अपना काम कर चुका हूँ।  <eos>',
 '<sos> राष्ट्रीय मनः स्वास्थ्य कार्यक्रम <eos>',
 '<sos> क्रियावली <eos>',
 '<sos> मुद्रास्फीति की दर प्रत्यक्ष रूप से उपरली डिग्री में है।  <eos>',
 '<sos> वाहिका में रूधिर अवपंक रक्त प्रवाह में अवरोध का कारण है <eos>',
 '<sos> उपकरण <eos>',
 '<sos> हरकत-उल-जिहाद-ए-इस्लामी <eos>',
 '<sos> URL … <eos>']

## Tokenization (Keras)

In [14]:
src_tokenizer = Tokenizer(filters='0123456789!"#$%&\'()*+,-./:;=?@[\\]^_`{|}~', oov_token="<unk>")
tgt_tokenizer = Tokenizer(filters='0123456789!"#$%&\'()*+,-./:;=?@[\\]^_`{|}~', oov_token="<unk>")

# Force <pad> to exist
src_tokenizer.fit_on_texts(["<pad>"] + src_texts)
tgt_tokenizer.fit_on_texts(["<pad>"] + tgt_texts)


In [15]:
src_tokenizer

<keras.src.legacy.preprocessing.text.Tokenizer at 0x27403701a90>

In [16]:
tgt_tokenizer.word_index["<pad>"]

5022

In [17]:
src_tokenizer.word_index["<pad>"]

4638

In [18]:
tgt_tokenizer.word_index["<pad>"]=0

In [19]:
src_tokenizer.word_index["<pad>"]=0

In [20]:
tgt_tokenizer.word_index["<pad>"]

0

In [21]:
src_tokenizer.word_index["<pad>"]

0

## Convert to Sequences + Padding

In [22]:
MAX_SRC_LEN = max(len(seq) for seq in tgt_texts)
MAX_TGT_LEN = max(len(seq) for seq in tgt_texts)

src_seq = src_tokenizer.texts_to_sequences(src_texts)
tgt_seq = tgt_tokenizer.texts_to_sequences(tgt_texts)

src_padded = pad_sequences(src_seq, maxlen=MAX_SRC_LEN, padding="post")
tgt_padded = pad_sequences(tgt_seq, maxlen=MAX_TGT_LEN, padding="post")
tgt_tokenizer.word_index["<pad>"]

0

In [23]:
tgt_seq[:5]

[[2,
  2356,
  2357,
  14,
  5023,
  9,
  5024,
  1844,
  6,
  5025,
  78,
  38,
  307,
  55,
  5026,
  94,
  8,
  5027,
  5028,
  22,
  178,
  208,
  3],
 [2, 665, 11, 5029, 45, 47, 3],
 [2, 70, 131, 111, 28, 439, 564, 3],
 [2, 156, 5030, 412, 375, 3],
 [2, 5031, 3]]

In [24]:
src_seq[:5]

[[2,
  317,
  72,
  4,
  4639,
  4640,
  4641,
  1709,
  2196,
  5,
  1710,
  849,
  191,
  1404,
  56,
  4642,
  3],
 [2, 52, 31, 4643, 4, 1160, 3],
 [2, 39, 23, 288, 399, 87, 152, 3],
 [2, 120, 1711, 400, 466, 3],
 [2, 1405, 3]]

In [25]:
src_padded[:10]

array([[   2,  317,   72, ...,    0,    0,    0],
       [   2,   52,   31, ...,    0,    0,    0],
       [   2,   39,   23, ...,    0,    0,    0],
       ...,
       [   2, 1161,    3, ...,    0,    0,    0],
       [   2, 4647, 3014, ...,    0,    0,    0],
       [   2,  755, 3016, ...,    0,    0,    0]], dtype=int32)

In [26]:
tgt_padded[:10]

array([[   2, 2356, 2357, ...,    0,    0,    0],
       [   2,  665,   11, ...,    0,    0,    0],
       [   2,   70,  131, ...,    0,    0,    0],
       ...,
       [   2, 1129,    3, ...,    0,    0,    0],
       [   2, 5037, 2360, ...,    0,    0,    0],
       [   2, 3205, 2361, ...,    0,    0,    0]], dtype=int32)

## Vocabulary Sizes + Special Indices

In [27]:
SRC_VOCAB_SIZE = len(src_tokenizer.word_index) + 1
TGT_VOCAB_SIZE = len(tgt_tokenizer.word_index) + 1

PAD_IDX = tgt_tokenizer.word_index["<pad>"]
SOS_IDX = tgt_tokenizer.word_index["<sos>"]
EOS_IDX = tgt_tokenizer.word_index["<eos>"]


In [28]:
#src_tokenizer.word_index

In [29]:
#tgt_tokenizer.word_index

## Custom Dataset Class

In [30]:
class TranslationDataset(Dataset):
    def __init__(self, src, tgt):
        self.src = torch.tensor(src, dtype=torch.long)
        self.tgt = torch.tensor(tgt, dtype=torch.long)

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        return self.src[idx], self.tgt[idx]


## DataLoader

In [31]:
dataset = TranslationDataset(src_padded, tgt_padded)

loader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True
)


## Encoder Decoder classes

In [32]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_IDX)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=3, batch_first=True)
 
    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell


In [33]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_IDX)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=3, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden, cell):
        x = x.unsqueeze(1)  # (batch, 1)
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden, cell


In [34]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, tgt_len - 1, vocab_size).to(device)

        hidden, cell = self.encoder(src)

        input = tgt[:, 0]  # <sos>

        for t in range(1, tgt_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t - 1] = output
            input = tgt[:, t]  # teacher forcing

        return outputs


## model initialzation

In [35]:
EMBED_DIM = 100
HIDDEN_DIM = 100


In [36]:
encoder = Encoder(SRC_VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM).to(device)
decoder = Decoder(TGT_VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM).to(device)

model = Seq2Seq(encoder, decoder).to(device)

In [37]:
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(11062, 100, padding_idx=0)
    (lstm): LSTM(100, 100, num_layers=3, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(13221, 100, padding_idx=0)
    (lstm): LSTM(100, 100, num_layers=3, batch_first=True)
    (fc): Linear(in_features=100, out_features=13221, bias=True)
  )
)

## training

In [38]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [39]:
EPOCHS = 2

In [40]:
%%time
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    i=0

    for src_batch, tgt_batch in loader:
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)

        optimizer.zero_grad()

        outputs = model(src_batch, tgt_batch)

        loss = criterion(
            outputs.reshape(-1, outputs.shape[-1]),
            tgt_batch[:, 1:].reshape(-1)
        )

        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        i+=1
        if i % 100 == 0:
            print(i,'/',320)

    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {total_loss:.4f}")


20 / 320
40 / 320
60 / 320
80 / 320
100 / 320
120 / 320
140 / 320
160 / 320
180 / 320
200 / 320
220 / 320
240 / 320
260 / 320
280 / 320
300 / 320
Epoch 1/2 | Loss: 2364.3392
20 / 320
40 / 320
60 / 320
80 / 320
100 / 320
120 / 320
140 / 320
160 / 320
180 / 320
200 / 320
220 / 320
240 / 320
260 / 320
280 / 320
300 / 320
Epoch 2/2 | Loss: 2203.7608
CPU times: total: 50min 29s
Wall time: 4h 4min 16s


## pridiction functiion

In [79]:
def translate(sentence, max_len=100):
    encoder.eval()
    decoder.eval()

    with torch.no_grad():
        seq = src_tokenizer.texts_to_sequences(
            [f"<sos> {sentence.lower()} <eos>"]
        )
        seq = pad_sequences(seq, maxlen=MAX_SRC_LEN, padding="post")
        src = torch.tensor(seq, dtype=torch.long).to(device)

        hidden, cell = encoder(src)

        input_token = torch.tensor([SOS_IDX], dtype=torch.long).to(device)
        result = []

        for _ in range(max_len):
            output, hidden, cell = decoder(input_token, hidden, cell)

            token = output.argmax(dim=1).item()
            #print(token,tgt_tokenizer.index_word[token])

            # ---- STOP condition ----
            #if token == EOS_IDX:
                #break

            # ---- SKIP PAD safely ----
            if token == PAD_IDX:
                input_token = torch.tensor([SOS_IDX], dtype=torch.long).to(device)
                #print('pad')
                continue

            result.append(tgt_tokenizer.index_word[token])
            input_token = torch.tensor([token], dtype=torch.long).to(device)

    return " ".join(result)


In [80]:
translate("the inflation rate is apparently in the ascending degree")

'और <eos> <eos> के के के लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए लिए और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और'

## Save model

In [81]:
from pathlib import Path
root=Path("model")
root.mkdir(exist_ok=True)
path= root / "eng_to_hindi_encoder_decoder.pth"
torch.save(model,path)

In [83]:
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json
import os

os.makedirs("model", exist_ok=True)

tokenizer_json = src_tokenizer.to_json()

with open("model/src_tokenizer.json", "w", encoding="utf-8") as f:
    f.write(tokenizer_json)

tgt_tokenizer_json = tgt_tokenizer.to_json()

with open("model/tgt_tokenizer.json", "w", encoding="utf-8") as f:
    f.write(tgt_tokenizer_json)


# Development : more training

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
#device='cpu'

Using device: cuda


In [3]:
MAX_SRC_LEN=1063
#by first block

In [4]:
MAX_TGT_LEN =1063
#by first block

In [5]:
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json

with open("model/src_tokenizer.json", "r", encoding="utf-8") as f:
    src_tokenizer = tokenizer_from_json(f.read())

with open("model/tgt_tokenizer.json", "r", encoding="utf-8") as f:
    tgt_tokenizer = tokenizer_from_json(f.read())


In [6]:
SRC_VOCAB_SIZE = len(src_tokenizer.word_index) + 1
TGT_VOCAB_SIZE = len(tgt_tokenizer.word_index) + 1

PAD_IDX = tgt_tokenizer.word_index["<pad>"]
SOS_IDX = tgt_tokenizer.word_index["<sos>"]
EOS_IDX = tgt_tokenizer.word_index["<eos>"]


## Load model

In [7]:
EMBED_DIM = 100
HIDDEN_DIM = 100

In [8]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_IDX)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=3, batch_first=True)
 
    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell


In [9]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_IDX)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=3, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden, cell):
        x = x.unsqueeze(1)  # (batch, 1)
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden, cell


In [10]:
PAD_IDX=0

In [11]:
encoder = Encoder(SRC_VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM).to(device)
decoder = Decoder(TGT_VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM).to(device)

In [12]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, tgt_len - 1, vocab_size).to(device)

        hidden, cell = self.encoder(src)

        input = tgt[:, 0]  # <sos>

        for t in range(1, tgt_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t - 1] = output
            input = tgt[:, t]  # teacher forcing

        return outputs


In [13]:
from pathlib import Path
import torch

model_dir = Path("model")
load_path = model_dir / "eng_to_hindi_encoder_decoder.pth"

# tell torch to allow full unpickling (risky if file is untrusted)
model = torch.load(load_path, weights_only=False)
model.eval()


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(11062, 100, padding_idx=0)
    (lstm): LSTM(100, 100, num_layers=3, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(13221, 100, padding_idx=0)
    (lstm): LSTM(100, 100, num_layers=3, batch_first=True)
    (fc): Linear(in_features=100, out_features=13221, bias=True)
  )
)

## pridiction functiion

In [14]:
def translate(sentence, max_len=100):
    encoder.eval()
    decoder.eval()

    with torch.no_grad():
        seq = src_tokenizer.texts_to_sequences(
            [f"<sos> {sentence.lower()} <eos>"]
        )
        seq = pad_sequences(seq, maxlen=MAX_SRC_LEN, padding="post")
        src = torch.tensor(seq, dtype=torch.long).to(device)

        hidden, cell = encoder(src)

        input_token = torch.tensor([SOS_IDX], dtype=torch.long).to(device)
        result = []

        for _ in range(max_len):
            output, hidden, cell = decoder(input_token, hidden, cell)

            token = output.argmax(dim=1).item()
            #print(token,tgt_tokenizer.index_word[token])

            # ---- STOP condition ----
            #if token == EOS_IDX:
                #break

            # ---- SKIP PAD safely ----
            if token == PAD_IDX:
                input_token = torch.tensor([SOS_IDX], dtype=torch.long).to(device)
                #print('pad')
                continue

            result.append(tgt_tokenizer.index_word[token])
            input_token = torch.tensor([token], dtype=torch.long).to(device)

    return " ".join(result)


In [15]:
translate("the inflation rate is apparently in the ascending degree")

'गुर्दे अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी अनुषंगी'

## Retrain

In [None]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)


In [None]:
EPOCHS = 2

In [None]:
%%time
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    i=0

    for src_batch, tgt_batch in loader:
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)

        optimizer.zero_grad()

        outputs = model(src_batch, tgt_batch)

        loss = criterion(
            outputs.reshape(-1, outputs.shape[-1]),
            tgt_batch[:, 1:].reshape(-1)
        )

        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        i+=1
        if i % 20 == 0:
            print(i,'/',320)

    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {total_loss:.4f}")
