In [None]:
import os
import random

import kagglehub
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
import kagglehub

# Ask kagglehub for the latest version of the English-French translation
# dataset. `path` is the local directory that holds the dataset files.
path = kagglehub.dataset_download(
    "devicharith/language-translation-englishfrench"
)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'language-translation-englishfrench' dataset.
Path to dataset files: /kaggle/input/language-translation-englishfrench


In [None]:
file_path = os.path.join(path, "eng_-french.csv")

# Load the corpus as a list of (source, target) string tuples -- the shape
# that `make_tensors` unpacks with `for src, tgt in pairs`. Plain
# `readlines()` would give undivided text lines that cannot be unpacked
# into two sentences.
# NOTE(review): assumes the first row is a header (pandas' default) and that
# the first two columns are English then French -- confirm against the file.
pairs = list(
    pd.read_csv(file_path, encoding="utf-8")
    .iloc[:, :2]   # keep only the two sentence columns
    .dropna()      # a missing side cannot be tokenized
    .astype(str)   # guard against cells pandas parsed as numbers
    .itertuples(index=False, name=None)
)

In [None]:
# Synthetic toy corpus: six (source, target) sentence pairs.
# Assigning to `pairs` here rebinds the name, so the cells below operate on
# this toy data.
pairs = list(
    zip(
        ("input1", "input2", "input3", "input4", "input5", "input6"),
        (
            "translated_output1",
            "translated_output2",
            "translated_output3",
            "translated_output4",
            "translated_output5",
            "translated_output6",
        ),
    )
)

In [None]:
# Reserved special tokens; every Vocab maps them to ids 0, 1 and 2.
PAD = "<pad>"
SOS = "<sos>"
EOS = "<eos>"

def tokenize(s):
    """Lower-case ``s`` and split it on whitespace into a list of tokens."""
    return s.lower().split()

class Vocab:
    """Two-way mapping between tokens and integer ids.

    ``stoi`` maps token -> id and ``itos`` maps id -> token. Ids 0-2 are
    reserved for PAD, SOS and EOS; every other token receives the next free
    id in order of first appearance.
    """

    def __init__(self):
        self.stoi = {PAD:0, SOS:1, EOS:2}
        self.itos = {0:PAD, 1:SOS, 2:EOS}

    def build(self, sentences):
        """Add every not-yet-known token found in ``sentences``.

        Args:
            sentences: iterable of strings; each is split with ``tokenize``.

        May be called repeatedly: the next id is derived from the current
        vocabulary size, so a later call extends the mapping instead of
        reusing ids that were already handed out.
        """
        for s in sentences:
            for tok in tokenize(s):
                if tok not in self.stoi:
                    # Ids are contiguous from 0, so the size is the next free id.
                    idx = len(self.stoi)
                    self.stoi[tok] = idx
                    self.itos[idx] = tok

    def encode(self, s):
        """Return the list of ids for the tokens of ``s``.

        Raises:
            KeyError: if a token was never added via ``build``; the message
                names the offending token.
        """
        ids = []
        for tok in tokenize(s):
            if tok not in self.stoi:
                raise KeyError(f"token {tok!r} is not in the vocabulary")
            ids.append(self.stoi[tok])
        return ids

In [None]:
# Separate vocabularies for the source and target side of each pair.
src_vocab = Vocab()
tgt_vocab = Vocab()

src_vocab.build([pair[0] for pair in pairs])
tgt_vocab.build([pair[1] for pair in pairs])

def make_tensors(pairs):
    """Encode ``(source, target)`` sentence pairs as lists of token ids.

    Each source sentence is encoded with ``src_vocab``; each target sentence
    is encoded with ``tgt_vocab`` and wrapped in the SOS / EOS ids.

    Returns:
        A list of ``(src_ids, tgt_ids)`` tuples of plain Python int lists.
    """
    sos_id = tgt_vocab.stoi[SOS]
    eos_id = tgt_vocab.stoi[EOS]
    return [
        (src_vocab.encode(src), [sos_id] + tgt_vocab.encode(tgt) + [eos_id])
        for src, tgt in pairs
    ]

data = make_tensors(pairs)

In [None]:
class Encoder(nn.Module):
    """Embeds a source token sequence and runs it through an LSTM.

    The per-step LSTM outputs are discarded; only the final hidden and cell
    states are handed back.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )

    def forward(self, x):
        """Return the LSTM's final ``(h, c)`` state for the id tensor ``x``.

        ``x`` is expected to hold token ids with the batch dimension first
        (the LSTM is built with ``batch_first=True``).
        """
        _step_outputs, final_state = self.lstm(self.emb(x))
        hidden, cell = final_state
        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """Embedding -> LSTM -> linear projection onto the target vocabulary."""

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, h, c):
        """Advance the decoder over the token ids in ``x`` from state ``(h, c)``.

        Returns:
            ``(logits, h, c)``: vocabulary scores for every input step and
            the updated LSTM hidden and cell states.
        """
        lstm_out, (next_h, next_c) = self.lstm(self.emb(x), (h, c))
        return self.fc(lstm_out), next_h, next_c

In [None]:
# ---- Hyperparameters (tune here) ----
SEED = 42
EMBED_DIM = 64
HIDDEN_DIM = 128
LEARNING_RATE = 0.01
NUM_EPOCHS = 300
LOG_EVERY = 50

# Seed both RNGs *before* the models are created so that weight
# initialisation and the per-epoch shuffling repeat on every fresh run.
random.seed(SEED)
torch.manual_seed(SEED)

encoder = Encoder(len(src_vocab.stoi), EMBED_DIM, HIDDEN_DIM)
decoder = Decoder(len(tgt_vocab.stoi), EMBED_DIM, HIDDEN_DIM)

criterion = nn.CrossEntropyLoss()
enc_opt = optim.Adam(encoder.parameters(), lr=LEARNING_RATE)
dec_opt = optim.Adam(decoder.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    total_loss = 0

    # Shuffle a copy: the shared `data` list keeps its original order, so
    # re-running this cell always starts from the same state.
    epoch_data = list(data)
    random.shuffle(epoch_data)

    for src_ids, tgt_ids in epoch_data:
        # Wrap each id list in an outer list -> a batch of one example.
        src = torch.tensor([src_ids])
        tgt = torch.tensor([tgt_ids])

        enc_opt.zero_grad()
        dec_opt.zero_grad()

        # ---- ENCODER ----
        h, c = encoder(src)

        # ---- DECODER ----
        loss = 0
        x = tgt[:, 0].unsqueeze(1)  # <sos>

        # Predict token t from the tokens before it, one step at a time.
        for t in range(1, tgt.size(1)):
            logits, h, c = decoder(x, h, c)
            loss += criterion(logits.squeeze(1), tgt[:, t])
            x = tgt[:, t].unsqueeze(1)  # teacher forcing: feed the true token

        loss.backward()
        enc_opt.step()
        dec_opt.step()

        total_loss += loss.item()

    # `total_loss` is the sum of per-example losses over the epoch.
    if epoch % LOG_EVERY == 0:
        print(f"Epoch {epoch} | Loss: {total_loss:.2f}")

Epoch 0 | Loss: 24.12
Epoch 50 | Loss: 0.00
Epoch 100 | Loss: 0.00
Epoch 150 | Loss: 0.00
Epoch 200 | Loss: 0.00
Epoch 250 | Loss: 0.00


In [None]:
def translate(sentence, max_len=10):
    """Greedily decode a translation of ``sentence``.

    The sentence is encoded with ``src_vocab`` and passed through the global
    ``encoder``; the global ``decoder`` is then run one token at a time,
    always taking the highest-scoring token, until it emits EOS or
    ``max_len`` tokens have been produced.

    Args:
        sentence: source text; every token must be in ``src_vocab``
            (``src_vocab.encode`` raises ``KeyError`` otherwise).
        max_len: upper bound on the number of generated tokens.

    Returns:
        The generated target tokens joined by single spaces; ``""`` when
        ``sentence`` contains no tokens.

    Side effect: switches ``encoder`` and ``decoder`` to eval mode.
    """
    encoder.eval()
    decoder.eval()

    src_ids = src_vocab.encode(sentence)
    if not src_ids:
        # No tokens means there is no sequence to feed the encoder.
        return ""

    eos_id = tgt_vocab.stoi[EOS]
    result = []

    # Inference only: one no_grad scope covers encoder and decoder.
    with torch.no_grad():
        h, c = encoder(torch.tensor([src_ids]))
        x = torch.tensor([[tgt_vocab.stoi[SOS]]])

        for _ in range(max_len):
            logits, h, c = decoder(x, h, c)
            pred = logits.argmax(-1).item()

            if pred == eos_id:
                break

            result.append(tgt_vocab.itos[pred])
            x = torch.tensor([[pred]])  # feed the prediction back in

    return " ".join(result)

In [None]:
# Spot-check the trained model on the first three source sentences.
for sentence in ("input1", "input2", "input3"):
    print(translate(sentence))

translated_output1
translated_output2
translated_output3
