In [None]:
import torch

In [None]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [None]:
from datasets import load_dataset
# Fetch the English-Hindi parallel corpus from the Hugging Face Hub and show
# its splits (each row holds a single 'translation' feature).
ds = load_dataset(path="cfilt/iitb-english-hindi")
print(ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

dataset_infos.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/85.7k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/500k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/520 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2507 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})


In [None]:
# Keep only the first 150,000 training pairs; the validation and test splits
# are used in full.
train_raw = ds["train"].select(range(150_000))
valid_raw, test_raw = ds["validation"], ds["test"]

In [None]:
import sentencepiece as spm
import pathlib

# Dump each side of the training pairs to its own UTF-8 text file, one entry
# per line, so the tokenizer trainer can read them from disk.
path = pathlib.Path("tokenizer_data")
path.mkdir(exist_ok=True)

with (path / "train.en").open("w", encoding="utf-8") as f_en:
    with (path / "train.hi").open("w", encoding="utf-8") as f_hi:
        for item in train_raw:
            pair = item["translation"]
            f_en.write(pair["en"] + "\n")
            f_hi.write(pair["hi"] + "\n")

In [None]:
# Fit a single 8,000-piece BPE vocabulary over both the English and the Hindi
# text files; the result is saved under the "bpe" prefix (bpe.model is loaded
# in a later cell).
# NOTE(review): no special-token ids are passed, so SentencePiece's defaults
# apply (documented as <unk>=0, <s>=1, </s>=2, pad disabled). Later cells pad
# with id 0, which then shares an id with <unk> -- confirm this is intended.
spm.SentencePieceTrainer.Train(
    input=",".join(str(path / name) for name in ("train.en", "train.hi")),
    model_type="bpe",
    model_prefix="bpe",
    vocab_size=8000,
    character_coverage=1.0,
)

In [None]:
# Load the BPE model trained above; `sp` is the tokenizer object the later
# cells use for encoding and decoding.
sp = spm.SentencePieceProcessor()
sp.load("bpe.model")

True

In [None]:
def encode_pair(example):
    """Tokenise one translation pair into lists of piece ids.

    Args:
        example: A dataset row with ``example["translation"]["en"]`` and
            ``example["translation"]["hi"]`` strings.

    Returns:
        dict with
          * ``"src_ids"``: ids of the English sentence (no boundary markers);
          * ``"tgt_ids"``: ids of the Hindi sentence wrapped in the
            tokenizer's begin- and end-of-sentence ids.
    """
    src = example["translation"]["en"]
    tgt = example["translation"]["hi"]

    src_ids = sp.encode(src, out_type=int)
    # Ask the tokenizer for its boundary ids instead of hard-coding 1 / 2, so
    # the encoding stays right if the model is retrained with other settings.
    tgt_ids = [sp.bos_id()] + sp.encode(tgt, out_type=int) + [sp.eos_id()]

    return {"src_ids": src_ids, "tgt_ids": tgt_ids}

# Tokenise every split; each mapped dataset gains src_ids / tgt_ids columns.
encoded_train, encoded_valid, encoded_test = (
    split.map(encode_pair) for split in (train_raw, valid_raw, test_raw)
)

Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

Map:   0%|          | 0/520 [00:00<?, ? examples/s]

Map:   0%|          | 0/2507 [00:00<?, ? examples/s]

In [None]:
# Switch the training split (in place) to return only the two id columns, as
# torch tensors. The validation and test splits are not reformatted here.
encoded_train.set_format(type="torch", columns=["src_ids", "tgt_ids"])

In [None]:
import torch

def collate_fn(batch):
    """Pad a list of encoded examples into two rectangular id tensors.

    Args:
        batch: Sequence of dicts with ``"src_ids"`` and ``"tgt_ids"`` entries,
            each either a list of ints or an existing 1-D tensor.

    Returns:
        ``(src_pad, tgt_pad)``: two ``torch.long`` tensors laid out as
        (batch, longest sequence in the batch), right-padded with id 0.
    """
    # as_tensor reuses tensors that are already the right dtype (torch.tensor
    # would copy them and warn) and the explicit dtype keeps an empty id list
    # from turning into a float tensor.
    src = [torch.as_tensor(b["src_ids"], dtype=torch.long) for b in batch]
    tgt = [torch.as_tensor(b["tgt_ids"], dtype=torch.long) for b in batch]

    src_pad = torch.nn.utils.rnn.pad_sequence(src, batch_first=True, padding_value=0)
    tgt_pad = torch.nn.utils.rnn.pad_sequence(tgt, batch_first=True, padding_value=0)

    return src_pad, tgt_pad

In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 64   # You can adjust later based on GPU memory

# One loader per split, all sharing the batch size and padding collate
# function; only the training loader shuffles its examples.
train_loader, valid_loader, test_loader = (
    DataLoader(
        split,
        batch_size=BATCH_SIZE,
        shuffle=reshuffle,
        collate_fn=collate_fn,
    )
    for split, reshuffle in (
        (encoded_train, True),
        (encoded_valid, False),
        (encoded_test, False),
    )
)

In [None]:
import torch.nn as nn

class Encoder(nn.Module):
    """Embedding + LSTM encoder that summarises a source batch as (h, c).

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width.
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between stacked LSTM layers; only applied when
            ``num_layers > 1``.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=1, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # nn.LSTM applies dropout only between layers; passing a non-zero
        # value with a single layer just raises a UserWarning.
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers,
                            batch_first=True,
                            dropout=dropout if num_layers > 1 else 0.0)

    def forward(self, src):
        """Encode ``src`` and return the LSTM's final ``(h, c)`` state.

        Args:
            src: Integer id tensor indexed as (batch, time), right-padded
                with the embedding's ``padding_idx``.

        Returns:
            ``(h, c)`` as produced by ``nn.LSTM``, taken at each row's last
            non-padding position rather than at the padded end of the batch.
        """
        embedded = self.embedding(src)

        # True length of each row = 1-based position of its last non-pad id.
        # (Counting non-pad ids would under-count a row that contains the pad
        # id in the middle.) Clamp so an all-pad row still has length 1.
        steps = torch.arange(1, src.size(1) + 1, device=src.device)
        not_pad = (src != self.embedding.padding_idx).long()
        lengths = (not_pad * steps).max(dim=1).values.clamp(min=1)

        # Packing stops the recurrence at each row's true end, so the final
        # state does not depend on how much padding the batch happened to add.
        # pack_padded_sequence expects the lengths on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h, c) = self.lstm(packed)
        return h, c

In [None]:
class Decoder(nn.Module):
    """Embedding + LSTM decoder with a linear projection onto the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and width of the
            output logits.
        embed_dim: Embedding width.
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between stacked LSTM layers; only applied when
            ``num_layers > 1``.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=1, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # nn.LSTM applies dropout only between layers; passing a non-zero
        # value with a single layer just raises a UserWarning.
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers,
                            batch_first=True,
                            dropout=dropout if num_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, tgt, h, c):
        """Run the decoder over ``tgt`` starting from state ``(h, c)``.

        Args:
            tgt: Integer id tensor indexed as (batch, time).
            h, c: Initial LSTM hidden and cell state.

        Returns:
            ``(logits, h, c)``: one score vector of width ``vocab_size`` per
            input position, plus the updated LSTM state.
        """
        embedded = self.embedding(tgt)
        outputs, (h, c) = self.lstm(embedded, (h, c))
        logits = self.fc(outputs)
        return logits, h, c

In [None]:
class Seq2Seq(nn.Module):
    """Teacher-forced encoder-decoder wrapper.

    Holds an encoder that maps a source batch to an initial state pair and a
    decoder that scores target positions from that state.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, src, tgt):
        """Return decoder logits for ``tgt`` with its last time step removed.

        The decoder is fed ``tgt[:, :-1]``, so the logits line up with
        ``tgt[:, 1:]`` as prediction targets.
        """
        hidden, cell = self.encoder(src)
        decoder_input = tgt[:, :-1]
        logits, _hidden, _cell = self.decoder(decoder_input, hidden, cell)
        return logits

In [None]:
# Same device choice as at the top of the notebook, repeated so this cell can
# be re-run on its own.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Encoder and decoder use the tokenizer's vocabulary size, 256-dim embeddings
# and a 512-dim LSTM state.
model = Seq2Seq(
    encoder=Encoder(len(sp), embed_dim=256, hidden_dim=512),
    decoder=Decoder(len(sp), embed_dim=256, hidden_dim=512),
)
model = model.to(device)

# Targets equal to 0 (the padding id) contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)



In [None]:
def train_epoch(model, loader):
    """Run one optimisation pass over ``loader``.

    Uses the notebook-level ``device``, ``optimizer`` and ``criterion``.
    Gradients are clipped to a total norm of 1.0 before every step.

    Returns:
        The sum of per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    for batch_src, batch_tgt in loader:
        src = batch_src.to(device)
        tgt = batch_tgt.to(device)

        optimizer.zero_grad()
        logits = model(src, tgt)

        # Flatten (batch, time) so each position is scored against the target
        # token that follows it.
        flat_logits = logits.reshape(-1, logits.size(-1))
        flat_gold = tgt[:, 1:].reshape(-1)
        loss = criterion(flat_logits, flat_gold)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(loader)

In [None]:
def eval_epoch(model, loader):
    """Compute the mean per-batch loss over ``loader`` without training.

    The model is put in eval mode and still receives the gold target prefix
    (teacher forcing); gradient tracking is switched off for the whole pass.
    Uses the notebook-level ``device`` and ``criterion``.
    """
    with torch.no_grad():
        model.eval()
        running_loss = 0

        for batch_src, batch_tgt in loader:
            src = batch_src.to(device)
            tgt = batch_tgt.to(device)

            logits = model(src, tgt)
            flat_logits = logits.reshape(-1, logits.size(-1))
            flat_gold = tgt[:, 1:].reshape(-1)

            running_loss += criterion(flat_logits, flat_gold).item()

        return running_loss / len(loader)

In [None]:
EPOCHS = 10  # start small, adjust later

# One training pass plus one validation pass per epoch, reporting both losses.
# In the run saved with this notebook the validation loss rises every epoch
# while the training loss keeps falling, so watch the gap.
for epoch in range(EPOCHS):
    train_loss = train_epoch(model, train_loader)
    valid_loss = eval_epoch(model, valid_loader)

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Valid Loss: {valid_loss:.4f}")

  src = [torch.tensor(b["src_ids"]) for b in batch]
  tgt = [torch.tensor(b["tgt_ids"]) for b in batch]


Epoch 1/10 | Train Loss: 3.4492 | Valid Loss: 6.8232
Epoch 2/10 | Train Loss: 1.3209 | Valid Loss: 7.3515
Epoch 3/10 | Train Loss: 0.6832 | Valid Loss: 7.8264
Epoch 4/10 | Train Loss: 0.4405 | Valid Loss: 8.2980
Epoch 5/10 | Train Loss: 0.3245 | Valid Loss: 8.7275
Epoch 6/10 | Train Loss: 0.2578 | Valid Loss: 9.1233
Epoch 7/10 | Train Loss: 0.2166 | Valid Loss: 9.4467
Epoch 8/10 | Train Loss: 0.1882 | Valid Loss: 9.8761
Epoch 9/10 | Train Loss: 0.1687 | Valid Loss: 10.1982
Epoch 10/10 | Train Loss: 0.1563 | Valid Loss: 10.4863


In [None]:
@torch.no_grad()
def translate(sentence, max_len=50):
    """Greedily translate one English sentence with the notebook-level model.

    Args:
        sentence: Source text; tokenised with the notebook-level ``sp``.
        max_len: Upper bound on the number of generated pieces.

    Returns:
        The decoded output string ("" when the input tokenises to nothing).
    """
    model.eval()

    # Use the same boundary ids the tokenizer itself reports.
    bos_id, eos_id = sp.bos_id(), sp.eos_id()

    # tokenize English
    src_ids = sp.encode(sentence, out_type=int)
    if not src_ids:
        # Nothing to encode; a zero-length sequence cannot be run through
        # the encoder.
        return ""
    src = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)

    # encode
    h, c = model.encoder(src)

    # The decoder input is always a single token: first the start marker,
    # then whatever was generated last.
    y = torch.tensor([[bos_id]], dtype=torch.long, device=device)

    output_ids = []

    for _ in range(max_len):
        logits, h, c = model.decoder(y, h, c)
        next_id = logits[:, -1].argmax(dim=-1)
        token = next_id.item()

        if token == eos_id:
            break

        output_ids.append(token)

        # (h, c) already summarise every earlier token, so feed only the new
        # one; re-feeding the whole prefix would make the LSTM consume the
        # earlier tokens again on every step.
        y = next_id.unsqueeze(0)

    return sp.decode(output_ids)

In [None]:
# Spot-check the model on a few short English sentences, one output per line.
print("\n".join(
    translate(text)
    for text in (
        "How are you?",
        "What is your name?",
        "India is a beautiful country.",
        "I love deep learning.",
    )
))

à¤ ̈à¤3⁄4à¤® à¤¦à¤¿à¤à¤3⁄4à¤à¤
क्यावान पदावनत है
à¤μà¤¿à¤ ̧à¥à¤à¥à¤à¥à¤°à¤3⁄4à¤® à¤à¥ à¤ ̧à¤à¤aà¤3⁄4à¤¦à¤¿à¤¤ à¤à¤°à¤ ̈à¥ à
टिफआई बनाएं
