##Anjali Kumari

##25/AFI/17

In [None]:
!pip install torch torchvision torchaudio pandas scikit-learn tqdm -q


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [None]:
file_path = "spa.txt"

with open(file_path, "r", encoding="utf-8") as f:
    texts = f.readlines()

texts = [t.strip() for t in texts if t.strip()]

print("Total samples:", len(texts))
print("Sample:", texts[:3])


Total samples: 118964
Sample: ['Go.\tVe.', 'Go.\tVete.', 'Go.\tVaya.']


In [None]:
train_texts = texts[:80000]
val_texts = texts[80000:90000]
test_texts = texts[90000:100000]

print(len(train_texts), len(val_texts), len(test_texts))


80000 10000 10000


In [None]:
from collections import Counter

def tokenize(text):
    return text.lower().split()

counter = Counter()

for text in train_texts:
    counter.update(tokenize(text))

vocab = {word: i+2 for i, (word, _) in enumerate(counter.most_common())}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1

vocab_size = len(vocab)
print("Vocab size:", vocab_size)


Vocab size: 43556


In [None]:
MAX_LEN = 20

def encode(text):
    tokens = tokenize(text)
    ids = [vocab.get(tok, 1) for tok in tokens][:MAX_LEN]
    if len(ids) < MAX_LEN:
        ids += [0] * (MAX_LEN - len(ids))
    return ids


In [None]:
class TextDataset(Dataset):
    def __init__(self, texts):
        self.data = [encode(t) for t in texts]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = torch.tensor(self.data[idx][:-1])
        y = torch.tensor(self.data[idx][1:])
        return x, y


In [None]:
BATCH_SIZE = 128

train_loader = DataLoader(TextDataset(train_texts), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(TextDataset(val_texts), batch_size=BATCH_SIZE)
test_loader = DataLoader(TextDataset(test_texts), batch_size=BATCH_SIZE)


In [None]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.fc(out)
        return out

model = LSTMModel(vocab_size).to(device)
print(model)


LSTMModel(
  (embedding): Embedding(43556, 128)
  (lstm): LSTM(128, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=43556, bias=True)
)


In [None]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:
EPOCHS = 20

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for x, y in tqdm(train_loader):
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        output = model(x)

        loss = criterion(output.view(-1, vocab_size), y.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


100%|██████████| 625/625 [00:41<00:00, 14.91it/s]


Epoch 1, Loss: 6.9466


100%|██████████| 625/625 [00:43<00:00, 14.36it/s]


Epoch 2, Loss: 5.1664


100%|██████████| 625/625 [00:45<00:00, 13.69it/s]


Epoch 3, Loss: 4.2719


100%|██████████| 625/625 [00:44<00:00, 13.96it/s]


Epoch 4, Loss: 3.6771


100%|██████████| 625/625 [00:45<00:00, 13.83it/s]


Epoch 5, Loss: 3.2332


100%|██████████| 625/625 [00:45<00:00, 13.85it/s]


Epoch 6, Loss: 2.8863


100%|██████████| 625/625 [00:44<00:00, 13.91it/s]


Epoch 7, Loss: 2.6158


100%|██████████| 625/625 [00:45<00:00, 13.87it/s]


Epoch 8, Loss: 2.4053


100%|██████████| 625/625 [00:45<00:00, 13.81it/s]


Epoch 9, Loss: 2.2405


100%|██████████| 625/625 [00:45<00:00, 13.83it/s]


Epoch 10, Loss: 2.1066


100%|██████████| 625/625 [00:45<00:00, 13.83it/s]


Epoch 11, Loss: 1.9951


100%|██████████| 625/625 [00:45<00:00, 13.85it/s]


Epoch 12, Loss: 1.9022


100%|██████████| 625/625 [00:45<00:00, 13.87it/s]


Epoch 13, Loss: 1.8215


100%|██████████| 625/625 [00:45<00:00, 13.87it/s]


Epoch 14, Loss: 1.7519


100%|██████████| 625/625 [00:45<00:00, 13.88it/s]


Epoch 15, Loss: 1.6907


100%|██████████| 625/625 [00:45<00:00, 13.87it/s]


Epoch 16, Loss: 1.6368


100%|██████████| 625/625 [00:45<00:00, 13.85it/s]


Epoch 17, Loss: 1.5885


100%|██████████| 625/625 [00:45<00:00, 13.88it/s]


Epoch 18, Loss: 1.5458


100%|██████████| 625/625 [00:45<00:00, 13.82it/s]


Epoch 19, Loss: 1.5072


100%|██████████| 625/625 [00:45<00:00, 13.75it/s]

Epoch 20, Loss: 1.4720





In [None]:
model.eval()
val_loss = 0

with torch.no_grad():
    for x, y in val_loader:
        x, y = x.to(device), y.to(device)
        output = model(x)
        loss = criterion(output.view(-1, vocab_size), y.view(-1))
        val_loss += loss.item()

print("Validation Loss:", val_loss / len(val_loader))


Validation Loss: 5.503110586842404


In [None]:
model.eval()
test_loss = 0

with torch.no_grad():
    for x, y in test_loader:
        x, y = x.to(device), y.to(device)
        output = model(x)
        loss = criterion(output.view(-1, vocab_size), y.view(-1))
        test_loss += loss.item()

print("Test Loss:", test_loss / len(test_loader))


Test Loss: 6.118708459636833


In [None]:
def predict_next(text):
    model.eval()
    encoded = torch.tensor([encode(text)[:-1]]).to(device)

    with torch.no_grad():
        output = model(encoded)
        pred = output.argmax(dim=-1)

    inv_vocab = {v: k for k, v in vocab.items()}
    words = [inv_vocab.get(i.item(), "<UNK>") for i in pred[0]]

    return " ".join(words)

print(predict_next("this is a simple"))


is a very of me. fish. two. his. his. his. his. error error error error así así así así
