In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import random


In [None]:
# Load and preprocess the text corpus
import re

with open("next_word_predictor.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Lowercase everything, then drop every character that is neither a letter
# nor whitespace, so tokens are plain lowercase words.
text = re.sub(r'[^a-zA-Z\s]', '', raw_text.lower())
words = text.split()


# Build the vocabulary: one integer id per distinct word, in alphabetical order

vocab = sorted(set(words))
word_to_idx = dict(zip(vocab, range(len(vocab))))
idx_to_word = dict(enumerate(vocab))

print("Total unique words:", len(vocab))

Total unique words: 4966


In [None]:
# Dataset class

class WordDataset(Dataset):
    """Sliding-window dataset for next-word prediction.

    Sample ``i`` is the pair ``(x, y)`` where ``x`` holds the ids of words
    ``i .. i + seq_len - 1`` and ``y`` is the id of the word that follows.

    Args:
        words: Sequence of tokens making up the corpus, in reading order.
        seq_len: Number of context words in each input window.
        vocab_map: Optional word -> id mapping. When omitted, the notebook's
            module-level ``word_to_idx`` is used.

    Raises:
        KeyError: If a token in ``words`` is missing from the mapping.
    """

    def __init__(self, words, seq_len=3, vocab_map=None):
        self.seq_len = seq_len
        mapping = word_to_idx if vocab_map is None else vocab_map
        self.data = [mapping[w] for w in words]

    def __len__(self):
        # A corpus shorter than one full window plus its target yields no
        # samples; never report a negative length.
        return max(0, len(self.data) - self.seq_len)

    def __getitem__(self, idx):
        """Return ``(context_ids, target_id)`` tensors for sample ``idx``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            # Without this check a negative index would slice an empty or
            # misaligned window instead of failing.
            raise IndexError("WordDataset index out of range")
        window_end = idx + self.seq_len
        x = torch.tensor(self.data[idx:window_end], dtype=torch.long)
        y = torch.tensor(self.data[window_end], dtype=torch.long)
        return x, y

# Three context words per sample, shuffled into mini-batches of four.
dataset = WordDataset(words=words, seq_len=3)
loader = DataLoader(dataset=dataset, batch_size=4, shuffle=True)

In [None]:
# RNN model

class WordRNN(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

    Args:
        vocab_size: Number of distinct word ids (also the number of output logits).
        embed_size: Width of the word embedding vectors.
        hidden_size: Width of the RNN hidden state.
    """

    def __init__(self, vocab_size, embed_size=32, hidden_size=64):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.rnn = nn.RNN(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map a batch of word-id sequences to next-word logits.

        The RNN is built with ``batch_first=True``, so ``x`` is indexed as
        (batch, position). Only the output at the final position is fed to
        the classifier.
        """
        embedded = self.embed(x)
        step_outputs, _ = self.rnn(embedded)
        final_step = step_outputs[:, -1, :]
        return self.fc(final_step)


In [None]:
# Quick look at the first tokens of the corpus and the size of the vocabulary.
preview = words[:50]
print("Sample words:", preview)
print("Vocabulary size:", len(vocab))

Sample words: ['the', 'sun', 'was', 'shining', 'brightly', 'in', 'the', 'clear', 'blue', 'sky', 'and', 'a', 'gentle', 'breeze', 'rustled', 'the', 'leaves', 'of', 'the', 'tall', 'trees', 'people', 'were', 'out', 'enjoying', 'the', 'beautiful', 'weather', 'some', 'sitting', 'in', 'the', 'park', 'others', 'taking', 'a', 'leisurely', 'stroll', 'along', 'the', 'riverbank', 'children', 'were', 'playing', 'games', 'and', 'laughter', 'filled', 'the', 'air']
Vocabulary size: 4966


In [None]:
# NOTE(review): `model` is only created in the "Training setup" cell further
# down, so on a fresh kernel (Restart & Run All) this cell raises NameError.
# The call rescales, in place, whatever gradients are currently stored on the
# parameters so that their total norm is at most 5, and returns the norm
# measured before clipping. Run on its own it has no effect on training: the
# loop calls optimizer.zero_grad() before every backward pass, discarding
# these gradients. Presumably the intent was per-step clipping between
# loss.backward() and optimizer.step() -- TODO confirm.
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)


tensor(2.0552, device='cuda:0')

In [None]:
# Training setup

device = "cuda" if torch.cuda.is_available() else "cpu"
model = WordRNN(len(vocab)).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(30):
    # predict_next_word() switches the model to eval mode; switch back to
    # training mode so re-running this cell always trains in the right mode.
    model.train()
    total_loss = 0
    for X, y in loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        output = model(X)
        loss = criterion(output, y)
        loss.backward()
        # Clip here, between backward() and step(): this is the only place
        # where limiting the gradient norm affects the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
        optimizer.step()
        total_loss += loss.item()

    # Guard against an empty loader (corpus shorter than one window).
    avg_loss = total_loss / max(len(loader), 1)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")


Epoch 1, Loss: 7.1016
Epoch 2, Loss: 6.2200
Epoch 3, Loss: 5.7076
Epoch 4, Loss: 5.3071
Epoch 5, Loss: 4.9663
Epoch 6, Loss: 4.6598
Epoch 7, Loss: 4.3811
Epoch 8, Loss: 4.1197
Epoch 9, Loss: 3.8807
Epoch 10, Loss: 3.6557
Epoch 11, Loss: 3.4475
Epoch 12, Loss: 3.2584
Epoch 13, Loss: 3.0866
Epoch 14, Loss: 2.9259
Epoch 15, Loss: 2.7775
Epoch 16, Loss: 2.6409
Epoch 17, Loss: 2.5228
Epoch 18, Loss: 2.4136
Epoch 19, Loss: 2.3167
Epoch 20, Loss: 2.2277
Epoch 21, Loss: 2.1501
Epoch 22, Loss: 2.0771
Epoch 23, Loss: 2.0084
Epoch 24, Loss: 1.9457
Epoch 25, Loss: 1.9003
Epoch 26, Loss: 1.8519
Epoch 27, Loss: 1.8017
Epoch 28, Loss: 1.7633
Epoch 29, Loss: 1.7218
Epoch 30, Loss: 1.6877


In [1]:
#  Prediction

def predict_next_word(model, text_seq, top_k=3):
    """Return the ``top_k`` most likely next words for a text prompt.

    Args:
        model: Trained network mapping a (1, n_words) tensor of word ids to
            one row of logits over the vocabulary.
        text_seq: Prompt text. It is lowercased and stripped of non-letter
            characters, the same normalisation applied to the training corpus.
        top_k: Number of candidates to return; capped at the number of logits
            the model produces.

    Returns:
        A list of words ordered from most to least likely. The list is empty
        when the prompt contains no words or ``top_k`` is below 1. If a prompt
        word is not in the vocabulary, the string ``"Unknown word in input."``
        is returned instead.
    """
    model.eval()
    # Normalise the prompt like the training text so punctuation or digits do
    # not cause spurious vocabulary misses.
    seq = re.sub(r'[^a-zA-Z\s]', '', text_seq.lower()).split()
    if not seq or top_k < 1:
        # No context words (or nothing requested): nothing to predict.
        return []
    try:
        indices = [word_to_idx[w] for w in seq]
    except KeyError:
        return "Unknown word in input."

    # Put the input on the device the model actually lives on; fall back to
    # the notebook-level `device` for a model without parameters.
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = device
    x = torch.tensor([indices], dtype=torch.long, device=target_device)

    with torch.no_grad():
        out = model(x)
        probs = torch.softmax(out, dim=1).cpu()
        # torch.topk raises if k exceeds the size of the dimension.
        k = min(top_k, probs.shape[1])
        top_idx = torch.topk(probs, k).indices[0]
    return [idx_to_word[i.item()] for i in top_idx]


In [None]:
# Test Predictions

# Prompts used to spot-check the trained model
samples = [
    "the sun was shining",
    "Hello how are",
    "his voice barely",
    "deep in the amazon",
    "quantum computing breakthrough",
]

print("\nPredictions:")
for prompt in samples:
    predicted = predict_next_word(model, prompt)
    print(f"{prompt} → {predicted}")


Predictions:
the sun was shining → ['brightly', 'talking', 'data']
Hello how are → ['you', 'growing', 'i']
his voice barely → ['above', 'voice', 'takes']
deep in the amazon → ['for', 'at', 'news']
quantum computing breakthrough → ['paves', 'as', 'argue']
