In [31]:
import torch

# Select the compute device: use the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)


cuda


In [32]:
# Read the whole corpus file into memory as a single UTF-8 decoded string.
with open('/content/Sherlock Holmes.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()


In [33]:
import re

# Normalise the corpus: lowercase it, then delete every character that is not
# a lowercase ASCII letter or whitespace. Characters are removed rather than
# replaced by a space, so words joined only by punctuation become one token.
text = re.sub(r'[^a-z\s]', '', text.lower())


In [34]:
# Tokenise on runs of whitespace and report how many words the corpus holds.
tokens = text.split(sep=None)
print("Total words:", len(tokens))


Total words: 104432


In [35]:
from collections import Counter

# Tally the tokens; the distinct keys of the counter define the vocabulary.
counter = Counter(tokens)

# Id 0 is reserved for the unknown-word placeholder. Every key of the counter
# is then assigned the current size of the mapping as its id.
vocab = {"<UNK>": 0}
for word in counter.keys():
    vocab[word] = len(vocab)

# Inverse lookup (id -> word), used to decode a predicted id.
idx2word = dict(zip(vocab.values(), vocab.keys()))
vocab_size = len(vocab)

print("Vocab size:", vocab_size)


Vocab size: 8338


In [36]:
# Replace every token of the corpus by its integer id from the vocabulary.
encoded = list(map(vocab.__getitem__, tokens))


In [37]:
# Number of context words the model sees before predicting the next one.
SEQ_LEN = 4

# Slide a window over the encoded corpus: each sample is SEQ_LEN consecutive
# word ids, and its target is the id that immediately follows that window.
X = [encoded[start:start + SEQ_LEN] for start in range(len(encoded) - SEQ_LEN)]
y = encoded[SEQ_LEN:]


In [38]:
from torch.utils.data import Dataset

class NextWordDataset(Dataset):
    """Dataset of (context ids, next-word id) pairs held as int64 tensors."""

    def __init__(self, X, y):
        """Convert the contexts ``X`` and the targets ``y`` to tensors once."""
        long = torch.long
        self.X, self.y = torch.tensor(X, dtype=long), torch.tensor(y, dtype=long)

    def __len__(self):
        """Return the number of stored samples (length of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(context, target)`` pair stored at position ``idx``."""
        context = self.X[idx]
        target = self.y[idx]
        return context, target


In [39]:
from torch.utils.data import DataLoader

# Wrap the (context, target) pairs and serve them in shuffled mini-batches of 64.
dataset = NextWordDataset(X, y)
loader = DataLoader(dataset, batch_size=64, shuffle=True)


In [40]:
import torch.nn as nn

class NextWordModel(nn.Module):
    """Embedding -> LSTM -> linear network that scores every vocabulary word."""

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        """Create the three layers.

        Args:
            vocab_size: Number of rows in the embedding table and number of
                output scores.
            embed_dim: Size of each embedding vector (LSTM input size).
            hidden_dim: Size of the LSTM hidden state.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # batch_first=True: the LSTM takes and returns (batch, seq, feature).
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return one score per vocabulary word for each sequence in ``x``."""
        embedded = self.embedding(x)
        hidden_states, _final_state = self.lstm(embedded)
        # Only the output at the last time step feeds the classifier.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)


In [41]:
# Model hyperparameters.
EMBED_DIM = 100   # size of each word-embedding vector
HIDDEN_DIM = 128  # size of the LSTM hidden state

# Build the network, then move its parameters to the selected device.
model = NextWordModel(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
)
model = model.to(device)


In [42]:
# Adam updates all model parameters; the loss is cross-entropy over the
# per-word scores produced by the model.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()


In [43]:
EPOCHS = 5  # number of full passes over the training data

for epoch in range(EPOCHS):
    model.train()  # put the network in training mode at the start of each epoch
    total_loss = 0

    for contexts, targets in loader:
        contexts, targets = contexts.to(device), targets.to(device)

        # One optimisation step: clear old gradients, forward pass, loss,
        # backward pass, parameter update.
        optimizer.zero_grad()
        logits = model(contexts)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()

    # Report the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {total_loss/len(loader):.4f}")


Epoch 1/5 | Loss: 6.2016
Epoch 2/5 | Loss: 5.4121
Epoch 3/5 | Loss: 5.0101
Epoch 4/5 | Loss: 4.6714
Epoch 5/5 | Loss: 4.3650


In [47]:
import pickle
# Persist the trained weights and both vocabulary mappings
# (word -> id and id -> word) to the current working directory.
torch.save(model.state_dict(), "nextword_model.pt")

for filename, mapping in (("vocab.pkl", vocab), ("idx2word.pkl", idx2word)):
    with open(filename, "wb") as handle:
        pickle.dump(mapping, handle)

print("Files saved: nextword_model.pt, vocab.pkl, idx2word.pkl")


Files saved: nextword_model.pt, vocab.pkl, idx2word.pkl


In [44]:
def predict_next_word(text: str) -> str:
    """Predict the word most likely to follow ``text``.

    The prompt is normalised the same way as the training corpus (lowercased,
    every character other than a-z and whitespace removed) and only its last
    ``SEQ_LEN`` words are passed to the model.

    Args:
        text: Free-form prompt text.

    Returns:
        The vocabulary word with the highest model score, or the string
        "Not enough context" when the normalised prompt has fewer than
        ``SEQ_LEN`` words.
    """
    model.eval()  # switch to inference mode before predicting

    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()[-SEQ_LEN:]

    if len(words) < SEQ_LEN:
        return "Not enough context"

    # Words that never occurred in the corpus are not keys of `vocab`; map
    # them to the reserved <UNK> id instead of raising KeyError.
    unk_idx = vocab["<UNK>"]
    token_ids = [vocab.get(w, unk_idx) for w in words]
    encoded = torch.tensor([token_ids], dtype=torch.long).to(device)

    with torch.no_grad():
        output = model(encoded)
        # One row of scores per sequence; take the best-scoring word id.
        pred_idx = output.argmax(dim=1).item()

    return idx2word[pred_idx]


In [45]:
# Try the predictor on a three-word prompt and on a four-word prompt.
for prompt in ("to sherlock holmes", "he was a man"):
    print(predict_next_word(prompt))


Not enough context
of
