In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from collections import Counter
import random

# Seed every RNG this notebook relies on so a fresh "Restart & Run All" is
# repeatable: `random` is used for sampling in `autocomplete`, and torch's
# generator drives weight initialisation and DataLoader shuffling.
# NOTE: some GPU kernels can still introduce run-to-run differences.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# Read at most the first 200k rows of the question-pair CSV; lines the parser
# cannot handle are skipped instead of aborting the load.
df = pd.read_csv(
    "/content/train.csv",
    engine="python",
    on_bad_lines="skip",
    nrows=200_000,
)

# Flatten both question columns into one lowercased list. Row order is kept
# (question1 then question2 of each row) and non-string values are dropped.
questions = [
    text.lower()
    for pair in zip(df["question1"], df["question2"])
    for text in pair
    if isinstance(text, str)
]

# LIMIT DATA
questions = questions[:150_000]

print("Total questions:", len(questions))

Total questions: 150000


In [5]:
VOCAB_SIZE = 8000

# Count whitespace-separated words across every question.
counter = Counter(word for question in questions for word in question.split())

# Keep the VOCAB_SIZE most frequent words, most frequent first.
vocab = [word for word, _count in counter.most_common(VOCAB_SIZE)]

# Word ids start at 1, so id 0 is never assigned to a vocabulary word here.
word2idx = {word: idx for idx, word in enumerate(vocab, start=1)}
idx2word = dict(enumerate(vocab, start=1))

In [6]:
SEQ_LEN = 4

# Encode each question as vocabulary ids; words missing from word2idx are dropped.
encoded_questions = (
    [word2idx[word] for word in question.split() if word in word2idx]
    for question in questions
)

# Keep only questions with more than SEQ_LEN ids (enough for one context window
# plus a target) and concatenate them into one flat stream. No separator is
# inserted between questions.
tokens = [
    token
    for encoded in encoded_questions
    if len(encoded) > SEQ_LEN
    for token in encoded
]

print("Total tokens:", len(tokens))

Total tokens: 1456003


In [7]:
class WordDataset(Dataset):
    """Sliding-window next-word dataset over a flat sequence of token ids.

    Item ``i`` is the pair ``(tokens[i:i + seq_len], tokens[i + seq_len])``:
    a context window of ``seq_len`` ids and the id that immediately follows it.
    """

    def __init__(self, tokens, seq_len):
        """Store the token stream and the context length.

        Args:
            tokens: Sequence of integer token ids supporting ``len``, slicing
                and integer indexing.
            seq_len: Number of context ids in each sample.
        """
        self.tokens = tokens
        self.seq_len = seq_len

    def __len__(self):
        """Return the number of complete (context, target) pairs.

        Clamped at zero: a stream shorter than ``seq_len`` would otherwise give
        a negative value, which makes ``len()`` raise ``ValueError``.
        """
        return max(0, len(self.tokens) - self.seq_len)

    def __getitem__(self, idx):
        """Return ``(context, target)`` for window ``idx`` as int64 tensors.

        Returns:
            A 1-D tensor holding ``seq_len`` ids and a 0-D tensor holding the
            id that follows them.
        """
        target_pos = idx + self.seq_len
        context = self.tokens[idx:target_pos]
        target = self.tokens[target_pos]
        # Explicit dtype: these ids are used as embedding indices and class labels.
        return (
            torch.tensor(context, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

In [8]:
# Wrap the flat token stream in sliding windows of SEQ_LEN context ids.
train_dataset = WordDataset(tokens, SEQ_LEN)

# Shuffled mini-batches of 32 windows, loaded in the main process (no workers).
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)

In [9]:
class WordLSTM(nn.Module):
    """Next-word predictor: embedding -> single-layer LSTM -> linear classifier.

    Id 0 is treated as padding. The notebook's vocabulary ids start at 1 and
    inference pads short prompts with 0, so the embedding row for id 0 is
    pinned to zeros (``padding_idx=0``) instead of being left as an untrained
    random vector.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        """Build the layers.

        Args:
            vocab_size: Number of real words. One extra slot is added for id 0,
                so both the embedding table and the output layer have
                ``vocab_size + 1`` entries.
            embed_dim: Size of each word embedding vector.
            hidden_dim: Size of the LSTM hidden state.
        """
        super().__init__()
        num_ids = vocab_size + 1
        self.embedding = nn.Embedding(num_ids, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_ids)

    def forward(self, x):
        """Return next-word logits for a batch of id sequences.

        Args:
            x: Integer tensor indexed as (batch, sequence position).

        Returns:
            Tensor of shape (batch, vocab_size + 1), computed from the LSTM
            output at the last sequence position.
        """
        embedded = self.embedding(x)
        out, _ = self.lstm(embedded)
        # Only the final time step is used to predict the following word.
        return self.fc(out[:, -1])

In [10]:
EPOCHS = 3

# The model is sized from the vocabulary and moved to the selected device.
model = WordLSTM(len(word2idx))
model = model.to(device)

# Cross-entropy over the model's logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

In [11]:
for epoch in range(EPOCHS):
    # Re-enter training mode at the start of every epoch.
    model.train()
    total_loss = 0.0

    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Standard step: clear gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        # .item() extracts a Python float so the running sum holds no graph.
        total_loss += batch_loss.item()

    # Report the mean per-batch loss for this epoch.
    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/3 | Loss: 4.6446
Epoch 2/3 | Loss: 4.2901
Epoch 3/3 | Loss: 4.2349


In [12]:
def autocomplete(prompt, num_words=3, temperature=0.7, top_k=5):
    """Extend ``prompt`` by sampling up to ``num_words`` words from the model.

    Each step feeds the last ``SEQ_LEN`` words to the global ``model`` and
    samples the next word from its ``top_k`` most probable candidates. The
    context is left-padded with id 0, which is also used for words missing
    from ``word2idx``.

    Args:
        prompt: Seed text; it is lowercased and split on whitespace.
        num_words: Number of sampling steps to run.
        temperature: Positive divisor applied to the logits before softmax.
        top_k: Number of highest-probability candidates to sample from.
            Clamped to the range [1, number of non-padding classes].

    Returns:
        The lowercased prompt words followed by the generated words, joined
        by single spaces.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    model.eval()
    words = prompt.lower().split()

    with torch.no_grad():
        for _ in range(num_words):
            context = [word2idx.get(w, 0) for w in words[-SEQ_LEN:]]
            context = [0] * (SEQ_LEN - len(context)) + context
            x = torch.tensor(context).unsqueeze(0).to(device)

            logits = model(x)[0] / temperature
            # Id 0 has no entry in idx2word; mask it so it can never be
            # sampled and appended as an empty string.
            logits[0] = float("-inf")
            probs = F.softmax(logits, dim=-1)

            # torch.topk raises when k exceeds the number of classes, and
            # random.choices needs at least one candidate.
            k = max(1, min(top_k, probs.numel() - 1))
            top_probs, top_idx = torch.topk(probs, k)

            # random.choices normalises the weights itself, so the truncated
            # probabilities can be passed as-is.
            next_idx = random.choices(
                top_idx.tolist(),
                weights=top_probs.tolist(),
            )[0]

            next_word = idx2word.get(next_idx)
            if next_word:
                words.append(next_word)

    return " ".join(words)

In [13]:
# Sample completions for a few prompts: (prompt text, number of words to add).
demo_prompts = [
    ("how to learn", 3),
    ("what is machine", 2),
    ("best way to", 3),
]
for demo_prompt, n_words in demo_prompts:
    print(autocomplete(demo_prompt, n_words))

how to learn how to play
what is machine learning programming
best way to get a job


In [14]:
# Store the trained weights together with both vocabulary mappings in one file.
checkpoint = {
    "model_state": model.state_dict(),
    "word2idx": word2idx,
    "idx2word": idx2word,
}
torch.save(checkpoint, "autocomplete_model.pth")