In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import random

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [None]:
# Config
# Everything a reader might want to tune lives in this cell.
WORDS_FILE = "words.txt"  # word list, one word per line
BATCH_SIZE = 128
EPOCHS = 10
MAX_LEN = 20  # longest word kept; also the padded sequence length
EMBED_DIM = 128  # must be divisible by NUM_HEADS (multi-head attention requirement)
NUM_LAYERS = 3
NUM_HEADS = 4
LR = 3e-4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Special tokens; they occupy the first two vocabulary slots.
PAD = "[PAD]"
MASK = "[MASK]"

# Seed every RNG the notebook touches so that Restart & Run All reproduces
# the same weight initialisation and batch order.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Load words
# The word list is expected to hold one word per line; surrounding whitespace
# is stripped from every line.
words_path = Path(WORDS_FILE)
if not words_path.exists():
    # Fail with the resolved location so a wrong working directory is obvious.
    raise FileNotFoundError(
        f"Word list not found at {words_path.resolve()}; "
        "point WORDS_FILE in the config cell at an existing file."
    )

words_raw = [line.strip() for line in words_path.read_text(encoding="utf-8").splitlines()]

# Keep words of 2..MAX_LEN characters: blank lines and single characters are
# dropped, and longer words would not fit the fixed padded length.
words = [w for w in words_raw if 1 < len(w) <= MAX_LEN]

# An empty list would otherwise only surface later as an obscure sampler error.
if not words:
    raise ValueError(
        f"No usable words in {words_path}: need at least one word of 2 to {MAX_LEN} characters."
    )

FileNotFoundError: [Errno 2] No such file or directory: 'words.txt'

In [None]:

# Build vocab
# The two special tokens take indices 0 and 1; every distinct character that
# occurs in `words` follows in sorted order.
chars = sorted({ch for word in words for ch in word})
itos = [PAD, MASK, *chars]
stoi = {token: index for index, token in enumerate(itos)}

PAD_ID, MASK_ID = stoi[PAD], stoi[MASK]
VOCAB_SIZE = len(itos)

In [None]:

# Dataset

class MaskedWordDataset(Dataset):
    """Masked-character samples: one per (word, character position) pair.

    Each item is the word with a single character replaced by the MASK
    token, right-padded with PAD_ID to MAX_LEN, together with the masked
    position and the id of the character that was hidden.
    """

    def __init__(self, words):
        """Enumerate every (word, position) pair.

        Args:
            words: iterable of strings, each at most MAX_LEN characters long.

        Raises:
            ValueError: if a word is longer than MAX_LEN. Such a word cannot
                be padded to the fixed length, so it would yield over-length
                tensors that fail at batching time.
        """
        self.samples = []
        for w in words:
            if len(w) > MAX_LEN:
                raise ValueError(
                    f"word {w!r} has {len(w)} characters, more than MAX_LEN={MAX_LEN}"
                )
            self.samples.extend((w, i) for i in range(len(w)))

    def __len__(self):
        """Number of (word, position) samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return (ids, mask_idx, target) as integer tensors for sample ``idx``.

        ``ids`` has length MAX_LEN; ``mask_idx`` and ``target`` are scalars.
        A character missing from ``stoi`` raises KeyError.
        """
        word, mask_idx = self.samples[idx]

        chars = list(word)
        target_char = chars[mask_idx]
        chars[mask_idx] = MASK

        ids = [stoi[c] for c in chars]
        target = stoi[target_char]

        # Right-pad to the fixed length so samples stack into a batch.
        ids += [PAD_ID] * (MAX_LEN - len(ids))

        return (
            torch.tensor(ids, dtype=torch.long),
            torch.tensor(mask_idx, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

# One sample per (word, character position); the loader shuffles them.
dataset = MaskedWordDataset(words)
loader = DataLoader(
    dataset=dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [None]:

# Model

class CharTransformer(nn.Module):
    """Transformer encoder that predicts the character hidden at one position.

    Token and learned position embeddings are summed, passed through
    NUM_LAYERS encoder layers, and the encoder output at the masked position
    is projected to VOCAB_SIZE logits.
    """

    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(VOCAB_SIZE, EMBED_DIM)
        self.pos_embed = nn.Embedding(MAX_LEN, EMBED_DIM)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=EMBED_DIM,
            nhead=NUM_HEADS,
            dim_feedforward=EMBED_DIM * 4,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, NUM_LAYERS)
        self.classifier = nn.Linear(EMBED_DIM, VOCAB_SIZE)

    def forward(self, x, mask_pos):
        """Compute logits for the masked character of every sequence.

        Args:
            x: integer token ids of shape (B, T), with T <= MAX_LEN.
            mask_pos: integer tensor of shape (B,) giving, per row, the
                position whose encoder output is classified.

        Returns:
            Tensor of shape (B, VOCAB_SIZE) with unnormalised scores.
        """
        B, T = x.shape

        # True at padding positions. Passing this to the encoder stops real
        # characters from attending to [PAD], so the prediction no longer
        # depends on how much padding follows the word. Every row holds at
        # least the masked token, so no row is masked out entirely.
        pad_mask = x == PAD_ID

        pos = torch.arange(T, device=x.device).unsqueeze(0)
        hidden = self.embed(x) + self.pos_embed(pos)
        hidden = self.encoder(hidden, src_key_padding_mask=pad_mask)

        # Pick, for each row, the encoder output at its masked position; the
        # row index is built on the input's device to avoid a host/device mix.
        rows = torch.arange(B, device=x.device)
        masked_emb = hidden[rows, mask_pos]

        return self.classifier(masked_emb)

# Build the network on the configured device, then its optimizer and loss.
model = CharTransformer().to(device=DEVICE)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

In [None]:

# Training

# Training
# One pass over `loader` per epoch. The reported loss is averaged over
# samples rather than over batches, so a short final batch is not
# over-weighted in the epoch figure.
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    total_samples = 0

    for x, mask_pos, target in loader:
        x = x.to(DEVICE)
        mask_pos = mask_pos.to(DEVICE)
        target = target.to(DEVICE)

        logits = model(x, mask_pos)
        loss = criterion(logits, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # `criterion` returns the batch mean; scale back to a sum so batches
        # of different sizes combine correctly.
        batch_samples = target.size(0)
        total_loss += loss.item() * batch_samples
        total_samples += batch_samples

    # max(..., 1) guards the division if the loader yields no batches.
    avg_loss = total_loss / max(total_samples, 1)
    print(f"Epoch {epoch+1}: loss = {avg_loss:.4f}")

In [None]:

# Save model

# Persist the trained weights together with both vocabulary mappings, so the
# character <-> id tables travel with the checkpoint.
torch.save(
    obj={
        "model": model.state_dict(),
        "stoi": stoi,
        "itos": itos,
    },
    f="char_restorer.pt",
)

print("Training complete.")