In [1]:
import os
import math
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from tokenizers import Tokenizer

# --------------------------
# Config
# --------------------------
# Batch geometry
batch_size = 8          # sequences drawn per batch
block_size = 1024       # tokens per sequence (context length)

# Schedule
max_iters = 80000       # an earlier value, 72000, was noted here
eval_interval = 500     # steps between evaluation + checkpoint passes
eval_iters = 200        # batches averaged for each loss estimate
learning_rate = 3e-4

# Model dimensions
n_embd = 512
n_head = 8
n_layer = 8
dropout = 0.1

# Checkpoint that training resumes from when the file exists
checkpoint_last = "/kaggle/input/53k_itr_gpt_bpt/pytorch/default/1/checkpoint_last.pt"

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Fixed seed for PyTorch's random number generator
torch.manual_seed(1337)


# --------------------------
# Load Trained BPE Tokenizer
# --------------------------
tokenizer = Tokenizer.from_file("/kaggle/input/hindi-token/hindi_bpe_tokenizer-45k.json")


def encode(s):
    """Return the token ids the loaded tokenizer produces for the string ``s``."""
    encoding = tokenizer.encode(s)
    return encoding.ids


def decode(ids):
    """Return the text the loaded tokenizer reconstructs from the ids ``ids``."""
    text = tokenizer.decode(ids)
    return text


# The vocabulary size fixes the width of the model's embedding table and output layer.
vocab_size = tokenizer.get_vocab_size()
print(f"Tokenizer vocab size: {vocab_size}")

# --------------------------
# Memmap Tokenization
# --------------------------
# Raw text corpus that gets tokenized
dataset_path = "/kaggle/input/dataset-bpt/wikipedia_hindi_500mb.txt"

# Number of characters read from the corpus per tokenization step
chunk_size_chars = 200_000

# Cached tokenization: the int32 token file and a text file holding its length
tokens_memmap_path = "/kaggle/input/53k_itr_gpt_bpt/pytorch/default/1/tokens.dat"
tokens_len_cache = "/kaggle/input/53k_itr_gpt_bpt/pytorch/default/1/tokens_len.txt.txt"

def tokenize_count_pass(path, chunk_size=chunk_size_chars):
    """Count the tokens in a text file by tokenizing it chunk by chunk.

    The file is read ``chunk_size`` characters at a time and each chunk is
    tokenized on its own, so the whole corpus is never held in memory.

    Args:
        path: UTF-8 text file to read.
        chunk_size: Number of characters read and tokenized per step.

    Returns:
        A ``(total, uniq)`` tuple: the summed token count over all chunks and
        the set of distinct token ids that were produced.
    """
    token_count = 0
    seen_ids = set()
    with open(path, "r", encoding="utf-8") as handle:
        piece = handle.read(chunk_size)
        while piece:  # an empty read marks end of file
            piece_ids = tokenizer.encode(piece).ids
            token_count += len(piece_ids)
            seen_ids.update(piece_ids)
            piece = handle.read(chunk_size)
    return token_count, seen_ids

def tokenize_write_memmap(path, total_tokens, memmap_path, chunk_size=chunk_size_chars):
    """Tokenize a text file chunk by chunk into an int32 memmap on disk.

    Args:
        path: UTF-8 text file to read.
        total_tokens: Length of the memmap to create; expected to equal the
            number of tokens the file yields with the same ``chunk_size``.
        memmap_path: File that backs the memmap (created or overwritten).
        chunk_size: Number of characters read and tokenized per step.

    Returns:
        The flushed ``np.memmap`` holding the token ids. A warning is printed
        if the number of tokens written differs from ``total_tokens``.
    """
    token_dtype = np.int32
    out = np.memmap(memmap_path, mode="w+", dtype=token_dtype, shape=(total_tokens,))
    written = 0
    with open(path, "r", encoding="utf-8") as handle:
        piece = handle.read(chunk_size)
        while piece:  # an empty read marks end of file
            piece_ids = tokenizer.encode(piece).ids
            if piece_ids:
                stop = written + len(piece_ids)
                out[written:stop] = np.array(piece_ids, dtype=token_dtype)
                written = stop
            piece = handle.read(chunk_size)
    if written != total_tokens:
        print(f"Warning: expected {total_tokens} tokens but wrote {written}.")
    out.flush()
    return out

# Decide whether a previously written token memmap can be reused.
# The flag starts False and is raised only after the cached length was read
# successfully AND the memmap file has exactly the matching size. Starting it
# at True would send a missing or mismatched cache down the reuse branch,
# where `cached_len` is undefined or wrong.
use_existing_memmap = False
cached_len = None
if os.path.exists(tokens_memmap_path) and os.path.exists(tokens_len_cache):
    try:
        with open(tokens_len_cache, "r") as f:
            cached_len = int(f.read().strip())
        expected_bytes = cached_len * np.dtype(np.int32).itemsize
        if cached_len > 0 and os.path.getsize(tokens_memmap_path) == expected_bytes:
            use_existing_memmap = True
            total_tokens = cached_len
            print("Reusing existing tokens memmap.")
    except (OSError, ValueError) as exc:
        # Unreadable or non-numeric length file: fall through to re-tokenizing.
        print(f"Ignoring unusable token cache: {exc}")

if not use_existing_memmap:
    # NOTE(review): the cache paths live under /kaggle/input, which is
    # presumably mounted read-only. If the cache directory cannot be written,
    # put the regenerated files in the current working directory instead so
    # the later read of `tokens_memmap_path` still finds them.
    cache_dir = os.path.dirname(tokens_memmap_path)
    if not os.access(cache_dir, os.W_OK):
        fallback_dir = os.getcwd()
        tokens_memmap_path = os.path.join(fallback_dir, os.path.basename(tokens_memmap_path))
        tokens_len_cache = os.path.join(fallback_dir, os.path.basename(tokens_len_cache))
        print(f"Token cache directory is not writable; writing to {fallback_dir} instead.")

    print("Tokenizing (pass 1: counting tokens)...")
    total_tokens, uniq_tokens_set = tokenize_count_pass(dataset_path)
    print(f"Total tokens: {total_tokens:,}, Unique tokens: {len(uniq_tokens_set):,}")

    print("Tokenizing (pass 2: writing memmap)...")
    mm = tokenize_write_memmap(dataset_path, total_tokens, tokens_memmap_path)
    with open(tokens_len_cache, "w") as f:
        f.write(str(total_tokens))
    print("Memmap written.")
else:
    # `total_tokens` was set from the validated cache above.
    mm = np.memmap(tokens_memmap_path, mode="r", dtype=np.int32, shape=(total_tokens,))
    # Left empty on purpose: the unique-token set is rebuilt from the memmap later.
    uniq_tokens_set = set()
    print("Memmap loaded.")

# Rebuild the set of observed token ids from the memmap when no set is
# available yet, scanning in fixed-size windows rather than all at once.
if not uniq_tokens_set:
    scan_window = 2_000_000
    observed = set()
    window_start = 0
    while window_start < total_tokens:
        window_end = min(window_start + scan_window, total_tokens)
        observed.update(np.unique(mm[window_start:window_end]).tolist())
        window_start = window_end
    uniq_tokens_set = observed

unique_tokens = len(uniq_tokens_set)
num_tokens = total_tokens

# Summary statistics. The word count is a sample taken from the first
# 1,000,000 characters only; the size reported is the file size in bytes.
with open(dataset_path, "r", encoding="utf-8") as f:
    first_chunk = f.read(1_000_000)
num_chars_total_approx = os.path.getsize(dataset_path)
distinct_words = set(first_chunk.split())
unique_words_sample = len(distinct_words)

for report_line in (
    "=== Dataset & Tokenizer stats ===",
    f"Dataset file: {dataset_path}",
    f"Dataset size (bytes): {num_chars_total_approx:,}",
    f"Sample unique words (first 1M chars): {unique_words_sample:,}",
    f"Total tokens: {num_tokens:,}, Unique tokens observed: {unique_tokens:,}",
    f"Tokenizer vocab size: {vocab_size:,}",
):
    print(report_line)

# --------------------------
# Prepare torch data
# --------------------------
# Open the token file read-only and wrap it as a tensor.
tokens_np = np.memmap(tokens_memmap_path, mode="r", dtype=np.int32, shape=(total_tokens,))
data = torch.from_numpy(tokens_np)

# The first 90% of the token stream is used for training, the rest is held
# out for validation. Both parts are converted to int64 tensors.
n = int(0.9 * len(data))
train_data = data[:n].to(torch.long)
val_data = data[n:].to(torch.long)
print(f"Train tokens: {len(train_data)} Val tokens: {len(val_data)}")

# --------------------------
# Transformer Model
# --------------------------
class Head(nn.Module):
    """A single causal self-attention head.

    Projects inputs of width ``n_embd`` to keys, queries and values of width
    ``head_size`` and lets each position attend to itself and earlier
    positions only.
    """

    def __init__(self, head_size):
        super().__init__()
        # Submodules are created in the order key, query, value; the order
        # determines which random draws initialise which layer.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones; zeros mark the "future" positions.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("tril", causal_mask)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the attention output for ``x`` of shape (B, T, C)."""
        _, seq_len, channels = x.shape
        queries = self.query(x)
        keys = self.key(x)
        values = self.value(x)
        # NOTE(review): the scale uses the input width C rather than
        # head_size, which is what scaled dot-product attention normally
        # divides by. Left as is because changing it would alter the
        # behaviour of weights trained with this scale.
        scores = (queries @ keys.transpose(-2, -1)) * channels ** -0.5
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))
        return weights @ values

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, then mixed by a linear layer."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        # Heads are built first, then the projection, then dropout.
        self.heads = nn.ModuleList(Head(head_size) for _ in range(num_heads))
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate every head's output on the last axis, project, apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block: pre-LayerNorm self-attention and MLP, each with a residual."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        # Each head receives an equal integer share of the embedding width.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Add the attention output to ``x``, then add the MLP output to that."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

class BigramLanguageModel(nn.Module):
    """Decoder-only transformer language model over the tokenizer vocabulary.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` transformer blocks and a final LayerNorm, then projected to
    vocabulary logits.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids; T must not exceed ``block_size``
                because positions index the position embedding table.
            targets: Optional (B, T) tensor of target token ids.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (B, T, vocab_size)
            and ``loss`` is ``None`` when ``targets`` is not given.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        # Build the position indices on the same device as the input rather
        # than on the module-level `device`, so the model keeps working when
        # it (or the input) lives somewhere else.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            B, T, C = logits.shape
            # cross_entropy expects (N, C) logits and (N,) targets.
            loss = F.cross_entropy(logits.view(B * T, C), targets.view(B * T))
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building graphs
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively after ``idx``.

        Args:
            idx: (B, T) tensor of token ids used as the prompt.
            max_new_tokens: Number of tokens to append to each sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the samples.

        Note:
            The module's train/eval mode is not changed here; call
            ``model.eval()`` first if dropout should be disabled.
        """
        for _ in range(max_new_tokens):
            # Only the last `block_size` tokens fit the position table.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]  # distribution for the next token only
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# --------------------------
# Data loader
# --------------------------
def get_batch(split):
    """Draw a random batch of input/target sequences from one data split.

    Args:
        split: ``"train"`` selects the training tokens; any other value
            selects the validation tokens.

    Returns:
        ``(x, y)`` tensors of shape (batch_size, block_size) on ``device``,
        where ``y`` is ``x`` shifted one token to the right in the stream.

    Raises:
        ValueError: If the split is too short to hold one sequence plus its
            shifted target.
    """
    source = train_data if split == "train" else val_data
    start_limit = len(source) - block_size - 1
    if start_limit <= 0:
        raise ValueError("Dataset too small for given block_size")
    # Start offsets are sampled from [0, start_limit).
    starts = torch.randint(0, start_limit, (batch_size,)).tolist()
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

# --------------------------
# Loss & Perplexity estimate
# --------------------------
@torch.no_grad()
def estimate_loss():
    """Estimate mean loss and perplexity on the train and validation splits.

    For each split, ``eval_iters`` random batches are evaluated with the model
    in eval mode and their losses averaged.

    Returns:
        Dict with keys ``"train"``, ``"val"``, ``"train_ppl"`` and
        ``"val_ppl"``. All values are plain Python floats (not 0-dim tensors),
        so they can be logged, compared and stored in checkpoints without
        carrying tensor objects around.
    """
    out = {}
    model.eval()
    try:
        for split in ("train", "val"):
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            mean_loss = losses.mean().item()
            out[split] = mean_loss
            # Perplexity is the exponential of the mean cross-entropy.
            out[split + "_ppl"] = math.exp(mean_loss)
    finally:
        # Always hand the model back in training mode, even if a batch fails.
        model.train()
    return out

# --------------------------
# Initialize model, optimizer
# --------------------------
# Build the network, move it to the selected device, and attach AdamW.
model = BigramLanguageModel()
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Report the parameter count (in millions) and the hardware in use.
param_count = sum(p.numel() for p in model.parameters())
print(f"Model params: {param_count/1e6:.2f}M")
print(f"Device: {device}")
if device == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    print("GPU:", gpu_name)

# --------------------------
# Resume checkpoint if exists
# --------------------------
# Defaults for a fresh run; overwritten below when a checkpoint is found.
start_iter = 0
best_val_loss = float("inf")
history = {
    series: []
    for series in ("step", "train_loss", "val_loss", "train_ppl", "val_ppl")
}

if os.path.exists(checkpoint_last):
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(checkpoint_last, map_location=device)
    model.load_state_dict(checkpoint["model"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    # Continue with the step after the one recorded in the checkpoint.
    start_iter = checkpoint["iter"] + 1
    # Older checkpoints may lack these keys; keep the fresh-run defaults then.
    best_val_loss = checkpoint.get("best_val_loss", best_val_loss)
    history = checkpoint.get("history", history)
    print(f"Resumed from iter {start_iter}, best val loss {best_val_loss:.4f}")

# --------------------------
# Training loop
# --------------------------
checkpoint_dir = "/kaggle/working/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

last_ckpt = os.path.join(checkpoint_dir, "checkpoint_last-af-52.pt")

# `step` (rather than `iter`) is used as the loop variable so the builtin
# `iter` is not shadowed at module level.
for step in range(start_iter, max_iters):
    # Evaluate and checkpoint every `eval_interval` steps and on the last step.
    if step % eval_interval == 0 or step == max_iters - 1:
        stats = estimate_loss()
        # float() accepts both a Python float and a 0-dim tensor.
        val_loss = float(stats["val"])
        print(f"step {step}: train {stats['train']:.4f}, val {val_loss:.4f}, "
              f"train ppl {stats['train_ppl']:.2f}, val ppl {stats['val_ppl']:.2f}")

        # Store history
        history["step"].append(step)
        history["train_loss"].append(stats["train"])
        history["val_loss"].append(stats["val"])
        history["train_ppl"].append(stats["train_ppl"])
        history["val_ppl"].append(stats["val_ppl"])

        # Track the best validation loss seen so far. Without this update the
        # value written to (and later reported from) the checkpoint would
        # never change from its initial/resumed value.
        if val_loss < best_val_loss:
            best_val_loss = val_loss

        # Save last checkpoint
        torch.save({
            "iter": step,
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "best_val_loss": best_val_loss,
            "history": history
        }, last_ckpt)
        print(f"Checkpoint saved at step {step}")

    # One optimisation step on a fresh random training batch.
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# --------------------------
# Generate sample text
# --------------------------
# Sample a continuation of a text prompt. (A (1, 1) tensor of zeros can be
# used as the context instead to generate without a prompt.)
prompt = "भारत की राजधानी"
context = torch.tensor([encode(prompt)], dtype=torch.long, device=device)

# The model is in training mode at this point (estimate_loss() ends with
# model.train()), which keeps dropout active. Switch to eval mode and disable
# autograd while sampling, then restore the previous mode.
was_training = model.training
model.eval()
with torch.no_grad():
    out = model.generate(context, max_new_tokens=200)
model.train(was_training)
print("Generated text:\n", decode(out[0].tolist()))


Tokenizer vocab size: 45000
Reusing existing tokens memmap.
Memmap loaded.
=== Dataset & Tokenizer stats ===
Dataset file: /kaggle/input/dataset-bpt/wikipedia_hindi_500mb.txt
Dataset size (bytes): 525,475,460
Sample unique words (first 1M chars): 26,741
Total tokens: 45,675,759, Unique tokens observed: 44,923
Tokenizer vocab size: 45,000
Train tokens: 41108183 Val tokens: 4567576


  data = torch.from_numpy(tokens_np)


Model params: 71.86M
Device: cuda
GPU: Tesla P100-PCIE-16GB
Resumed from iter 52001, best val loss 4.1007
step 52500: train 3.5839, val 4.1311, train ppl 36.01, val ppl 62.25
Checkpoint saved at step 52500
step 53000: train 3.6107, val 4.1150, train ppl 36.99, val ppl 61.25
Checkpoint saved at step 53000
step 53500: train 3.5749, val 4.1408, train ppl 35.69, val ppl 62.85
Checkpoint saved at step 53500
step 54000: train 3.5765, val 4.1298, train ppl 35.75, val ppl 62.17
Checkpoint saved at step 54000
step 54500: train 3.5382, val 4.1328, train ppl 34.41, val ppl 62.35
Checkpoint saved at step 54500
step 55000: train 3.5502, val 4.1452, train ppl 34.82, val ppl 63.13
Checkpoint saved at step 55000
step 55500: train 3.5438, val 4.1308, train ppl 34.60, val ppl 62.23
Checkpoint saved at step 55500
step 56000: train 3.5462, val 4.1246, train ppl 34.68, val ppl 61.84
Checkpoint saved at step 56000
step 56500: train 3.5428, val 4.1120, train ppl 34.56, val ppl 61.07
Checkpoint saved at step 