In [3]:
import os
import time
import math
import csv
import random
from pathlib import Path
from datetime import datetime

import psutil
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2TokenizerFast
from datasets import load_dataset

# -------------------- USER CONFIG --------------------
# Every artefact produced by this script (raw text, tokenizer files,
# checkpoints, logs, plots) is written somewhere below OUT_DIR.
OUT_DIR = Path("/kaggle/working/hindi-gpt")
CKPT_DIR = OUT_DIR / "checkpoints"
TK_DIR = OUT_DIR / "tokenizer"
LOG_DIR = OUT_DIR / "logs"
OUT_DIR.mkdir(parents=True, exist_ok=True)
for _subdir in (CKPT_DIR, TK_DIR, LOG_DIR):
    _subdir.mkdir(exist_ok=True)

# Model / tokenizer config
BLOCK_SIZE = 128      # context length in tokens
D_MODEL = 512         # embedding / residual stream width
N_HEAD = 8            # attention heads per block (head size = D_MODEL // N_HEAD)
N_LAYER = 8           # number of transformer blocks
DROPOUT = 0.1
VOCAB_SIZE = 32000    # target vocabulary size for BPE training

# Training config
TOTAL_STEPS = 50000   # number of training-loop iterations
BATCH_SIZE = 16       # sequences per batch
EVAL_INTERVAL = 2000  # run estimate_loss() every this many steps
EVAL_ITERS = 100      # batches averaged per split in one evaluation
LEARNING_RATE = 3e-4
SAVE_EVERY = 2000     # checkpoint every this many steps
SEED = 1337

# Data selection
TARGET_BYTES = 1 * 1024**3   # stop collecting documents at ~1 GB of UTF-8 text
MAX_DOCS = None              # optional cap on document count (None => no cap)

# Misc
USE_GRAD_ACCUM = False
GRAD_ACCUM_STEPS = 2         # only consulted when USE_GRAD_ACCUM is True
GENERATE_TOKENS = 200        # tokens generated for the post-training sample
INFERENCE_PROMPT = "भारत की तकनीकी प्रगति के बारे में विस्तृत जानकारी दीजिए और बताइए कि आने वाले वर्षों में भारत किस दिशा में आगे बढ़ेगा।"

# reproducibility: seed both the Python and the torch generators
random.seed(SEED)
torch.manual_seed(SEED)

# device info
device = "cuda" if torch.cuda.is_available() else "cpu"
n_gpus = torch.cuda.device_count()
print(f"Device: {device}, GPUs: {n_gpus}")
if device == "cuda":
    for gpu_index in range(n_gpus):
        print(" GPU", gpu_index, ":", torch.cuda.get_device_name(gpu_index))
print("RAM (GB):", psutil.virtual_memory().total / 1e9)

# -------------------- LOAD DATASET (nisram) & extract ~1GB --------------------
print("Loading nis12ram/nisram-hindi-text-0.0 ...")
ds = load_dataset("nis12ram/nisram-hindi-text-0.0", split="train")

all_texts = ds["text"]
print("Total rows in dataset:", len(all_texts))

# Walk the rows in a random (seeded) order so the subset is a random sample
# of the corpus rather than its first ~1 GB.
indices = list(range(len(all_texts)))
random.shuffle(indices)

texts = []
acc_bytes = 0   # running UTF-8 size of everything collected so far
count = 0       # number of documents collected so far
for row_idx in indices:
    if MAX_DOCS is not None and count >= MAX_DOCS:
        break
    doc = all_texts[row_idx]
    if not doc:
        # skip None / empty rows without counting them
        continue
    acc_bytes += len(doc.encode("utf-8"))
    texts.append(doc)
    count += 1
    if count % 20000 == 0:
        print(f"Collected {count:,} docs, {acc_bytes/1e9:.3f} GB")
    if acc_bytes >= TARGET_BYTES:
        break

print(f"Selected {len(texts):,} documents, approx {acc_bytes/1e9:.3f} GB")

# Dump the selection to disk; the tokenizer trainer reads from this file.
raw_text_file = OUT_DIR / "nisram_hi_raw.txt"
with raw_text_file.open("w", encoding="utf-8") as raw_out:
    for doc in texts:
        cleaned = doc.replace("\r\n", "\n").strip()
        if cleaned:
            raw_out.write(cleaned + "\n")
print("Saved raw text file:", raw_text_file, "size GB:", raw_text_file.stat().st_size / 1e9)

# -------------------- TRAIN BYTE-LEVEL BPE TOKENIZER --------------------
print(f"Training ByteLevel BPE tokenizer (vocab_size = {VOCAB_SIZE}) ...")
bpe = ByteLevelBPETokenizer()
bpe.train(
    files=[str(raw_text_file)],
    vocab_size=VOCAB_SIZE,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
bpe.save_model(str(TK_DIR))
print("Tokenizer model files saved to:", TK_DIR)

# Wrap as a HuggingFace fast tokenizer.
# The special tokens are supplied at load time instead of via a later
# add_special_tokens() call: GPT2TokenizerFast otherwise starts from its
# default "<|endoftext|>" bos/eos/unk token, which is not in this vocabulary
# and may be appended as an extra id beyond tokenizer.vocab_size.
tokenizer = GPT2TokenizerFast.from_pretrained(
    str(TK_DIR),
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
)
print("Tokenizer vocab size reported:", tokenizer.vocab_size)
if len(tokenizer) != tokenizer.vocab_size:
    # The model sizes its embedding table from tokenizer.vocab_size, so any
    # added-token id at or above that value would index out of range.
    print(
        f"WARNING: tokenizer has {len(tokenizer) - tokenizer.vocab_size} added token(s) "
        "outside the base vocabulary; their ids exceed tokenizer.vocab_size"
    )

# -------------------- TOKENIZE & BUILD 1D TOKEN STREAM --------------------
print("Tokenizing texts and building token stream ...")
batch_tokenize = 500
eos_id = tokenizer.eos_token_id

# Token ids are collected as compact int32 NumPy chunks (one per tokenizer
# batch) instead of one giant Python list of boxed ints, which costs several
# times more memory per token for a corpus of this size.
id_chunks = []
tokens_so_far = 0
for i in range(0, len(texts), batch_tokenize):
    batch = texts[i:i+batch_tokenize]
    enc = tokenizer(batch, add_special_tokens=False)
    batch_ids = []
    for seq in enc["input_ids"]:
        # every document is terminated by EOS so documents stay separable
        batch_ids.extend(seq)
        batch_ids.append(eos_id)
    id_chunks.append(np.asarray(batch_ids, dtype=np.int32))
    tokens_so_far += len(batch_ids)
    if (i // batch_tokenize) % 20 == 0:
        print(f"Tokenized up to doc {i+len(batch):,}, tokens so far: {tokens_so_far:,}")

# np.concatenate rejects an empty list, so handle "no documents" explicitly.
all_ids = np.concatenate(id_chunks) if id_chunks else np.empty(0, dtype=np.int32)
del id_chunks  # release the per-batch chunks before allocating the tensor

data = torch.tensor(all_ids, dtype=torch.long)
print("Total tokens in dataset:", len(data))

# -------------------- TRAIN / VAL SPLIT --------------------
# First 90% of the token stream is used for training, the rest for validation.
n = int(0.9 * len(data))
train_data = data[:n].to(device)
val_data = data[n:].to(device)
print("Train tokens:", len(train_data), "Val tokens:", len(val_data))

# -------------------- BATCH / GET_BATCH --------------------
def get_batch(split):
    """Sample one random batch of next-token-prediction examples.

    Args:
        split: "train" selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        Tuple ``(x, y)`` of tensors, each stacking ``BATCH_SIZE`` windows of
        ``BLOCK_SIZE`` tokens; ``y`` is ``x`` shifted one token to the right.
    """
    source = train_data if split == "train" else val_data
    # Exclusive upper bound on start offsets; leaves room for the extra
    # token needed by the shifted target window.
    start_limit = len(source) - BLOCK_SIZE - 1
    starts = torch.randint(0, start_limit, (BATCH_SIZE,))
    inputs, targets = [], []
    for start in starts:
        # One slice of BLOCK_SIZE + 1 tokens yields both input and target.
        window = source[start:start + BLOCK_SIZE + 1]
        inputs.append(window[:-1])
        targets.append(window[1:])
    return torch.stack(inputs), torch.stack(targets)

# -------------------- MODEL DEFINITION (GPT-style) --------------------
class Head(nn.Module):
    """A single head of causal (masked) scaled dot-product self-attention."""

    def __init__(self, head_size):
        """Create the bias-free key/query/value projections of width ``head_size``."""
        super().__init__()
        self.key = nn.Linear(D_MODEL, head_size, bias=False)
        self.query = nn.Linear(D_MODEL, head_size, bias=False)
        self.value = nn.Linear(D_MODEL, head_size, bias=False)
        # Lower-triangular matrix of ones; zeros mark the future positions
        # that a token is not allowed to attend to.
        causal = torch.tril(torch.ones(BLOCK_SIZE, BLOCK_SIZE))
        self.register_buffer("tril", causal)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        """Attend over ``x`` (batch, time, channels) and return (batch, time, head_size)."""
        _, seq_len, _ = x.shape
        keys = self.key(x)
        queries = self.query(x)
        # Scale by 1/sqrt(head_size) before the softmax.
        scale = queries.size(-1) ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale
        future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(future, float("-inf"))
        attn = self.dropout(F.softmax(scores, dim=-1))
        return attn @ self.value(x)

class MultiHeadAttention(nn.Module):
    """Several independent attention heads whose outputs are concatenated and projected."""

    def __init__(self, num_heads, head_size):
        """Build ``num_heads`` heads of width ``head_size`` plus the output projection."""
        super().__init__()
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        self.proj = nn.Linear(D_MODEL, D_MODEL)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        """Run every head on ``x``, join along the channel axis, project, apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4 * D_MODEL, GELU, project back, dropout."""

    def __init__(self):
        super().__init__()
        hidden_dim = 4 * D_MODEL
        layers = [
            nn.Linear(D_MODEL, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, D_MODEL),
            nn.Dropout(DROPOUT),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        return self.net(x)

class Block(nn.Module):
    """Pre-LayerNorm transformer block: self-attention then feed-forward, each with a residual."""

    def __init__(self):
        super().__init__()
        self.sa = MultiHeadAttention(N_HEAD, D_MODEL // N_HEAD)
        self.ffwd = FeedForward()
        self.ln1 = nn.LayerNorm(D_MODEL)
        self.ln2 = nn.LayerNorm(D_MODEL)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

class GPT(nn.Module):
    """Decoder-only language model: token + position embeddings, N_LAYER blocks, LM head."""

    def __init__(self):
        super().__init__()
        vocab = tokenizer.vocab_size
        self.token_embedding_table = nn.Embedding(vocab, D_MODEL)
        self.position_embedding_table = nn.Embedding(BLOCK_SIZE, D_MODEL)
        layers = [Block() for _ in range(N_LAYER)]
        self.blocks = nn.Sequential(*layers)
        self.ln_f = nn.LayerNorm(D_MODEL)
        self.lm_head = nn.Linear(D_MODEL, vocab)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if ``targets`` is given, the mean cross-entropy.

        Args:
            idx: token ids of shape (batch, time).
            targets: optional token ids with the same number of elements as ``idx``.

        Returns:
            ``(logits, loss)``; ``loss`` is ``None`` when ``targets`` is ``None``.
        """
        batch, seq_len = idx.shape
        positions = torch.arange(seq_len, device=idx.device)
        hidden = self.token_embedding_table(idx) + self.position_embedding_table(positions)
        hidden = self.ln_f(self.blocks(hidden))
        logits = self.lm_head(hidden)
        if targets is None:
            return logits, None
        # Flatten so every position is one classification example.
        # reduction='mean' keeps each replica's loss a scalar, so losses
        # gathered from several GPUs can simply be averaged.
        flat_logits = logits.view(batch * seq_len, -1)
        flat_targets = targets.view(batch * seq_len)
        loss = F.cross_entropy(flat_logits, flat_targets, reduction='mean')
        return logits, loss

# -------------------- INSTANTIATE MODEL & OPTIMIZER --------------------
model = GPT().to(device)
param_count = sum(param.numel() for param in model.parameters())
print(f"Model parameters: {param_count/1e6:.2f}M")

# Simple multi-GPU: replicate the model with DataParallel when more than one
# CUDA device is visible.
if n_gpus > 1 and device == "cuda":
    model = nn.DataParallel(model)
    print("Wrapped model in DataParallel for", n_gpus, "GPUs")

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# Loss scaling for mixed precision; disabled (pass-through) when running on CPU.
scaler = torch.amp.GradScaler(enabled=(device=="cuda"))

# -------------------- EVAL / METRICS --------------------
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    Puts the model in eval mode, averages the loss of ``EVAL_ITERS`` random
    batches per split, then switches the model back to train mode.

    Returns:
        Dict with float entries under the keys "train" and "val".
    """
    amp_device = 'cuda' if device=='cuda' else 'cpu'
    amp_enabled = (device=="cuda")
    results = {}
    model.eval()
    for split in ("train", "val"):
        batch_losses = []
        for _ in range(EVAL_ITERS):
            xb, yb = get_batch(split)
            xb = xb.to(device)
            yb = yb.to(device)
            with torch.amp.autocast(device_type=amp_device, enabled=amp_enabled):
                _, loss = model(xb, yb)
            # .mean() collapses the per-device losses DataParallel may return
            batch_losses.append(loss.mean().item())
        results[split] = float(np.mean(batch_losses))
    model.train()
    return results

def ppl_from_loss(loss_val):
    """Convert a cross-entropy loss to perplexity, ``exp(loss_val)``.

    Losses that are not below 50 are reported as infinity instead of being
    exponentiated.
    """
    if loss_val < 50:
        return math.exp(loss_val)
    return float("inf")

# -------------------- GENERATION (greedy) --------------------
@torch.no_grad()
def generate_text(prompt, max_new_tokens=200):
    """Greedily extend ``prompt`` by ``max_new_tokens`` tokens and decode the result.

    Args:
        prompt: text to condition on; must encode to at least one token.
        max_new_tokens: number of tokens to append.

    Returns:
        The decoded prompt plus continuation, with special tokens removed.

    Raises:
        ValueError: if ``prompt`` encodes to zero tokens (the model needs at
            least one position to predict from).
    """
    # For a batch of one, DataParallel only adds scatter/gather overhead,
    # so call the wrapped module directly when the model is wrapped.
    net = model.module if isinstance(model, nn.DataParallel) else model

    tokens = tokenizer.encode(prompt)
    if not tokens:
        raise ValueError("prompt must encode to at least one token")

    was_training = net.training
    net.eval()
    try:
        idx = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)
        for _ in range(max_new_tokens):
            # Only the last BLOCK_SIZE tokens fit the position-embedding table.
            idx_cond = idx[:, -BLOCK_SIZE:]
            with torch.amp.autocast(device_type='cuda' if device=='cuda' else 'cpu', enabled=(device=="cuda")):
                logits, _ = net(idx_cond)
            # Softmax is monotonic, so argmax over the raw logits picks the
            # same token without the extra pass (and without low-precision
            # rounding creating ties). keepdim gives shape (batch, 1).
            next_id = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
            idx = torch.cat((idx, next_id), dim=1)
        # idx[0] is always 1-D, unlike squeeze() on a single-token sequence.
        return tokenizer.decode(idx[0].tolist(), skip_special_tokens=True)
    finally:
        # Restore the previous mode, even if generation raised.
        net.train(was_training)

# -------------------- LOGGING SETUP --------------------
# Start a fresh per-step CSV log containing only the header row; the training
# loop appends one row per step.
log_csv = LOG_DIR / "train_log.csv"
with open(log_csv, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow([
        "step",
        "train_loss_iter",
        "eval_train_loss",
        "eval_val_loss",
        "eval_train_ppl",
        "eval_val_ppl",
        "step_time_s",
        "save_time_s",
        "timestamp",
    ])

# In-memory histories consumed by the plotting / timing sections.
loss_history = []   # (step, training loss of that step)
eval_history = []   # (step, eval train loss, eval val loss)
perp_history = []   # (step, validation perplexity)
timings = []        # one dict per step

# -------------------- TRAIN LOOP --------------------
print("Starting training for", TOTAL_STEPS, "steps ...")
global_start = time.time()
step = 0

# initial eval: baseline losses before any optimizer step
init = estimate_loss()
print(f"Initial eval — Train {init['train']:.4f}, Val {init['val']:.4f}")

while step < TOTAL_STEPS:
    step += 1
    t0 = time.time()

    xb, yb = get_batch("train")
    xb, yb = xb.to(device), yb.to(device)

    # forward pass under AMP (autocast is disabled when not running on CUDA)
    with torch.amp.autocast(device_type='cuda' if device=='cuda' else 'cpu', enabled=(device=="cuda")):
        logits, loss = model(xb, yb)

    # reduce across devices (in case DataParallel returns vector-like loss)
    loss = loss.mean()

    # scale the loss down when accumulating so the summed gradient is an average
    loss_to_backprop = loss / (GRAD_ACCUM_STEPS if USE_GRAD_ACCUM else 1)
    scaler.scale(loss_to_backprop).backward()

    # step the optimizer every iteration, or every GRAD_ACCUM_STEPS when accumulating
    if (not USE_GRAD_ACCUM) or (step % GRAD_ACCUM_STEPS == 0):
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)

    step_time = time.time() - t0
    iter_train_loss = float(loss.item())
    loss_history.append((step, iter_train_loss))

    # Per-step log fields start blank and are filled only when this step
    # actually evaluates / saves. Previously the eval columns repeated the
    # values of the last evaluation on every later row, which made
    # non-evaluation steps look as if they had been evaluated.
    eval_fields = ["", "", "", ""]
    save_time_s = ""

    # periodic evaluation
    if step % EVAL_INTERVAL == 0 or step == TOTAL_STEPS:
        t_eval_start = time.time()
        metrics = estimate_loss()
        t_eval = time.time() - t_eval_start
        train_loss_eval = metrics["train"]
        val_loss_eval = metrics["val"]
        train_ppl = ppl_from_loss(train_loss_eval)
        val_ppl = ppl_from_loss(val_loss_eval)
        print(f"[Step {step}] iter_loss {iter_train_loss:.4f} | Eval Train {train_loss_eval:.4f} Val {val_loss_eval:.4f} | ValPPL {val_ppl:.2f} | eval_time {t_eval:.1f}s")
        eval_history.append((step, train_loss_eval, val_loss_eval))
        perp_history.append((step, val_ppl))
        eval_fields = [train_loss_eval, val_loss_eval, train_ppl, val_ppl]

    # periodic checkpointing
    if step % SAVE_EVERY == 0 or step == TOTAL_STEPS:
        t_save_start = time.time()
        # save the underlying module so checkpoint keys carry no "module." prefix
        model_to_save = model.module if isinstance(model, nn.DataParallel) else model
        torch.save(model_to_save.state_dict(), CKPT_DIR / f"gpt2_hindi_step{step}.pt")
        tokenizer.save_pretrained(str(TK_DIR))
        save_time_s = time.time() - t_save_start
        print(f"Saved checkpoint step {step} (save_time {save_time_s:.2f}s)")

    # append log row (the file is reopened each step so rows reach disk promptly)
    with open(log_csv, "a", newline="") as f:
        writer = csv.writer(f)
        row = [
            step,
            iter_train_loss,
            *eval_fields,
            round(step_time, 6),
            save_time_s,
            datetime.utcnow().isoformat()
        ]
        writer.writerow(row)

    timings.append({"step": step, "step_time_s": step_time, "loss": iter_train_loss, "save_time_s": save_time_s})

# END training
total_time = time.time() - global_start
print("Training finished. Total time (s):", total_time)

# final save: write the unwrapped model weights and the tokenizer files
t_final_save_start = time.time()
if isinstance(model, nn.DataParallel):
    model_to_save = model.module
else:
    model_to_save = model
final_ckpt_path = CKPT_DIR / "gpt2_hindi_final.pt"
torch.save(model_to_save.state_dict(), final_ckpt_path)
tokenizer.save_pretrained(str(TK_DIR))
t_final_save = time.time() - t_final_save_start
print("Final model saved (s):", t_final_save)

# -------------------- INFERENCE TIMING --------------------
# Time one greedy generation from the configured prompt and store the sample.
print("Generating sample for prompt (timing)...")
t_inf_start = time.time()
generated = generate_text(INFERENCE_PROMPT, max_new_tokens=GENERATE_TOKENS)
t_inf = time.time() - t_inf_start
print(f"Inference time (s) for {GENERATE_TOKENS} tokens: {t_inf:.3f}s")

sample_sections = [
    "PROMPT:\n" + INFERENCE_PROMPT + "\n\n",
    "GENERATED:\n" + generated + "\n\n",
    f"Inference time (s): {t_inf}\n",
]
with open(OUT_DIR / "sample_generation.txt", "w", encoding="utf-8") as f:
    f.writelines(sample_sections)

# -------------------- PLOTS --------------------
# Split the recorded history tuples into parallel series for plotting.
steps_arr = [entry[0] for entry in loss_history]
loss_arr = [entry[1] for entry in loss_history]
eval_steps = [entry[0] for entry in eval_history]
eval_train_arr = [entry[1] for entry in eval_history]
eval_val_arr = [entry[2] for entry in eval_history]
perp_steps = [entry[0] for entry in perp_history]
perp_arr = [entry[1] for entry in perp_history]

# Loss curve: per-step training loss, overlaid with the periodic eval losses.
if loss_arr:
    loss_plot_path = OUT_DIR / "loss_curve.png"
    plt.figure(figsize=(10,6))
    plt.plot(steps_arr, loss_arr, label="iter train loss", alpha=0.7)
    if eval_steps:
        plt.plot(eval_steps, eval_train_arr, label="eval train loss", marker="o")
        plt.plot(eval_steps, eval_val_arr, label="eval val loss", marker="o")
    plt.xlabel("Step")
    plt.ylabel("Loss")
    plt.title("Loss vs Step")
    plt.legend()
    plt.grid(True)
    plt.savefig(loss_plot_path, bbox_inches="tight")
    plt.close()
    print("Saved loss plot:", loss_plot_path)

# Validation perplexity at each evaluation step.
if perp_arr:
    perp_plot_path = OUT_DIR / "perplexity_curve.png"
    plt.figure(figsize=(10,6))
    plt.plot(perp_steps, perp_arr, label="val perplexity", marker="o")
    plt.xlabel("Step")
    plt.ylabel("Perplexity")
    plt.title("Perplexity vs Step")
    plt.legend()
    plt.grid(True)
    plt.savefig(perp_plot_path, bbox_inches="tight")
    plt.close()
    print("Saved perplexity plot:", perp_plot_path)

# save timings CSV: one row per training step, taken from the `timings` list
timings_csv = OUT_DIR / "timings.csv"
with open(timings_csv, "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["step","step_time_s","loss","save_time_s"])
    writer.writeheader()
    writer.writerows(timings)
print("Saved timings CSV:", timings_csv)

# -------------------- SUMMARY --------------------
# Key hyper-parameters and output locations, echoed once at the end of the run.
summary = {
    "total_steps": TOTAL_STEPS,
    "batch_size": BATCH_SIZE,
    "block_size": BLOCK_SIZE,
    "d_model": D_MODEL,
    "n_head": N_HEAD,
    "n_layer": N_LAYER,
    "vocab_size": tokenizer.vocab_size,
    "final_checkpoint": str(CKPT_DIR / "gpt2_hindi_final.pt"),
    "tokenizer_dir": str(TK_DIR),
    "loss_plot": str(OUT_DIR / "loss_curve.png"),
    "perplexity_plot": str(OUT_DIR / "perplexity_curve.png"),
    "timings_csv": str(timings_csv),
    "inference_time_s": t_inf,
}
print("SUMMARY:")
for key, value in summary.items():
    print(f"  {key}: {value}")

print("All outputs saved under:", OUT_DIR)


Device: cuda, GPUs: 2
 GPU 0 : Tesla T4
 GPU 1 : Tesla T4
RAM (GB): 33.662328832
Loading nis12ram/nisram-hindi-text-0.0 ...
Total rows in dataset: 601628
Collected 20,000 docs, 0.091 GB
Collected 40,000 docs, 0.181 GB
Collected 60,000 docs, 0.271 GB
Collected 80,000 docs, 0.361 GB
Collected 100,000 docs, 0.451 GB
Collected 120,000 docs, 0.542 GB
Collected 140,000 docs, 0.632 GB
Collected 160,000 docs, 0.722 GB
Collected 180,000 docs, 0.811 GB
Collected 200,000 docs, 0.902 GB
Collected 220,000 docs, 0.991 GB
Selected 238,301 documents, approx 1.074 GB
Saved raw text file: /kaggle/working/hindi-gpt/nisram_hi_raw.txt size GB: 1.073857009
Training ByteLevel BPE tokenizer (vocab_size = 32000) ...



Tokenizer model files saved to: /kaggle/working/hindi-gpt/tokenizer
Tokenizer vocab size reported: 32000
Tokenizing texts and building token stream ...
Tokenized up to doc 500, tokens so far: 580,802
Tokenized up to doc 10,500, tokens so far: 11,988,665
Tokenized up to doc 20,500, tokens so far: