In [1]:
!pip install --upgrade --no-cache-dir transformers==4.45.2 indic-transliteration -q
!pip install --no-cache-dir scikit-learn pandas tqdm matplotlib sentencepiece -q

import os, json, random, numpy as np, torch
from pathlib import Path

# ---- Repro / device / dirs
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
WORK_DIR = Path("/kaggle/working"); WORK_DIR.mkdir(parents=True, exist_ok=True)

# ---- Config
TEACHER_MODEL_ID = "csebuetnlp/banglabert"
STUDENT_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
MAX_LEN = 128
BATCH_SIZE = 16

# Teacher FT
EPOCHS_TEACHER = 3
LR_TEACHER = 2e-5
WARMUP_RATIO_T = 0.1
WEIGHT_DECAY_T = 0.01

# KD
EPOCHS_STUDENT = 5
LR_STUDENT = 3e-5
WARMUP_RATIO_S = 0.1
WEIGHT_DECAY_S = 0.01
PATIENCE = 2

KD_T = 3.0
KD_ALPHA = 0.5
GAMMA_HIDDEN = 1.0

print("‚úÖ Device:", DEVICE)


[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.4/44.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.9/9.9 MB[0m [31m113.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m159.6/159.6 kB[0m [31m330.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m566.1/566.1 kB[0m [31m351.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.0/3.0 MB[0m [31m289.9 MB/s[0m eta [36m0

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path

DATA_DIR = Path("/kaggle/input/dataaaaaaa")
POS_FILE = DATA_DIR / "all_positive_8500.txt"
NEG_FILE = DATA_DIR / "all_negative_3307.txt"
assert POS_FILE.exists() and NEG_FILE.exists(), f"Missing data: {POS_FILE}, {NEG_FILE}"

def read_txt(p: Path):
    with open(p, encoding="utf-8") as f:
        return [ln.strip() for ln in f if ln.strip()]

pos, neg = read_txt(POS_FILE), read_txt(NEG_FILE)
df = pd.DataFrame({"text": pos + neg, "label": [1]*len(pos) + [0]*len(neg)}).sample(frac=1, random_state=SEED)

train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=SEED)
val_df,   test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=SEED)
print(f"Train={len(train_df)} | Val={len(val_df)} | Test={len(test_df)}")


Train=9445 | Val=1181 | Test=1181


# train teacher

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm

class TxtClsDataset(Dataset):
    def __init__(self, df, tok, max_len):
        self.texts = df.text.tolist(); self.labels = df.label.tolist()
        self.tok, self.max_len = tok, max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, i):
        enc = self.tok(self.texts[i], truncation=True, padding="max_length",
                       max_length=self.max_len, return_tensors="pt")
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[i], dtype=torch.long)
        }

teacher_tok = AutoTokenizer.from_pretrained(TEACHER_MODEL_ID)
teacher = AutoModelForSequenceClassification.from_pretrained(TEACHER_MODEL_ID, num_labels=2).to(DEVICE)

tr_loader = DataLoader(TxtClsDataset(train_df, teacher_tok, MAX_LEN), batch_size=BATCH_SIZE, shuffle=True)
va_loader = DataLoader(TxtClsDataset(val_df, teacher_tok, MAX_LEN), batch_size=BATCH_SIZE)
te_loader = DataLoader(TxtClsDataset(test_df, teacher_tok, MAX_LEN), batch_size=BATCH_SIZE)

opt = AdamW(teacher.parameters(), lr=LR_TEACHER, weight_decay=WEIGHT_DECAY_T)
steps = len(tr_loader) * EPOCHS_TEACHER
sch = get_linear_schedule_with_warmup(opt, int(WARMUP_RATIO_T*steps), steps)
criterion = torch.nn.CrossEntropyLoss()

best_f1 = -1
for ep in range(1, EPOCHS_TEACHER+1):
    teacher.train(); total = 0
    for b in tqdm(tr_loader, desc=f"Teacher Epoch {ep}/{EPOCHS_TEACHER}"):
        b = {k:v.to(DEVICE) for k,v in b.items()}
        out = teacher(**b)
        loss = out.loss
        loss.backward()
        opt.step(); sch.step(); opt.zero_grad()
        total += loss.item()

    teacher.eval(); preds, gold = [], []
    with torch.no_grad():
        for b in va_loader:
            b = {k:v.to(DEVICE) for k,v in b.items()}
            out = teacher(**b)
            preds += out.logits.argmax(-1).cpu().tolist()
            gold  += b["labels"].cpu().tolist()
    acc = accuracy_score(gold, preds)
    f1m = f1_score(gold, preds, average="macro")
    print(f"Val: Acc={acc:.4f} | F1_macro={f1m:.4f}")
    if f1m > best_f1:
        best_f1 = f1m
        save_dir = WORK_DIR / "finetuned_banglabert"
        save_dir.mkdir(parents=True, exist_ok=True)
        teacher.save_pretrained(save_dir)
        teacher_tok.save_pretrained(save_dir)
        print("üíæ Saved best BanglaBERT teacher.")

# quick test
teacher.eval(); preds, gold = [], []
with torch.no_grad():
    for b in te_loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        out = teacher(**b)
        preds += out.logits.argmax(-1).cpu().tolist()
        gold  += b["labels"].cpu().tolist()
print("‚úÖ BanglaBERT Teacher [Test]: Acc={:.4f} | F1_macro={:.4f}".format(
    accuracy_score(gold, preds), f1_score(gold, preds, average="macro")))


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Teacher Epoch 1/3:   0%|          | 0/591 [00:00<?, ?it/s]

Val: Acc=0.9577 | F1_macro=0.9489
üíæ Saved best BanglaBERT teacher.


Teacher Epoch 2/3:   0%|          | 0/591 [00:00<?, ?it/s]

Val: Acc=0.9670 | F1_macro=0.9586
üíæ Saved best BanglaBERT teacher.


Teacher Epoch 3/3:   0%|          | 0/591 [00:00<?, ?it/s]

Val: Acc=0.9678 | F1_macro=0.9601
üíæ Saved best BanglaBERT teacher.
‚úÖ BanglaBERT Teacher [Test]: Acc=0.9687 | F1_macro=0.9611


# KD Data (Transliteration, 2 tokenizers)

In [5]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

def transliterate_bn_text(txt: str) -> str:
    try:
        return transliterate(txt, sanscript.BENGALI, sanscript.ITRANS)
    except Exception:
        return txt

from transformers import AutoTokenizer
student_tok = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

class KDDataset(Dataset):
    def __init__(self, df, t_tok, s_tok, max_len):
        self.texts = df.text.tolist(); self.labels = df.label.tolist()
        self.ttok, self.stok, self.max_len = t_tok, s_tok, max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, i):
        txt_bn = self.texts[i]; txt_en = transliterate_bn_text(txt_bn)
        t = self.ttok(txt_bn, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        s = self.stok(txt_en, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        return {
            "t_input_ids": t["input_ids"].squeeze(0),
            "t_attention_mask": t["attention_mask"].squeeze(0),
            "s_input_ids": s["input_ids"].squeeze(0),
            "s_attention_mask": s["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[i], dtype=torch.long)
        }

def pad_collate(batch, t_pad, s_pad):
    out = {}
    for k in batch[0]:
        if k == "labels": out[k] = torch.stack([b[k] for b in batch])
        elif k.startswith("t_"):
            padv = 0 if "attention" in k else t_pad
            out[k] = nn.utils.rnn.pad_sequence([b[k] for b in batch], batch_first=True, padding_value=padv)
        elif k.startswith("s_"):
            padv = 0 if "attention" in k else s_pad
            out[k] = nn.utils.rnn.pad_sequence([b[k] for b in batch], batch_first=True, padding_value=padv)
    return out

train_loader = DataLoader(KDDataset(train_df, teacher_tok, student_tok, MAX_LEN),
                          batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=lambda b: pad_collate(b, teacher_tok.pad_token_id, student_tok.pad_token_id))
val_loader = DataLoader(KDDataset(val_df, teacher_tok, student_tok, MAX_LEN),
                        batch_size=BATCH_SIZE,
                        collate_fn=lambda b: pad_collate(b, teacher_tok.pad_token_id, student_tok.pad_token_id))
test_loader = DataLoader(KDDataset(test_df, teacher_tok, student_tok, MAX_LEN),
                         batch_size=BATCH_SIZE,
                         collate_fn=lambda b: pad_collate(b, teacher_tok.pad_token_id, student_tok.pad_token_id))
print("‚úÖ KD dataloaders ready (BanglaBERT‚ÜíMiniLM).")


‚úÖ KD dataloaders ready (BanglaBERT‚ÜíMiniLM).


# Student (MiniLM-L6-v2) head (logits + hidden)

In [6]:
import torch.nn as nn
from transformers import AutoModel

class StudentClassifier(nn.Module):
    def __init__(self, base_model_id, num_labels=2, dropout=0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_id)
        s_H = self.encoder.config.hidden_size            # MiniLM hidden size (often 384)
        self.s_hidden = s_H
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(s_H, num_labels)
    def forward(self, input_ids=None, attention_mask=None, **_):
        out = self.encoder(input_ids=input_ids,
                           attention_mask=attention_mask,
                           output_hidden_states=True,
                           return_dict=True)
        cls = out.last_hidden_state[:, 0, :]
        logits = self.fc(self.dropout(cls))
        return {"logits": logits, "hidden_states": out.hidden_states}

student = StudentClassifier(STUDENT_MODEL_ID).to(DEVICE)
print("‚úÖ Student initialized. Hidden size =", student.s_hidden)


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

‚úÖ Student initialized. Hidden size = 384


# KD Projection + Loss (CE + KL + HiddenProj)

In [7]:
import torch.nn.functional as F

# discover sizes
t_hidden = teacher.config.hidden_size    # XLM-R = 768
s_hidden = student.s_hidden              # MiniLM-L6-v2 = 384

class KDProjectionHead(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.bridge = nn.Sequential(
            nn.Linear(in_dim, out_dim),
            nn.GELU(),
            nn.LayerNorm(out_dim)
        )
    def forward(self, x):
        return self.bridge(x)

proj_head = KDProjectionHead(s_hidden, t_hidden).to(DEVICE)

class KDLossProj(nn.Module):
    def __init__(self, T=3.0, alpha=0.5, gamma_h=1.0):
        super().__init__()
        self.T, self.alpha, self.gamma_h = T, alpha, gamma_h
        self.ce  = nn.CrossEntropyLoss()
        self.kld = nn.KLDivLoss(reduction="batchmean")
        self.mse = nn.MSELoss()

    @staticmethod
    def map_layers(n_s, n_t):
        # skip embeddings index 0; map hidden layers 1..n
        s_idx = list(range(1, n_s))  # student hidden_states length includes embeddings at 0
        t_idx = torch.linspace(1, n_t-1, steps=len(s_idx)).round().long().tolist()
        return list(zip(s_idx, t_idx))

    def forward(self, s_pack, t_pack, labels):
        # logits: CE + KL
        logits_s, logits_t = s_pack["logits"], t_pack["logits"]
        hard = self.ce(logits_s, labels)
        soft = self.kld(F.log_softmax(logits_s/self.T, dim=-1),
                        F.softmax(logits_t/self.T,  dim=-1)) * (self.T**2)
        loss = (1 - self.alpha)*hard + self.alpha*soft

        # hidden: MSE(proj(student_h), teacher_h) with proportional mapping
        hs, ht = s_pack.get("hidden_states", []), t_pack.get("hidden_states", [])
        if hs and ht:
            pairs = self.map_layers(len(hs), len(ht))
            h_losses = []
            for i_s, i_t in pairs:
                s_h = proj_head(hs[i_s])           # [B, L, t_hidden]
                t_h = ht[i_t]
                L = min(s_h.size(1), t_h.size(1))
                h_losses.append(self.mse(s_h[:, :L, :], t_h[:, :L, :]))
            if h_losses:
                loss = loss + GAMMA_HIDDEN * torch.stack(h_losses).mean()

        return loss

criterion = KDLossProj(T=KD_T, alpha=KD_ALPHA, gamma_h=GAMMA_HIDDEN)
print("‚úÖ KD loss & projection ready. (student‚Üíteacher dims: {}‚Üí{})".format(s_hidden, t_hidden))


‚úÖ KD loss & projection ready. (student‚Üíteacher dims: 384‚Üí768)


# KD Training (teacher frozen)

In [8]:
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# freeze teacher for KD
teacher.eval()
for p in teacher.parameters(): p.requires_grad = False

opt = AdamW(list(student.parameters()) + list(proj_head.parameters()),
            lr=LR_STUDENT, weight_decay=WEIGHT_DECAY_S)
num_steps = EPOCHS_STUDENT * len(train_loader)
sch = get_linear_schedule_with_warmup(opt, int(WARMUP_RATIO_S * num_steps), num_steps)

def metrics(preds, gold):
    return {
        "accuracy": accuracy_score(gold, preds),
        "f1_macro": f1_score(gold, preds, average="macro"),
        "f1_weighted": f1_score(gold, preds, average="weighted"),
    }

@torch.no_grad()
def eval_student(loader):
    student.eval(); proj_head.eval()
    preds, gold = [], []
    for b in loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        out = student(input_ids=b["s_input_ids"], attention_mask=b["s_attention_mask"])
        preds += out["logits"].argmax(-1).cpu().tolist()
        gold  += b["labels"].cpu().tolist()
    return metrics(np.array(preds), np.array(gold))

best_f1, wait = -1.0, 0

for ep in range(1, EPOCHS_STUDENT+1):
    student.train(); proj_head.train()
    run = 0.0

    for b in tqdm(train_loader, desc=f"[KD Epoch {ep}/{EPOCHS_STUDENT}]"):
        labels = b["labels"].to(DEVICE)

        s_out = student(input_ids=b["s_input_ids"].to(DEVICE),
                        attention_mask=b["s_attention_mask"].to(DEVICE),
                        )

        with torch.no_grad():
            t_raw = teacher(input_ids=b["t_input_ids"].to(DEVICE),
                            attention_mask=b["t_attention_mask"].to(DEVICE),
                            output_hidden_states=True,
                            return_dict=True)
            t_out = {"logits": t_raw.logits, "hidden_states": t_raw.hidden_states}

        loss = criterion(s_out, t_out, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(list(student.parameters()) + list(proj_head.parameters()), 1.0)
        opt.step(); sch.step(); opt.zero_grad()
        run += loss.item()

    val = eval_student(val_loader)
    print(f"[KD] loss={run/len(train_loader):.4f} | Val Acc={val['accuracy']:.4f} | "
          f"F1m={val['f1_macro']:.4f} | F1w={val['f1_weighted']:.4f}")

    if val["f1_macro"] > best_f1:
        best_f1, wait = val["f1_macro"], 0
        torch.save({"student": student.state_dict(), "proj": proj_head.state_dict()},
                   WORK_DIR / "student_minilm_kd_best.pt")
        print("üíæ Saved best student.")
    else:
        wait += 1
        if wait >= PATIENCE:
            print("‚è∏Ô∏è Early stopping.")
            break

# reload best
ckpt = torch.load(WORK_DIR / "student_minilm_kd_best.pt", map_location=DEVICE)
student.load_state_dict(ckpt["student"]); proj_head.load_state_dict(ckpt["proj"])
student.eval(); proj_head.eval()
print("‚úÖ KD complete & best reloaded.")




[KD Epoch 1/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=1.8668 | Val Acc=0.9170 | F1m=0.8952 | F1w=0.9162
üíæ Saved best student.


[KD Epoch 2/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=1.0859 | Val Acc=0.9136 | F1m=0.8837 | F1w=0.9096


[KD Epoch 3/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=0.9249 | Val Acc=0.9399 | F1m=0.9247 | F1w=0.9396
üíæ Saved best student.


[KD Epoch 4/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=0.8058 | Val Acc=0.9458 | F1m=0.9331 | F1w=0.9459
üíæ Saved best student.


[KD Epoch 5/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=0.7441 | Val Acc=0.9450 | F1m=0.9322 | F1w=0.9451
‚úÖ KD complete & best reloaded.


# Test Metrics + Alignment + Save

In [9]:
from scipy.special import softmax
from scipy.spatial.distance import cosine

@torch.no_grad()
def eval_model(model, loader, mode="student"):
    preds, gold = [], []
    for b in loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        inp = {"input_ids": b["t_input_ids"], "attention_mask": b["t_attention_mask"]} if mode=="teacher" else \
              {"input_ids": b["s_input_ids"], "attention_mask": b["s_attention_mask"]}
        out = model(**inp)
        logits = out["logits"] if isinstance(out, dict) else out.logits
        preds += logits.argmax(-1).cpu().tolist()
        gold  += b["labels"].cpu().tolist()
    return {
        "accuracy": accuracy_score(gold, preds),
        "f1_macro": f1_score(gold, preds, average="macro"),
        "f1_weighted": f1_score(gold, preds, average="weighted")
    }

print("üß™ Evaluating on test‚Ä¶")
teacher_test = eval_model(teacher, test_loader, mode="teacher")
student_test = eval_model(student, test_loader, mode="student")
print("[Teacher][Test]:", teacher_test)
print("[Student][Test]:", student_test)

@torch.no_grad()
def alignment_metrics(teacher, student, loader):
    cos_list, corr_list, agree = [], [], []
    for b in loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        t = teacher(b["t_input_ids"], b["t_attention_mask"])
        s = student(b["s_input_ids"], b["s_attention_mask"])
        t_logits = t.logits.detach().cpu().numpy()
        s_logits = s["logits"].detach().cpu().numpy()
        t_probs  = softmax(t_logits, axis=-1)
        s_probs  = softmax(s_logits, axis=-1)
        for tl, sl, tp, sp in zip(t_logits, s_logits, t_probs, s_probs):
            cos_list.append(1 - cosine(tl, sl))
            corr_list.append(np.corrcoef(tp, sp)[0, 1])
            agree.append(np.argmax(tp) == np.argmax(sp))
    return {
        "logit_cosine": float(np.nanmean(cos_list)),
        "prob_corr": float(np.nanmean(corr_list)),
        "pred_alignment": float(np.mean(agree))
    }

align = alignment_metrics(teacher, student, test_loader)
print(f"""
üß© Alignment (Test)
  ‚Ä¢ Logit cosine : {align['logit_cosine']:.4f}
  ‚Ä¢ Prob corr    : {align['prob_corr']:.4f}
  ‚Ä¢ Agreement    : {align['pred_alignment']:.4f}
""")

# ---- Save artifacts
SAVE_DIR = WORK_DIR / "student_minilm_translit_kd_proj"
SAVE_DIR.mkdir(parents=True, exist_ok=True)
torch.save(student.state_dict(), SAVE_DIR / "pytorch_model.bin")
from transformers import AutoTokenizer
# student tokenizer saving
AutoTokenizer.from_pretrained(STUDENT_MODEL_ID).save_pretrained(SAVE_DIR)

meta = {
    "teacher_model": TEACHER_MODEL_ID,
    "student_model": STUDENT_MODEL_ID,
    "kd_temperature": KD_T,
    "alpha": KD_ALPHA,
    "gamma_hidden": GAMMA_HIDDEN,
    "max_len": MAX_LEN,
    "lr_student": LR_STUDENT,
    "epochs_student": EPOCHS_STUDENT
}
json.dump(meta, open(SAVE_DIR / "student_config.json", "w"), indent=2, ensure_ascii=False)

def to_py(o):
    if isinstance(o, dict): return {k: to_py(v) for k,v in o.items()}
    if hasattr(o, "item"): return o.item()
    return o

json.dump({"teacher_test": to_py(teacher_test),
           "student_test": to_py(student_test),
           "alignment": to_py(align)},
          open(WORK_DIR / "metrics_minilm_kd_proj.json", "w"), indent=2, ensure_ascii=False)

print("‚úÖ Saved student + metrics to:", SAVE_DIR, "and", WORK_DIR / "metrics_minilm_kd_proj.json")


üß™ Evaluating on test‚Ä¶
[Teacher][Test]: {'accuracy': 0.9686706181202371, 'f1_macro': 0.961136147554033, 'f1_weighted': 0.9686561287537637}
[Student][Test]: {'accuracy': 0.9458086367485182, 'f1_macro': 0.9329615280911632, 'f1_weighted': 0.9458583354280438}

üß© Alignment (Test)
  ‚Ä¢ Logit cosine : 0.9036
  ‚Ä¢ Prob corr    : 0.9069
  ‚Ä¢ Agreement    : 0.9534

‚úÖ Saved student + metrics to: /kaggle/working/student_minilm_translit_kd_proj and /kaggle/working/metrics_minilm_kd_proj.json


# Load the Student Model 

In [10]:
import torch
from pathlib import Path

ckpt_path = Path("/kaggle/working/student_minilm_kd_best.pt")
ckpt = torch.load(ckpt_path, map_location="cpu")

# find actual state_dict
for key in ["student", "model", "state_dict"]:
    if key in ckpt:
        state_dict = ckpt[key]
        print(f"Found state_dict under '{key}'")
        break
else:
    state_dict = ckpt
    print("No wrapper key found; using raw checkpoint")

# show first 40 parameter names
print("\n".join(list(state_dict.keys())[:40]))


Found state_dict under 'student'
encoder.embeddings.word_embeddings.weight
encoder.embeddings.position_embeddings.weight
encoder.embeddings.token_type_embeddings.weight
encoder.embeddings.LayerNorm.weight
encoder.embeddings.LayerNorm.bias
encoder.encoder.layer.0.attention.self.query.weight
encoder.encoder.layer.0.attention.self.query.bias
encoder.encoder.layer.0.attention.self.key.weight
encoder.encoder.layer.0.attention.self.key.bias
encoder.encoder.layer.0.attention.self.value.weight
encoder.encoder.layer.0.attention.self.value.bias
encoder.encoder.layer.0.attention.output.dense.weight
encoder.encoder.layer.0.attention.output.dense.bias
encoder.encoder.layer.0.attention.output.LayerNorm.weight
encoder.encoder.layer.0.attention.output.LayerNorm.bias
encoder.encoder.layer.0.intermediate.dense.weight
encoder.encoder.layer.0.intermediate.dense.bias
encoder.encoder.layer.0.output.dense.weight
encoder.encoder.layer.0.output.dense.bias
encoder.encoder.layer.0.output.LayerNorm.weight
encoder

# Load the Student Model

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from pathlib import Path

WORK_DIR = Path("/kaggle/working")
DEVICE = "cpu"

ckpt_path = WORK_DIR / "student_minilm_kd_best.pt"
base_model_name = "sentence-transformers/all-MiniLM-L6-v2"

ckpt = torch.load(ckpt_path, map_location="cpu")
state_dict = ckpt["student"] if "student" in ckpt else ckpt.get("model", ckpt.get("state_dict", ckpt))

def remap_key(k: str) -> str | None:
    # drop KD projection heads entirely
    if k.startswith("proj.") or "projection" in k:
        return None
    # classifier
    if k.startswith("fc."):
        return k.replace("fc.", "classifier.")
    # core renames from your checkpoint structure ‚Üí HF structure
    k = k.replace("encoder.encoder.", "bert.encoder.")      # encoder blocks
    k = k.replace("encoder.embeddings.", "bert.embeddings.")# embeddings
    k = k.replace("encoder.pooler.", "bert.pooler.")        # <-- FIXED: pooler
    # handle odd double nesting we saw earlier
    k = k.replace("bert.bert.", "bert.encoder.")
    k = k.replace("bert.encoder.encoder.", "bert.encoder.")
    return k

fixed_state = {}
for k, v in state_dict.items():
    nk = remap_key(k)
    if nk is not None:
        fixed_state[nk] = v

# build 2-class MiniLM and load
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=2)

missing, unexpected = model.load_state_dict(fixed_state, strict=False)

print("‚úÖ MiniLM KD (2-class) loaded with correct mapping.")
print("Missing keys:", missing)
print("Unexpected keys:", unexpected)

model.to(DEVICE).eval()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ MiniLM KD (2-class) loaded with correct mapping.
Missing keys: []
Unexpected keys: []


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-1

In [12]:
CLEAN_DIR = WORK_DIR / "student_minilm_kd_clean"
CLEAN_DIR.mkdir(parents=True, exist_ok=True)
model.save_pretrained(CLEAN_DIR)
tokenizer.save_pretrained(CLEAN_DIR)
print("Saved to:", CLEAN_DIR)


Saved to: /kaggle/working/student_minilm_kd_clean


# prune 

In [13]:
import torch.nn.utils.prune as prune

amount = 0.30  # 30% unstructured weight pruning
for m in model.modules():
    if isinstance(m, torch.nn.Linear):
        prune.l1_unstructured(m, name="weight", amount=amount)

# Remove reparam so weights are real tensors
for m in model.modules():
    if isinstance(m, torch.nn.Linear) and hasattr(m, "weight_orig"):
        prune.remove(m, "weight")

PRUNED_DIR = WORK_DIR / "student_minilm_pruned"
PRUNED_DIR.mkdir(exist_ok=True, parents=True)
model.save_pretrained(PRUNED_DIR)
tokenizer.save_pretrained(PRUNED_DIR)
print("Pruned model saved to:", PRUNED_DIR)


Pruned model saved to: /kaggle/working/student_minilm_pruned


In [14]:
from transformers import AutoModelForSequenceClassification

teacher_path = "/kaggle/working/finetuned_banglabert"  # your real teacher ckpt path
teacher = AutoModelForSequenceClassification.from_pretrained(teacher_path)
teacher.to(DEVICE).eval()


ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [15]:
batch = next(iter(train_loader))
print("Batch keys:", batch.keys())
for k, v in batch.items():
    print(f"{k}: shape {v.shape if torch.is_tensor(v) else type(v)}")


Batch keys: dict_keys(['t_input_ids', 't_attention_mask', 's_input_ids', 's_attention_mask', 'labels'])
t_input_ids: shape torch.Size([16, 128])
t_attention_mask: shape torch.Size([16, 128])
s_input_ids: shape torch.Size([16, 128])
s_attention_mask: shape torch.Size([16, 128])
labels: shape torch.Size([16])


# finetune for Accuracy Recovery

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score

# ====== DEVICE & CONFIG ======
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 5
LR = 2e-5
ALPHA = 0.7        # CE vs KD (logits)
BETA = 0.3         # hidden-state KD weight
TEMPERATURE = 3.0

print("‚úÖ Using device:", DEVICE, "| GPU:", torch.cuda.get_device_name(0) if DEVICE.type=="cuda" else "CPU")

# ====== SETUP ======
teacher.to(DEVICE).eval()
model.to(DEVICE).train()

# Projection layer: ELECTRA(768) ‚Üí MiniLM(384)
proj = nn.Linear(768, 384).to(DEVICE)
mse = nn.MSELoss()

opt = torch.optim.AdamW(
    list(model.parameters()) + list(proj.parameters()),
    lr=LR
)
scaler = torch.cuda.amp.GradScaler()   # ‚úÖ Mixed precision scaler

# ====== TRAINING LOOP ======
for epoch in range(1, EPOCHS + 1):
    model.train()
    proj.train()
    tot = 0

    for batch in tqdm(train_loader, desc=f"KD Recovery Epoch {epoch}/{EPOCHS}"):
        # Move batch to GPU
        batch = {k: v.to(DEVICE) for k, v in batch.items()}

        with torch.cuda.amp.autocast():  # ‚úÖ FP16 mixed precision
            # --- Forward (Student) ---
            out_s = model(
                input_ids=batch["s_input_ids"],
                attention_mask=batch["s_attention_mask"],
                output_hidden_states=True
            )
            logits_s = out_s.logits
            s_hidden = out_s.hidden_states[-1]  # [B, T, 384]

            # --- Forward (Teacher) ---
            with torch.no_grad():
                out_t = teacher(
                    input_ids=batch["t_input_ids"],
                    attention_mask=batch["t_attention_mask"],
                    output_hidden_states=True
                )
                logits_t = out_t.logits
                t_hidden = out_t.hidden_states[-1]  # [B, T, 768]

            # --- Compute Losses ---
            feat_loss = mse(s_hidden, proj(t_hidden))
            kd_loss = F.kl_div(
                F.log_softmax(logits_s / TEMPERATURE, dim=-1),
                F.softmax(logits_t / TEMPERATURE, dim=-1),
                reduction="batchmean"
            ) * (TEMPERATURE ** 2)
            ce_loss = F.cross_entropy(logits_s, batch["labels"])
            loss = ALPHA * ce_loss + (1 - ALPHA) * kd_loss + BETA * feat_loss

        opt.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(opt)
        scaler.update()

        tot += loss.item()

    print(f"Epoch {epoch}: avg_loss = {tot / len(train_loader):.4f}")
    torch.cuda.empty_cache()  # ‚úÖ free VRAM between epochs

# ====== SAVE MODEL ======
REC_DIR = WORK_DIR / "student_minilm_proj_kd_recovered_gpu"
REC_DIR.mkdir(parents=True, exist_ok=True)

model.cpu().save_pretrained(REC_DIR)
tokenizer.save_pretrained(REC_DIR)
torch.save(proj.state_dict(), REC_DIR / "proj_layer.pt")

print("‚úÖ Projection-KD MiniLM student saved to:", REC_DIR)

# ====== EVALUATION ======
model.eval().to(DEVICE)
proj.eval().to(DEVICE)
y_true, y_pred = [], []

for batch in tqdm(test_loader, desc="Evaluating"):
    batch = {k: v.to(DEVICE) for k, v in batch.items()}

    with torch.no_grad(), torch.cuda.amp.autocast():
        logits = model(
            input_ids=batch["s_input_ids"],
            attention_mask=batch["s_attention_mask"]
        ).logits

    preds = logits.argmax(-1).cpu().numpy()
    y_pred.extend(preds)
    y_true.extend(batch["labels"].cpu().numpy())

acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average="macro")
print(f"‚úÖ Test Accuracy: {acc:.4f} | Macro F1: {f1:.4f}")


‚úÖ Using device: cuda | GPU: Tesla T4


  scaler = torch.cuda.amp.GradScaler()   # ‚úÖ Mixed precision scaler


KD Recovery Epoch 1/5:   0%|          | 0/591 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast():  # ‚úÖ FP16 mixed precision


Epoch 1: avg_loss = 0.2656


KD Recovery Epoch 2/5:   0%|          | 0/591 [00:00<?, ?it/s]

Epoch 2: avg_loss = 0.1508


KD Recovery Epoch 3/5:   0%|          | 0/591 [00:00<?, ?it/s]

Epoch 3: avg_loss = 0.1125


KD Recovery Epoch 4/5:   0%|          | 0/591 [00:00<?, ?it/s]

Epoch 4: avg_loss = 0.0993


KD Recovery Epoch 5/5:   0%|          | 0/591 [00:00<?, ?it/s]

Epoch 5: avg_loss = 0.0864
‚úÖ Projection-KD MiniLM student saved to: /kaggle/working/student_minilm_proj_kd_recovered_gpu


Evaluating:   0%|          | 0/74 [00:00<?, ?it/s]

  with torch.no_grad(), torch.cuda.amp.autocast():


‚úÖ Test Accuracy: 0.9500 | Macro F1: 0.9383


# Load Fine-Tuned Recovered Student

In [17]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch, os
from pathlib import Path

WORK_DIR = Path("/kaggle/working")
DEPLOY_DIR = WORK_DIR / "student_minilm_proj_kd_recovered_gpu"

# Load fine-tuned MiniLM (post-KD recovery)
model = AutoModelForSequenceClassification.from_pretrained(DEPLOY_DIR)
tokenizer = AutoTokenizer.from_pretrained(DEPLOY_DIR)

print("‚úÖ Loaded fine-tuned MiniLM student for deployment")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)


‚úÖ Loaded fine-tuned MiniLM student for deployment
Device: cuda


# Apply Dynamic Quantization (int8)

In [18]:
import torch.nn as nn
import torch.quantization

model.eval().cpu()  # move to CPU for quantization

quantized_model = torch.quantization.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8
)

torch.save(quantized_model.state_dict(), DEPLOY_DIR / "minilm_quantized.pt")
print("‚úÖ Quantized (int8) model ready ‚Äî much smaller & faster on CPU")


‚úÖ Quantized (int8) model ready ‚Äî much smaller & faster on CPU


# Export to ONNX (for edge / mobile inference)

In [19]:
import torch

# use the fine-tuned (FP32) model, not quantized one
export_model = model.eval().cpu()

# Example input
sample = tokenizer(
    "‡¶è‡¶ü‡¶æ ‡¶è‡¶ï‡¶ü‡¶ø ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶£ ‡¶Ö‡¶≠‡¶ø‡¶ú‡ßç‡¶û‡¶§‡¶æ!", 
    padding="max_length", 
    truncation=True, 
    max_length=128, 
    return_tensors="pt"
)

onnx_path = DEPLOY_DIR / "student_minilm_fp32.onnx"

torch.onnx.export(
    export_model,
    (sample["input_ids"], sample["attention_mask"]),
    onnx_path,
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={"input_ids": {0: "batch"}, "attention_mask": {0: "batch"}},
    opset_version=17,
    do_constant_folding=True
)

print("‚úÖ Exported MiniLM (FP32) ONNX model:", onnx_path)


‚úÖ Exported MiniLM (FP32) ONNX model: /kaggle/working/student_minilm_proj_kd_recovered_gpu/student_minilm_fp32.onnx


# Load & Run ONNX Model with ONNXRuntime

In [20]:
!pip install -q onnxruntime

import onnxruntime as ort
import numpy as np

# Load ONNX model
onnx_sess = ort.InferenceSession(str(onnx_path), providers=["CPUExecutionProvider"])

# Prepare input for ONNX
inputs_onnx = {
    "input_ids": sample["input_ids"].numpy(),
    "attention_mask": sample["attention_mask"].numpy()
}

# Run inference
outputs = onnx_sess.run(["logits"], inputs_onnx)
pred = np.argmax(outputs[0], axis=-1)
print("‚úÖ ONNX inference output logits:", outputs[0])
print("‚úÖ Predicted class:", pred)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m17.4/17.4 MB[0m [31m73.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m46.0/46.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m86.8/86.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h‚úÖ ONNX inference output logits: [[-2.5146873  2.3884728]]
‚úÖ Predicted class: [1]


In [21]:
from onnxruntime.quantization import quantize_dynamic, QuantType
from pathlib import Path

fp32_path = DEPLOY_DIR / "student_minilm_fp32.onnx"
int8_path = DEPLOY_DIR / "student_minilm_int8.onnx"

quantize_dynamic(
    model_input=str(fp32_path),
    model_output=str(int8_path),
    weight_type=QuantType.QInt8
)

print("‚úÖ Quantized ONNX model saved at:", int8_path)


‚úÖ Quantized ONNX model saved at: /kaggle/working/student_minilm_proj_kd_recovered_gpu/student_minilm_int8.onnx


# Compare Model File Sizes

In [22]:
import os
from pathlib import Path

# Adjust to your actual directory
REC_DIR = Path("/kaggle/working/student_minilm_proj_kd_recovered_gpu")
DEPLOY_DIR = REC_DIR  # since all your exports are inside same folder

# Common possible file names

onnx_fp32_path    = DEPLOY_DIR / "student_minilm_fp32.onnx"  # ONNX FP32 export
onnx_int8_path    = DEPLOY_DIR / "student_minilm_int8.onnx"  # Quantized ONNX

def get_size(path):
    try:
        size_mb = os.path.getsize(path) / (1024 * 1024)
        return f"{size_mb:.2f} MB"
    except FileNotFoundError:
        return "‚ùå Not found"

print("üì¶ Model Size Comparison")

print(f"ONNX FP32:   {get_size(onnx_fp32_path)}")
print(f"ONNX INT8:   {get_size(onnx_int8_path)}")


üì¶ Model Size Comparison
ONNX FP32:   86.78 MB
ONNX INT8:   21.98 MB


# Benchmark ONNX FP32 vs INT8 Models

In [23]:
import time, onnxruntime as ort, numpy as np, pandas as pd
from tqdm.auto import tqdm

# ---------------------------------------------------------------------
#  ‚úÖ CONFIG
# ---------------------------------------------------------------------
DEVICE = "cpu"

# Example Bangla sentences for inference
texts = [
    "‡¶è‡¶ü‡¶æ ‡¶è‡¶ï‡¶ü‡¶ø ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶£ ‡¶Ö‡¶≠‡¶ø‡¶ú‡ßç‡¶û‡¶§‡¶æ!",         # positive
    "‡¶è‡¶á ‡¶∞‡ßá‡¶∏‡ßç‡¶§‡ßã‡¶∞‡¶æ‡¶Å‡¶Ø‡¶º ‡¶Ü‡¶∞ ‡¶ï‡¶ñ‡¶®‡ßã ‡¶Ø‡¶æ‡¶¨ ‡¶®‡¶æ"     # negative
]

onnx_fp32_path = DEPLOY_DIR / "student_minilm_fp32.onnx"
onnx_int8_path = DEPLOY_DIR / "student_minilm_int8.onnx"

# ---------------------------------------------------------------------
#  üîπ Benchmark helper
# ---------------------------------------------------------------------
def benchmark_onnx(model_path, text_list, runs=20):
    sess = ort.InferenceSession(str(model_path), providers=["CPUExecutionProvider"])
    inputs = tokenizer(text_list, return_tensors="np", padding="max_length", truncation=True, max_length=128)
    start = time.time()
    for _ in range(runs):
        _ = sess.run(["logits"], {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"]
        })
    end = time.time()
    return (end - start) / runs * 1000  # average latency (ms)

# ---------------------------------------------------------------------
#  üîπ Run latency benchmarks
# ---------------------------------------------------------------------
lat_onnx_fp32 = benchmark_onnx(onnx_fp32_path, texts)
lat_onnx_int8 = benchmark_onnx(onnx_int8_path, texts)

# ---------------------------------------------------------------------
#  üîπ Inference results (for demonstration)
# ---------------------------------------------------------------------
def predict_onnx(model_path, text):
    sess = ort.InferenceSession(str(model_path), providers=["CPUExecutionProvider"])
    inputs = tokenizer(text, return_tensors="np", padding="max_length", truncation=True, max_length=128)
    logits = sess.run(["logits"], {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"]
    })[0]
    pred = np.argmax(logits, axis=-1).item()
    return pred, logits

for t in texts:
    p_fp32, l_fp32 = predict_onnx(onnx_fp32_path, t)
    p_int8, l_int8 = predict_onnx(onnx_int8_path, t)
    print(f"\nüìù Text: {t}")
    print(f"ONNX FP32 ‚Üí Pred: {p_fp32}, Logits: {np.round(l_fp32, 3)}")
    print(f"ONNX INT8 ‚Üí Pred: {p_int8}, Logits: {np.round(l_int8, 3)}")

# ---------------------------------------------------------------------
#  üîπ Summary Table
# ---------------------------------------------------------------------
data = {
    "Model": ["ONNX FP32", "ONNX INT8"],
    "Size (MB)": [86.78, 21.98],      # from your previous results
    "Latency (ms)": [lat_onnx_fp32, lat_onnx_int8],
    "Notes": [
        "Baseline full-precision export",
        "Quantized deployment (fast + compact)"
    ]
}

summary = pd.DataFrame(data)
print("\nüèÅ Deployment Performance Summary:")
display(summary)



üìù Text: ‡¶è‡¶ü‡¶æ ‡¶è‡¶ï‡¶ü‡¶ø ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶£ ‡¶Ö‡¶≠‡¶ø‡¶ú‡ßç‡¶û‡¶§‡¶æ!
ONNX FP32 ‚Üí Pred: 1, Logits: [[-2.515  2.388]]
ONNX INT8 ‚Üí Pred: 1, Logits: [[-1.985  1.908]]

üìù Text: ‡¶è‡¶á ‡¶∞‡ßá‡¶∏‡ßç‡¶§‡ßã‡¶∞‡¶æ‡¶Å‡¶Ø‡¶º ‡¶Ü‡¶∞ ‡¶ï‡¶ñ‡¶®‡ßã ‡¶Ø‡¶æ‡¶¨ ‡¶®‡¶æ
ONNX FP32 ‚Üí Pred: 1, Logits: [[-0.994  1.009]]
ONNX INT8 ‚Üí Pred: 1, Logits: [[-0.437  0.485]]

üèÅ Deployment Performance Summary:


Unnamed: 0,Model,Size (MB),Latency (ms),Notes
0,ONNX FP32,86.78,40.381682,Baseline full-precision export
1,ONNX INT8,21.98,33.916891,Quantized deployment (fast + compact)


# 

# Evaluate ONNX FP32 vs INT8 Model Accuracy

In [59]:
import onnxruntime as ort
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm

# ------------------------------
#  Setup ONNX sessions
# ------------------------------
sess_fp32 = ort.InferenceSession(str(onnx_fp32_path), providers=["CPUExecutionProvider"])
sess_int8 = ort.InferenceSession(str(onnx_int8_path), providers=["CPUExecutionProvider"])

y_true, y_fp32, y_int8 = [], [], []

# ------------------------------
#  Loop over test set
# ------------------------------
for batch in tqdm(test_loader, desc="Evaluating ONNX Models"):
    # move tensors to CPU and convert to numpy
    input_ids = batch["s_input_ids"].cpu().numpy()
    attention_mask = batch["s_attention_mask"].cpu().numpy()
    labels = batch["labels"].cpu().numpy()

    y_true.extend(labels)

    # FP32 inference
    logits_fp32 = sess_fp32.run(["logits"], {
        "input_ids": input_ids,
        "attention_mask": attention_mask
    })[0]
    preds_fp32 = np.argmax(logits_fp32, axis=-1)
    y_fp32.extend(preds_fp32)

    # INT8 inference
    logits_int8 = sess_int8.run(["logits"], {
        "input_ids": input_ids,
        "attention_mask": attention_mask
    })[0]
    preds_int8 = np.argmax(logits_int8, axis=-1)
    y_int8.extend(preds_int8)

# ------------------------------
#  Compute accuracy & F1
# ------------------------------
acc_fp32 = accuracy_score(y_true, y_fp32)
f1_fp32 = f1_score(y_true, y_fp32, average="macro")

acc_int8 = accuracy_score(y_true, y_int8)
f1_int8 = f1_score(y_true, y_int8, average="macro")

print("‚úÖ ONNX Model Accuracy Comparison")
print(f"ONNX FP32 ‚Üí Accuracy: {acc_fp32:.4f} | Macro F1: {f1_fp32:.4f}")
print(f"ONNX INT8 ‚Üí Accuracy: {acc_int8:.4f} | Macro F1: {f1_int8:.4f}")
print(f"Œî Accuracy: {(acc_int8 - acc_fp32)*100:.2f}% | Œî F1: {(f1_int8 - f1_fp32)*100:.2f}%")


Evaluating ONNX Models:   0%|          | 0/74 [00:00<?, ?it/s]

‚úÖ ONNX Model Accuracy Comparison
ONNX FP32 ‚Üí Accuracy: 0.9458 | Macro F1: 0.9318
ONNX INT8 ‚Üí Accuracy: 0.9416 | Macro F1: 0.9252
Œî Accuracy: -0.42% | Œî F1: -0.66%


In [None]:
import os, time, torch, numpy as np, pandas as pd, onnxruntime as ort
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ==========================================================
# üîß CONFIGURATION
# ==========================================================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on {DEVICE.upper()}")

base_model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Your model directories
model_paths = {
    "Teacher (BanglaBERT)": "/kaggle/input/teacher_checkpoint",
    "Student (KD MiniLM)": "/kaggle/working/student_minilm_kd_best",
    "Pruned 30%": "/kaggle/working/student_minilm_pruned",
    "KD-Recovered": "/kaggle/working/student_minilm_proj_kd_recovered_gpu",
    "Quantized (INT8)": "/kaggle/working/student_minilm_proj_kd_recovered_gpu/minilm_quantized.pt",
    "ONNX FP32": "/kaggle/working/student_minilm_proj_kd_recovered_gpu/student_minilm_fp32.onnx",
    "ONNX INT8": "/kaggle/working/student_minilm_proj_kd_recovered_gpu/student_minilm_int8.onnx"
}

texts = [
    "‡¶è‡¶ü‡¶æ ‡¶è‡¶ï‡¶ü‡¶ø ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶£ ‡¶Ö‡¶≠‡¶ø‡¶ú‡ßç‡¶û‡¶§‡¶æ!",
    "‡¶è‡¶á ‡¶∏‡¶ø‡¶®‡ßá‡¶Æ‡¶æ‡¶ü‡¶æ ‡¶è‡¶ï‡¶¶‡¶Æ ‡¶¨‡¶æ‡¶ú‡ßá ‡¶õ‡¶ø‡¶≤‡•§",
    "‡¶ó‡¶≤‡ßç‡¶™‡¶ü‡¶æ ‡¶ñ‡ßÅ‡¶¨ ‡¶∏‡ßÅ‡¶®‡ßç‡¶¶‡¶∞‡¶≠‡¶æ‡¶¨‡ßá ‡¶â‡¶™‡¶∏‡ßç‡¶•‡¶æ‡¶™‡¶® ‡¶ï‡¶∞‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá‡•§",
    "‡¶®‡¶æ‡¶ü‡¶ï‡¶ü‡¶æ‡¶∞ ‡¶ó‡¶≤‡ßç‡¶™ ‡¶è‡¶ï‡¶¶‡¶Æ ‡¶¶‡ßÅ‡¶∞‡ßç‡¶¨‡¶≤, ‡¶ï‡ßã‡¶®‡ßã ‡¶∏‡¶Ç‡¶Ø‡ßã‡¶ó‡¶á ‡¶®‡ßá‡¶á‡•§"
]

# Tokenize for PyTorch models
inputs_pt = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)

# ==========================================================
# üß© HELPER FUNCTIONS
# ==========================================================
def get_size(path):
    """Compute model file size in MB."""
    if os.path.isdir(path):
        total = sum(os.path.getsize(os.path.join(root, f)) for root, _, files in os.walk(path) for f in files)
        return round(total / (1024 * 1024), 2)
    elif os.path.exists(path):
        return round(os.path.getsize(path) / (1024 * 1024), 2)
    return None

def benchmark_pytorch(model, inputs, runs=20):
    """Benchmark latency for PyTorch model."""
    model.eval()
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    with torch.no_grad():
        for _ in range(3):  # warmup
            _ = model(**inputs)
        start = time.time()
        for _ in range(runs):
            _ = model(**inputs)
        elapsed = time.time() - start
    avg_ms = (elapsed / runs) * 1000
    throughput = (len(texts) * runs) / elapsed
    return round(avg_ms, 2), round(throughput, 2)

def benchmark_onnx(model_path, texts, runs=20):
    """Benchmark latency for ONNX model."""
    sess = ort.InferenceSession(str(model_path), providers=["CPUExecutionProvider"])
    total_time = 0
    for _ in range(runs):
        for text in texts:
            inputs = tokenizer(text, return_tensors="np", padding="max_length", truncation=True, max_length=128)
            start = time.time()
            _ = sess.run(["logits"], {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]})
            total_time += (time.time() - start)
    avg_ms = (total_time / (len(texts) * runs)) * 1000
    throughput = (len(texts) * runs) / total_time
    return round(avg_ms, 2), round(throughput, 2)

# ==========================================================
# üöÄ BENCHMARK LOOP
# ==========================================================
results = []

for name, path in model_paths.items():
    print(f"\nüîç Evaluating: {name}")
    try:
        if name.startswith("ONNX"):
            latency, throughput = benchmark_onnx(path, texts)
        elif path.endswith(".pt"):
            # Quantized PyTorch model
            state_dict = torch.load(path, map_location=DEVICE)
            model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=2)
            model.load_state_dict(state_dict, strict=False)
            model.to(DEVICE)
            latency, throughput = benchmark_pytorch(model, inputs_pt)
        else:
            model = AutoModelForSequenceClassification.from_pretrained(path)
            model.to(DEVICE)
            latency, throughput = benchmark_pytorch(model, inputs_pt)

        results.append({
            "Model Variant": name,
            "Size (MB)": get_size(path),
            "Latency (ms/sample)": latency,
            "Throughput (samples/s)": throughput,
            "Notes": "‚úì OK"
        })
    except Exception as e:
        print(f"‚ö†Ô∏è {name} skipped: {e}")
        results.append({
            "Model Variant": name,
            "Size (MB)": get_size(path),
            "Latency (ms/sample)": None,
            "Throughput (samples/s)": None,
            "Notes": f"Error: {str(e)[:60]}"
        })

# ==========================================================
# üìä TABLE SUMMARY
# ==========================================================
summary = pd.DataFrame(results)
summary = summary.sort_values("Latency (ms/sample)", ascending=False)

print("\nüèÅ Model Benchmark Summary:")
display(summary.style.set_properties(**{'text-align': 'center'}))
