In [1]:
!pip install --upgrade --no-cache-dir transformers==4.45.2 indic-transliteration -q
!pip install --no-cache-dir scikit-learn pandas tqdm matplotlib sentencepiece -q

import os, json, random, numpy as np, torch
from pathlib import Path

# ---- Repro / device / dirs
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
WORK_DIR = Path("/kaggle/working"); WORK_DIR.mkdir(parents=True, exist_ok=True)

# ---- Config
TEACHER_MODEL_ID = "csebuetnlp/banglabert"
STUDENT_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
MAX_LEN = 128
BATCH_SIZE = 16

# Teacher FT
EPOCHS_TEACHER = 3
LR_TEACHER = 2e-5
WARMUP_RATIO_T = 0.1
WEIGHT_DECAY_T = 0.01

# KD
EPOCHS_STUDENT = 5
LR_STUDENT = 3e-5
WARMUP_RATIO_S = 0.1
WEIGHT_DECAY_S = 0.01
PATIENCE = 2

KD_T = 3.0
KD_ALPHA = 0.5
GAMMA_HIDDEN = 1.0

print("‚úÖ Device:", DEVICE)


[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.4/44.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.9/9.9 MB[0m [31m120.9 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m159.6/159.6 kB[0m [31m317.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m566.1/566.1 kB[0m [31m328.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.0/3.0 MB[0m [31m294.0 MB/s[0m eta [36m0

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path

DATA_DIR = Path("/kaggle/input/dataaaaaaa")
POS_FILE = DATA_DIR / "all_positive_8500.txt"
NEG_FILE = DATA_DIR / "all_negative_3307.txt"
assert POS_FILE.exists() and NEG_FILE.exists(), f"Missing data: {POS_FILE}, {NEG_FILE}"

def read_txt(p: Path):
    with open(p, encoding="utf-8") as f:
        return [ln.strip() for ln in f if ln.strip()]

pos, neg = read_txt(POS_FILE), read_txt(NEG_FILE)
df = pd.DataFrame({"text": pos + neg, "label": [1]*len(pos) + [0]*len(neg)}).sample(frac=1, random_state=SEED)

train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=SEED)
val_df,   test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=SEED)
print(f"Train={len(train_df)} | Val={len(val_df)} | Test={len(test_df)}")


Train=9445 | Val=1181 | Test=1181


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm

class TxtClsDataset(Dataset):
    def __init__(self, df, tok, max_len):
        self.texts = df.text.tolist(); self.labels = df.label.tolist()
        self.tok, self.max_len = tok, max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, i):
        enc = self.tok(self.texts[i], truncation=True, padding="max_length",
                       max_length=self.max_len, return_tensors="pt")
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[i], dtype=torch.long)
        }

teacher_tok = AutoTokenizer.from_pretrained(TEACHER_MODEL_ID)
teacher = AutoModelForSequenceClassification.from_pretrained(TEACHER_MODEL_ID, num_labels=2).to(DEVICE)

tr_loader = DataLoader(TxtClsDataset(train_df, teacher_tok, MAX_LEN), batch_size=BATCH_SIZE, shuffle=True)
va_loader = DataLoader(TxtClsDataset(val_df, teacher_tok, MAX_LEN), batch_size=BATCH_SIZE)
te_loader = DataLoader(TxtClsDataset(test_df, teacher_tok, MAX_LEN), batch_size=BATCH_SIZE)

opt = AdamW(teacher.parameters(), lr=LR_TEACHER, weight_decay=WEIGHT_DECAY_T)
steps = len(tr_loader) * EPOCHS_TEACHER
sch = get_linear_schedule_with_warmup(opt, int(WARMUP_RATIO_T*steps), steps)
criterion = torch.nn.CrossEntropyLoss()

best_f1 = -1
for ep in range(1, EPOCHS_TEACHER+1):
    teacher.train(); total = 0
    for b in tqdm(tr_loader, desc=f"Teacher Epoch {ep}/{EPOCHS_TEACHER}"):
        b = {k:v.to(DEVICE) for k,v in b.items()}
        out = teacher(**b)
        loss = out.loss
        loss.backward()
        opt.step(); sch.step(); opt.zero_grad()
        total += loss.item()

    teacher.eval(); preds, gold = [], []
    with torch.no_grad():
        for b in va_loader:
            b = {k:v.to(DEVICE) for k,v in b.items()}
            out = teacher(**b)
            preds += out.logits.argmax(-1).cpu().tolist()
            gold  += b["labels"].cpu().tolist()
    acc = accuracy_score(gold, preds)
    f1m = f1_score(gold, preds, average="macro")
    print(f"Val: Acc={acc:.4f} | F1_macro={f1m:.4f}")
    if f1m > best_f1:
        best_f1 = f1m
        save_dir = WORK_DIR / "finetuned_banglabert"
        save_dir.mkdir(parents=True, exist_ok=True)
        teacher.save_pretrained(save_dir)
        teacher_tok.save_pretrained(save_dir)
        print("üíæ Saved best BanglaBERT teacher.")

# quick test
teacher.eval(); preds, gold = [], []
with torch.no_grad():
    for b in te_loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        out = teacher(**b)
        preds += out.logits.argmax(-1).cpu().tolist()
        gold  += b["labels"].cpu().tolist()
print("‚úÖ BanglaBERT Teacher [Test]: Acc={:.4f} | F1_macro={:.4f}".format(
    accuracy_score(gold, preds), f1_score(gold, preds, average="macro")))


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Teacher Epoch 1/3:   0%|          | 0/591 [00:00<?, ?it/s]

Val: Acc=0.9577 | F1_macro=0.9489
üíæ Saved best BanglaBERT teacher.


Teacher Epoch 2/3:   0%|          | 0/591 [00:00<?, ?it/s]

Val: Acc=0.9670 | F1_macro=0.9586
üíæ Saved best BanglaBERT teacher.


Teacher Epoch 3/3:   0%|          | 0/591 [00:00<?, ?it/s]

Val: Acc=0.9678 | F1_macro=0.9601
üíæ Saved best BanglaBERT teacher.
‚úÖ BanglaBERT Teacher [Test]: Acc=0.9687 | F1_macro=0.9611


# KD Data (Transliteration, 2 tokenizers)

In [4]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

def transliterate_bn_text(txt: str) -> str:
    try:
        return transliterate(txt, sanscript.BENGALI, sanscript.ITRANS)
    except Exception:
        return txt

from transformers import AutoTokenizer
student_tok = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

class KDDataset(Dataset):
    def __init__(self, df, t_tok, s_tok, max_len):
        self.texts = df.text.tolist(); self.labels = df.label.tolist()
        self.ttok, self.stok, self.max_len = t_tok, s_tok, max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, i):
        txt_bn = self.texts[i]; txt_en = transliterate_bn_text(txt_bn)
        t = self.ttok(txt_bn, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        s = self.stok(txt_en, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        return {
            "t_input_ids": t["input_ids"].squeeze(0),
            "t_attention_mask": t["attention_mask"].squeeze(0),
            "s_input_ids": s["input_ids"].squeeze(0),
            "s_attention_mask": s["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[i], dtype=torch.long)
        }

def pad_collate(batch, t_pad, s_pad):
    out = {}
    for k in batch[0]:
        if k == "labels": out[k] = torch.stack([b[k] for b in batch])
        elif k.startswith("t_"):
            padv = 0 if "attention" in k else t_pad
            out[k] = nn.utils.rnn.pad_sequence([b[k] for b in batch], batch_first=True, padding_value=padv)
        elif k.startswith("s_"):
            padv = 0 if "attention" in k else s_pad
            out[k] = nn.utils.rnn.pad_sequence([b[k] for b in batch], batch_first=True, padding_value=padv)
    return out

train_loader = DataLoader(KDDataset(train_df, teacher_tok, student_tok, MAX_LEN),
                          batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=lambda b: pad_collate(b, teacher_tok.pad_token_id, student_tok.pad_token_id))
val_loader = DataLoader(KDDataset(val_df, teacher_tok, student_tok, MAX_LEN),
                        batch_size=BATCH_SIZE,
                        collate_fn=lambda b: pad_collate(b, teacher_tok.pad_token_id, student_tok.pad_token_id))
test_loader = DataLoader(KDDataset(test_df, teacher_tok, student_tok, MAX_LEN),
                         batch_size=BATCH_SIZE,
                         collate_fn=lambda b: pad_collate(b, teacher_tok.pad_token_id, student_tok.pad_token_id))
print("‚úÖ KD dataloaders ready (BanglaBERT‚ÜíMiniLM).")


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

‚úÖ KD dataloaders ready (BanglaBERT‚ÜíMiniLM).


# Student (MiniLM-L6-v2) head (logits + hidden)

In [5]:
import torch.nn as nn
from transformers import AutoModel

class StudentClassifier(nn.Module):
    def __init__(self, base_model_id, num_labels=2, dropout=0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_id)
        s_H = self.encoder.config.hidden_size            # MiniLM hidden size (often 384)
        self.s_hidden = s_H
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(s_H, num_labels)
    def forward(self, input_ids=None, attention_mask=None, **_):
        out = self.encoder(input_ids=input_ids,
                           attention_mask=attention_mask,
                           output_hidden_states=True,
                           return_dict=True)
        cls = out.last_hidden_state[:, 0, :]
        logits = self.fc(self.dropout(cls))
        return {"logits": logits, "hidden_states": out.hidden_states}

student = StudentClassifier(STUDENT_MODEL_ID).to(DEVICE)
print("‚úÖ Student initialized. Hidden size =", student.s_hidden)


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

‚úÖ Student initialized. Hidden size = 384


# KD Projection + Loss (CE + KL + HiddenProj)

In [6]:
import torch.nn.functional as F

# discover sizes
t_hidden = teacher.config.hidden_size    # XLM-R = 768
s_hidden = student.s_hidden              # MiniLM-L6-v2 = 384

class KDProjectionHead(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.bridge = nn.Sequential(
            nn.Linear(in_dim, out_dim),
            nn.GELU(),
            nn.LayerNorm(out_dim)
        )
    def forward(self, x):
        return self.bridge(x)

proj_head = KDProjectionHead(s_hidden, t_hidden).to(DEVICE)

class KDLossProj(nn.Module):
    def __init__(self, T=3.0, alpha=0.5, gamma_h=1.0):
        super().__init__()
        self.T, self.alpha, self.gamma_h = T, alpha, gamma_h
        self.ce  = nn.CrossEntropyLoss()
        self.kld = nn.KLDivLoss(reduction="batchmean")
        self.mse = nn.MSELoss()

    @staticmethod
    def map_layers(n_s, n_t):
        # skip embeddings index 0; map hidden layers 1..n
        s_idx = list(range(1, n_s))  # student hidden_states length includes embeddings at 0
        t_idx = torch.linspace(1, n_t-1, steps=len(s_idx)).round().long().tolist()
        return list(zip(s_idx, t_idx))

    def forward(self, s_pack, t_pack, labels):
        # logits: CE + KL
        logits_s, logits_t = s_pack["logits"], t_pack["logits"]
        hard = self.ce(logits_s, labels)
        soft = self.kld(F.log_softmax(logits_s/self.T, dim=-1),
                        F.softmax(logits_t/self.T,  dim=-1)) * (self.T**2)
        loss = (1 - self.alpha)*hard + self.alpha*soft

        # hidden: MSE(proj(student_h), teacher_h) with proportional mapping
        hs, ht = s_pack.get("hidden_states", []), t_pack.get("hidden_states", [])
        if hs and ht:
            pairs = self.map_layers(len(hs), len(ht))
            h_losses = []
            for i_s, i_t in pairs:
                s_h = proj_head(hs[i_s])           # [B, L, t_hidden]
                t_h = ht[i_t]
                L = min(s_h.size(1), t_h.size(1))
                h_losses.append(self.mse(s_h[:, :L, :], t_h[:, :L, :]))
            if h_losses:
                loss = loss + GAMMA_HIDDEN * torch.stack(h_losses).mean()

        return loss

criterion = KDLossProj(T=KD_T, alpha=KD_ALPHA, gamma_h=GAMMA_HIDDEN)
print("‚úÖ KD loss & projection ready. (student‚Üíteacher dims: {}‚Üí{})".format(s_hidden, t_hidden))


‚úÖ KD loss & projection ready. (student‚Üíteacher dims: 384‚Üí768)


# KD Training (teacher frozen)

In [7]:
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# freeze teacher for KD
teacher.eval()
for p in teacher.parameters(): p.requires_grad = False

opt = AdamW(list(student.parameters()) + list(proj_head.parameters()),
            lr=LR_STUDENT, weight_decay=WEIGHT_DECAY_S)
num_steps = EPOCHS_STUDENT * len(train_loader)
sch = get_linear_schedule_with_warmup(opt, int(WARMUP_RATIO_S * num_steps), num_steps)

def metrics(preds, gold):
    return {
        "accuracy": accuracy_score(gold, preds),
        "f1_macro": f1_score(gold, preds, average="macro"),
        "f1_weighted": f1_score(gold, preds, average="weighted"),
    }

@torch.no_grad()
def eval_student(loader):
    student.eval(); proj_head.eval()
    preds, gold = [], []
    for b in loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        out = student(input_ids=b["s_input_ids"], attention_mask=b["s_attention_mask"])
        preds += out["logits"].argmax(-1).cpu().tolist()
        gold  += b["labels"].cpu().tolist()
    return metrics(np.array(preds), np.array(gold))

best_f1, wait = -1.0, 0

for ep in range(1, EPOCHS_STUDENT+1):
    student.train(); proj_head.train()
    run = 0.0

    for b in tqdm(train_loader, desc=f"[KD Epoch {ep}/{EPOCHS_STUDENT}]"):
        labels = b["labels"].to(DEVICE)

        s_out = student(input_ids=b["s_input_ids"].to(DEVICE),
                        attention_mask=b["s_attention_mask"].to(DEVICE),
                        )

        with torch.no_grad():
            t_raw = teacher(input_ids=b["t_input_ids"].to(DEVICE),
                            attention_mask=b["t_attention_mask"].to(DEVICE),
                            output_hidden_states=True,
                            return_dict=True)
            t_out = {"logits": t_raw.logits, "hidden_states": t_raw.hidden_states}

        loss = criterion(s_out, t_out, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(list(student.parameters()) + list(proj_head.parameters()), 1.0)
        opt.step(); sch.step(); opt.zero_grad()
        run += loss.item()

    val = eval_student(val_loader)
    print(f"[KD] loss={run/len(train_loader):.4f} | Val Acc={val['accuracy']:.4f} | "
          f"F1m={val['f1_macro']:.4f} | F1w={val['f1_weighted']:.4f}")

    if val["f1_macro"] > best_f1:
        best_f1, wait = val["f1_macro"], 0
        torch.save({"student": student.state_dict(), "proj": proj_head.state_dict()},
                   WORK_DIR / "student_minilm_kd_best.pt")
        print("üíæ Saved best student.")
    else:
        wait += 1
        if wait >= PATIENCE:
            print("‚è∏Ô∏è Early stopping.")
            break

# reload best
ckpt = torch.load(WORK_DIR / "student_minilm_kd_best.pt", map_location=DEVICE)
student.load_state_dict(ckpt["student"]); proj_head.load_state_dict(ckpt["proj"])
student.eval(); proj_head.eval()
print("‚úÖ KD complete & best reloaded.")




[KD Epoch 1/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=1.8668 | Val Acc=0.9170 | F1m=0.8952 | F1w=0.9162
üíæ Saved best student.


[KD Epoch 2/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=1.0859 | Val Acc=0.9136 | F1m=0.8837 | F1w=0.9096


[KD Epoch 3/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=0.9249 | Val Acc=0.9399 | F1m=0.9247 | F1w=0.9396
üíæ Saved best student.


[KD Epoch 4/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=0.8058 | Val Acc=0.9458 | F1m=0.9331 | F1w=0.9459
üíæ Saved best student.


[KD Epoch 5/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=0.7441 | Val Acc=0.9450 | F1m=0.9322 | F1w=0.9451
‚úÖ KD complete & best reloaded.


# Test Metrics + Alignment + Save

In [9]:
from scipy.special import softmax
from scipy.spatial.distance import cosine

@torch.no_grad()
def eval_model(model, loader, mode="student"):
    preds, gold = [], []
    for b in loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        inp = {"input_ids": b["t_input_ids"], "attention_mask": b["t_attention_mask"]} if mode=="teacher" else \
              {"input_ids": b["s_input_ids"], "attention_mask": b["s_attention_mask"]}
        out = model(**inp)
        logits = out["logits"] if isinstance(out, dict) else out.logits
        preds += logits.argmax(-1).cpu().tolist()
        gold  += b["labels"].cpu().tolist()
    return {
        "accuracy": accuracy_score(gold, preds),
        "f1_macro": f1_score(gold, preds, average="macro"),
        "f1_weighted": f1_score(gold, preds, average="weighted")
    }

print("üß™ Evaluating on test‚Ä¶")
teacher_test = eval_model(teacher, test_loader, mode="teacher")
student_test = eval_model(student, test_loader, mode="student")
print("[Teacher][Test]:", teacher_test)
print("[Student][Test]:", student_test)

@torch.no_grad()
def alignment_metrics(teacher, student, loader):
    cos_list, corr_list, agree = [], [], []
    for b in loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        t = teacher(b["t_input_ids"], b["t_attention_mask"])
        s = student(b["s_input_ids"], b["s_attention_mask"])
        t_logits = t.logits.detach().cpu().numpy()
        s_logits = s["logits"].detach().cpu().numpy()
        t_probs  = softmax(t_logits, axis=-1)
        s_probs  = softmax(s_logits, axis=-1)
        for tl, sl, tp, sp in zip(t_logits, s_logits, t_probs, s_probs):
            cos_list.append(1 - cosine(tl, sl))
            corr_list.append(np.corrcoef(tp, sp)[0, 1])
            agree.append(np.argmax(tp) == np.argmax(sp))
    return {
        "logit_cosine": float(np.nanmean(cos_list)),
        "prob_corr": float(np.nanmean(corr_list)),
        "pred_alignment": float(np.mean(agree))
    }

align = alignment_metrics(teacher, student, test_loader)
print(f"""
üß© Alignment (Test)
  ‚Ä¢ Logit cosine : {align['logit_cosine']:.4f}
  ‚Ä¢ Prob corr    : {align['prob_corr']:.4f}
  ‚Ä¢ Agreement    : {align['pred_alignment']:.4f}
""")

# ---- Save artifacts
SAVE_DIR = WORK_DIR / "student_minilm_translit_kd_proj"
SAVE_DIR.mkdir(parents=True, exist_ok=True)
torch.save(student.state_dict(), SAVE_DIR / "pytorch_model.bin")
from transformers import AutoTokenizer
# student tokenizer saving
AutoTokenizer.from_pretrained(STUDENT_MODEL_ID).save_pretrained(SAVE_DIR)

meta = {
    "teacher_model": TEACHER_MODEL_ID,
    "student_model": STUDENT_MODEL_ID,
    "kd_temperature": KD_T,
    "alpha": KD_ALPHA,
    "gamma_hidden": GAMMA_HIDDEN,
    "max_len": MAX_LEN,
    "lr_student": LR_STUDENT,
    "epochs_student": EPOCHS_STUDENT
}
json.dump(meta, open(SAVE_DIR / "student_config.json", "w"), indent=2, ensure_ascii=False)

def to_py(o):
    if isinstance(o, dict): return {k: to_py(v) for k,v in o.items()}
    if hasattr(o, "item"): return o.item()
    return o

json.dump({"teacher_test": to_py(teacher_test),
           "student_test": to_py(student_test),
           "alignment": to_py(align)},
          open(WORK_DIR / "metrics_minilm_kd_proj.json", "w"), indent=2, ensure_ascii=False)

print("‚úÖ Saved student + metrics to:", SAVE_DIR, "and", WORK_DIR / "metrics_minilm_kd_proj.json")


üß™ Evaluating on test‚Ä¶
[Teacher][Test]: {'accuracy': 0.9686706181202371, 'f1_macro': 0.961136147554033, 'f1_weighted': 0.9686561287537637}
[Student][Test]: {'accuracy': 0.9458086367485182, 'f1_macro': 0.9329615280911632, 'f1_weighted': 0.9458583354280438}

üß© Alignment (Test)
  ‚Ä¢ Logit cosine : 0.9036
  ‚Ä¢ Prob corr    : 0.9069
  ‚Ä¢ Agreement    : 0.9534

‚úÖ Saved student + metrics to: /kaggle/working/student_minilm_translit_kd_proj and /kaggle/working/metrics_minilm_kd_proj.json
