In [1]:
!pip -q install -U transformers accelerate indic-transliteration scikit-learn
import torch, sys
print("Torch:", torch.__version__, "| CUDA:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")


[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.0/44.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.0/12.0 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m00:01[0m:01[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m375.8/375.8 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m159.6/159.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.7/9.7 MB[0m [31m105.5 MB/s[0m eta 

# Imports, Config

In [2]:
# ==============================================================
# ‚öôÔ∏è Configuration + Imports (Full KD: logits + hidden + attention)
# ==============================================================

import os, json, random
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW   # ‚úÖ correct import (transformers >= 4.46)

# ----------------------------
# üîπ Reproducibility + Device
# ----------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"‚úÖ Using device: {DEVICE}")

# ----------------------------
# üîπ Paths
# ----------------------------
INPUT_DIR = Path("/kaggle/input")
WORK_DIR  = Path("/kaggle/working")
WORK_DIR.mkdir(exist_ok=True)

# ----------------------------
# üîπ Model IDs
# ----------------------------
TEACHER_MODEL_ID = "csebuetnlp/banglabert"       # Teacher: BanglaBERT
STUDENT_MODEL_ID = "distilbert-base-multilingual-cased"     # Student: DistilBERT (shared tokenizer)

# ----------------------------
# üîπ Training parameters
# ----------------------------
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS_TEACHER = 3
EPOCHS_STUDENT = 4
LR_TEACHER = 2e-5
LR_STUDENT = 3e-5
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
PATIENCE = 2

# ----------------------------
# üîπ Knowledge Distillation hyperparameters
# ----------------------------
KD_T = 3.0          # temperature for softening logits
KD_ALPHA = 0.5      # weight for soft vs hard supervision
GAMMA_HIDDEN = 1.0  # layer-wise hidden-state distillation weight
GAMMA_ATT   = 1.0   # üî• new: attention-map distillation weight

print(f"""
KD configuration:
  Temperature (T) .......... {KD_T}
  Alpha (KL vs CE) ......... {KD_ALPHA}
  Hidden-state MSE weight .. {GAMMA_HIDDEN}
  Attention-map MSE weight . {GAMMA_ATT}
""")

# ----------------------------
# üîπ Utility: metrics
# ----------------------------
def compute_metrics(preds, labels):
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
        "f1_weighted": f1_score(labels, preds, average="weighted", zero_division=0),
    }


‚úÖ Using device: cuda

KD configuration:
  Temperature (T) .......... 3.0
  Alpha (KL vs CE) ......... 0.5
  Hidden-state MSE weight .. 1.0
  Attention-map MSE weight . 1.0



In [3]:
# üîç Inspect Kaggle input directories
import os
for root, dirs, files in os.walk("/kaggle/input"):
    print(root)
    for f in files:
        print("   ", f)


/kaggle/input
    all_negative_3307.txt
    all_positive_8500.txt


# Load pos.txt / neg.txt from /kaggle/input and split

In [4]:
def try_paths():
    if (INPUT_DIR / "all_positive_8500.txt").exists() and (INPUT_DIR / "all_negative_3307.txt").exists():
        return INPUT_DIR
    # fallback: scan /kaggle/input for the files
    for root, dirs, files in os.walk("/kaggle/input"):
        if "all_positive_8500.txt" in files and "all_negative_3307.txt" in files:
            return Path(root)
    raise FileNotFoundError("Could not find all_positive_8500.txt and all_negative_3307.txt under /kaggle/input")

DATA_DIR = try_paths()
print("Using data dir:", DATA_DIR)

pos_path = DATA_DIR / "all_positive_8500.txt"
neg_path = DATA_DIR / "all_negative_3307.txt"

def read_lines(p: Path):
    with open(p, "r", encoding="utf-8") as f:
        return [ln.strip() for ln in f if ln.strip()]

pos = read_lines(pos_path)
neg = read_lines(neg_path)
print(f"Loaded {len(pos)} positive, {len(neg)} negative")

df = pd.concat([
    pd.DataFrame({"text": pos, "label": 1}),
    pd.DataFrame({"text": neg, "label": 0})
], ignore_index=True).sample(frac=1.0, random_state=SEED).reset_index(drop=True)

train_df, tmp_df = train_test_split(df, test_size=0.2, stratify=df.label, random_state=SEED)
val_df,   test_df = train_test_split(tmp_df, test_size=0.5, stratify=tmp_df.label, random_state=SEED)
print(f"Train/Val/Test: {len(train_df)}/{len(val_df)}/{len(test_df)}")


Using data dir: /kaggle/input
Loaded 8500 positive, 3307 negative
Train/Val/Test: 9445/1181/1181


In [5]:
# üîÅ Shared BanglaBERT tokenizer for BOTH teacher & student
shared_tok = AutoTokenizer.from_pretrained(TEACHER_MODEL_ID)

class SimpleDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.texts = df.text.tolist()
        self.labels = df.label.tolist()
        self.tk = tokenizer; self.max_len = max_len
    def __len__(self): return len(self.labels)
    def __getitem__(self, i):
        enc = self.tk(self.texts[i], truncation=True, max_length=self.max_len, return_tensors="pt")
        item = {k:v.squeeze(0) for k,v in enc.items()}
        item["labels"] = torch.tensor(self.labels[i], dtype=torch.long)
        return item

def pad_collate(batch, pad_id):
    keys = batch[0].keys()
    out = {}
    for k in keys:
        if k == "labels":
            out[k] = torch.stack([b[k] for b in batch])
        else:
            out[k] = nn.utils.rnn.pad_sequence([b[k] for b in batch], batch_first=True,
                                               padding_value=(pad_id if k!="attention_mask" else 0))
    return out

train_loader = DataLoader(SimpleDataset(train_df, shared_tok, MAX_LEN), batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=lambda b: pad_collate(b, shared_tok.pad_token_id))
val_loader   = DataLoader(SimpleDataset(val_df, shared_tok, MAX_LEN), batch_size=BATCH_SIZE, shuffle=False,
                          collate_fn=lambda b: pad_collate(b, shared_tok.pad_token_id))
test_loader  = DataLoader(SimpleDataset(test_df, shared_tok, MAX_LEN), batch_size=BATCH_SIZE, shuffle=False,
                          collate_fn=lambda b: pad_collate(b, shared_tok.pad_token_id))


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

# Datasets & Tokenizers (Teacher, Student, Paired)

# Train teacher (BanglaBERT) and save best

In [6]:
teacher = AutoModelForSequenceClassification.from_pretrained(TEACHER_MODEL_ID, num_labels=2).to(DEVICE)

def evaluate_cls(model, loader):
    model.eval(); preds, gold = [], []
    with torch.no_grad():
        for batch in loader:
            batch = {k:v.to(DEVICE) for k,v in batch.items()}
            out = model(**{k:v for k,v in batch.items() if k!="labels"})
            preds += out.logits.argmax(-1).detach().cpu().tolist()
            gold  += batch["labels"].detach().cpu().tolist()
    return compute_metrics(np.array(preds), np.array(gold))

num_steps = EPOCHS_TEACHER * len(train_loader)
warm_steps = int(WARMUP_RATIO * num_steps)
opt_t = AdamW(teacher.parameters(), lr=LR_TEACHER, weight_decay=WEIGHT_DECAY)
sch_t = get_linear_schedule_with_warmup(opt_t, warm_steps, num_steps)

best_f1, wait = -1, 0
for ep in range(1, EPOCHS_TEACHER+1):
    teacher.train(); total = 0.0
    for b in tqdm(train_loader, desc=f"[Teacher] Epoch {ep}", leave=False):
        b = {k:v.to(DEVICE) for k,v in b.items()}
        out = teacher(**b)
        loss = out.loss
        loss.backward()
        nn.utils.clip_grad_norm_(teacher.parameters(), 1.0)
        opt_t.step(); sch_t.step(); opt_t.zero_grad()
        total += loss.item()
    val = evaluate_cls(teacher, val_loader)
    print(f"[Teacher] loss={total/len(train_loader):.4f} | Val F1m={val['f1_macro']:.4f}")
    if val["f1_macro"] > best_f1:
        best_f1, wait = val["f1_macro"], 0
        teacher.save_pretrained(WORK_DIR / "teacher_model_sharedtok")
        shared_tok.save_pretrained(WORK_DIR / "teacher_model_sharedtok")
    else:
        wait += 1
        if wait >= PATIENCE:
            print("Early stop teacher.")
            break

# reload best & test
teacher = AutoModelForSequenceClassification.from_pretrained(WORK_DIR / "teacher_model_sharedtok").to(DEVICE)
teacher_test = evaluate_cls(teacher, test_loader)
print("[Teacher][Test]:", teacher_test)


2025-11-01 09:21:50.468119: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761988910.691986      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761988910.754245      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

[Teacher] Epoch 1:   0%|          | 0/591 [00:00<?, ?it/s]

[Teacher] loss=0.2389 | Val F1m=0.9559


[Teacher] Epoch 2:   0%|          | 0/591 [00:00<?, ?it/s]

[Teacher] loss=0.0880 | Val F1m=0.9488


[Teacher] Epoch 3:   0%|          | 0/591 [00:00<?, ?it/s]

[Teacher] loss=0.0459 | Val F1m=0.9573
[Teacher][Test]: {'accuracy': 0.9678238780694327, 'f1_macro': 0.9601959073041282, 'f1_weighted': 0.9678533866604009}


# Student model + Logit + Hidden + Attention KD loss

In [7]:
# Student encoder with shared tokenizer: resize embeddings to shared vocab
class StudentClassifier(nn.Module):
    def __init__(self, base_model_id, num_labels=2, dropout=0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_id)
        # resize token embeddings to match shared tokenizer vocab
        self.encoder.resize_token_embeddings(len(shared_tok))
        H = self.encoder.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(H, num_labels)
        # optional: silence SDPA attention warning
        if hasattr(self.encoder, "config"):
            try:
                self.encoder.config.attn_implementation = "eager"
            except Exception:
                pass

    def forward(self, input_ids=None, attention_mask=None,
                output_hidden_states=False, output_attentions=False):
        out = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=True
        )
        cls_vec = out.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(cls_vec))
        return {
            "logits": logits,
            "hidden_states": out.hidden_states if output_hidden_states else None,
            "attentions": out.attentions if output_attentions else None
        }

student = StudentClassifier(STUDENT_MODEL_ID, num_labels=2).to(DEVICE)

# KD Loss: CE + KL (logits) + layer-wise hidden-state MSE + attention-map MSE
class KDLossWithHiddenAtt(nn.Module):
    def __init__(self, T=3.0, alpha=0.5, gamma_h=1.0, gamma_a=1.0):
        super().__init__()
        self.T, self.alpha = T, alpha
        self.gamma_h, self.gamma_a = gamma_h, gamma_a
        self.ce = nn.CrossEntropyLoss()
        self.kld = nn.KLDivLoss(reduction="batchmean")
        self.mse = nn.MSELoss()

    @staticmethod
    def map_layers(n_s, n_t, for_att=False):
        # hidden_states: indices 1..n (ignore embedding 0)
        # attentions: indices 0..(n-1)
        if for_att:
            s_idx = list(range(n_s))
            t_pos = torch.linspace(0, n_t-1, steps=len(s_idx)).round().long().tolist()
        else:
            s_idx = list(range(1, n_s+1))
            t_pos = torch.linspace(1, n_t, steps=len(s_idx)).round().long().tolist()
        return list(zip(s_idx, t_pos))

    def logits_loss(self, s_logits, t_logits, labels):
        hard = self.ce(s_logits, labels)
        log_p_s = torch.log_softmax(s_logits / self.T, dim=-1)
        p_t     = torch.softmax(t_logits / self.T, dim=-1)
        soft = self.kld(log_p_s, p_t) * (self.T ** 2)
        return (1 - self.alpha) * hard + self.alpha * soft, hard.item(), soft.item()

    def hidden_loss(self, hs_s, hs_t):
        # hs_* include embedding at 0
        n_s = len(hs_s) - 1; n_t = len(hs_t) - 1
        pairs = self.map_layers(n_s, n_t, for_att=False)
        losses = []
        for s_i, t_i in pairs:
            s = hs_s[s_i]  # (B, Ls, Hs)
            t = hs_t[t_i]  # (B, Lt, Ht)
            # align sequence length to min
            L = min(s.size(1), t.size(1))
            s = s[:, :L, :]
            t = t[:, :L, :]
            # align hidden dim by pad/trunc (student -> teacher)
            Hs, Ht = s.size(-1), t.size(-1)
            if Hs != Ht:
                if Hs > Ht: s = s[..., :Ht]
                else:
                    pad = torch.zeros(s.size(0), s.size(1), Ht-Hs, device=s.device, dtype=s.dtype)
                    s = torch.cat([s, pad], dim=-1)
            losses.append(self.mse(s, t))
        return torch.stack(losses).mean() if losses else torch.tensor(0.0, device=hs_s[0].device)

    def attention_loss(self, at_s, at_t):
        # at_*: tuples of length n_layers, each (B, H, L, L)
        n_s = len(at_s); n_t = len(at_t)
        pairs = self.map_layers(n_s, n_t, for_att=True)
        losses = []
        for s_i, t_i in pairs:
            s = at_s[s_i]  # (B, Hs, Ls, Ls)
            t = at_t[t_i]  # (B, Ht, Lt, Lt)
            # average heads to avoid head-count mismatch
            s = s.mean(dim=1)  # (B, Ls, Ls)
            t = t.mean(dim=1)  # (B, Lt, Lt)
            # normalize rows (softmax) to stabilize
            s = torch.softmax(s, dim=-1)
            t = torch.softmax(t, dim=-1)
            # align L
            L = min(s.size(-1), t.size(-1))
            s = s[:, :L, :L]
            t = t[:, :L, :L]
            losses.append(self.mse(s, t))
        return torch.stack(losses).mean() if losses else torch.tensor(0.0, device=at_s[0].device)

    def forward(self, s_pack, t_pack, labels):
        # logits loss
        total, hard, soft = self.logits_loss(s_pack["logits"], t_pack["logits"], labels)

        # hidden loss
        h_loss = self.hidden_loss(s_pack["hidden_states"], t_pack["hidden_states"]) if self.gamma_h > 0 else 0.0
        a_loss = self.attention_loss(s_pack["attentions"],    t_pack["attentions"])    if self.gamma_a > 0 else 0.0

        total = total + self.gamma_h * h_loss + self.gamma_a * a_loss
        parts = {
            "hard_ce": hard,
            "soft_kl": soft,
            "hidden_mse": float(h_loss) if isinstance(h_loss, torch.Tensor) else h_loss,
            "attn_mse":   float(a_loss) if isinstance(a_loss, torch.Tensor) else a_loss
        }
        return total, parts

criterion = KDLossWithHiddenAtt(T=KD_T, alpha=KD_ALPHA, gamma_h=GAMMA_HIDDEN, gamma_a=GAMMA_ATT)


config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

# KD training loop enabling attentions

In [8]:
# Freeze teacher
teacher.eval()
for p in teacher.parameters(): p.requires_grad = False

opt_s = AdamW(student.parameters(), lr=LR_STUDENT, weight_decay=WEIGHT_DECAY)
num_steps_s = EPOCHS_STUDENT * len(train_loader)
warm_steps_s = int(WARMUP_RATIO * num_steps_s)
sch_s = get_linear_schedule_with_warmup(opt_s, warm_steps_s, num_steps_s)

best_f1, wait = -1, 0

def eval_student(loader):
    student.eval(); preds, gold = [], []
    with torch.no_grad():
        for b in loader:
            b = {k:v.to(DEVICE) for k,v in b.items()}
            out = student(input_ids=b["input_ids"], attention_mask=b["attention_mask"])
            preds += out["logits"].argmax(-1).detach().cpu().tolist()
            gold  += b["labels"].detach().cpu().tolist()
    return compute_metrics(np.array(preds), np.array(gold))

for ep in range(1, EPOCHS_STUDENT+1):
    student.train(); run_loss = 0.0
    for b in tqdm(train_loader, desc=f"[Student KD] Epoch {ep}", leave=False):
        labels = b["labels"].to(DEVICE)

        # Student forward (need logits + hidden + attentions)
        s_out = student(
            input_ids=b["input_ids"].to(DEVICE),
            attention_mask=b["attention_mask"].to(DEVICE),
            output_hidden_states=True,
            output_attentions=True
        )

        # Teacher forward (same batch)
        with torch.no_grad():
            t_raw = teacher(
                input_ids=b["input_ids"].to(DEVICE),
                attention_mask=b["attention_mask"].to(DEVICE),
                output_hidden_states=True,
                output_attentions=True,
                return_dict=True
            )
            t_out = {"logits": t_raw.logits, "hidden_states": t_raw.hidden_states, "attentions": t_raw.attentions}

        # KD losses
        loss, parts = criterion(s_out, t_out, labels)

        loss.backward()
        nn.utils.clip_grad_norm_(student.parameters(), 1.0)
        opt_s.step(); sch_s.step(); opt_s.zero_grad()
        run_loss += loss.item()

    val = eval_student(val_loader)
    print(f"[Student KD] loss={run_loss/len(train_loader):.4f} | Val F1m={val['f1_macro']:.4f} Acc={val['accuracy']:.4f}")

    if val["f1_macro"] > best_f1:
        best_f1, wait = val["f1_macro"], 0
        torch.save(student.state_dict(), WORK_DIR / "student_best_sharedtok_attKD.pt")
    else:
        wait += 1
        if wait >= PATIENCE:
            print("Early stop student.")
            break

# reload best
student.load_state_dict(torch.load(WORK_DIR / "student_best_sharedtok_attKD.pt", map_location=DEVICE))
student.eval()
print("‚úÖ Student KD complete (logits + hidden + attention).")


[Student KD] Epoch 1:   0%|          | 0/591 [00:00<?, ?it/s]



[Student KD] loss=1.6385 | Val F1m=0.8982 Acc=0.9162


[Student KD] Epoch 2:   0%|          | 0/591 [00:00<?, ?it/s]

[Student KD] loss=0.8814 | Val F1m=0.9103 Acc=0.9306


[Student KD] Epoch 3:   0%|          | 0/591 [00:00<?, ?it/s]

[Student KD] loss=0.6375 | Val F1m=0.9172 Acc=0.9340


[Student KD] Epoch 4:   0%|          | 0/591 [00:00<?, ?it/s]

[Student KD] loss=0.5206 | Val F1m=0.9104 Acc=0.9280
‚úÖ Student KD complete (logits + hidden + attention).


In [9]:
# Test metrics
teacher_test = evaluate_cls(teacher, test_loader)
student_test = eval_student(test_loader)
print("[Teacher][Test]:", teacher_test)
print("[Student][Test]:", student_test)

# -------- Alignment summary (logit cosine, prob corr, prediction agreement) --------
from scipy.special import softmax
from scipy.spatial.distance import cosine

@torch.no_grad()
def evaluate_alignment(teacher, student, loader):
    teacher.eval(); student.eval()
    logits_cos, prob_corr = [], []
    t_preds_all, s_preds_all = [], []

    for b in loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        t = teacher(input_ids=b["input_ids"], attention_mask=b["attention_mask"])
        s = student(input_ids=b["input_ids"], attention_mask=b["attention_mask"])

        t_logits = t.logits.detach().cpu().numpy()
        s_logits = s["logits"].detach().cpu().numpy()
        t_probs  = softmax(t_logits, axis=-1)
        s_probs  = softmax(s_logits, axis=-1)

        for tl, sl, tp, sp in zip(t_logits, s_logits, t_probs, s_probs):
            logits_cos.append(1 - cosine(tl, sl))
            prob_corr.append(np.corrcoef(tp, sp)[0, 1])
            t_preds_all.append(np.argmax(tp)); s_preds_all.append(np.argmax(sp))

    t_preds_all = np.array(t_preds_all)
    s_preds_all = np.array(s_preds_all)
    return {
        "logit_cosine": float(np.nanmean(logits_cos)),
        "prob_corr": float(np.nanmean(prob_corr)),
        "pred_alignment": float((t_preds_all == s_preds_all).mean())
    }

alignment = evaluate_alignment(teacher, student, test_loader)

print(f"""
üß© Alignment Results (Test):
  üîπ Logit Cosine Similarity : {alignment['logit_cosine']:.4f}
  üîπ Probability Correlation : {alignment['prob_corr']:.4f}
  üîπ Prediction Agreement    : {alignment['pred_alignment']:.4f}
""")

# save artifacts
save_dir = WORK_DIR / "student_model_sharedtok_hiddenKD"
save_dir.mkdir(parents=True, exist_ok=True)
torch.save(student.state_dict(), save_dir / "pytorch_model.bin")
shared_tok.save_pretrained(save_dir)
with open(save_dir / "student_config.json", "w") as f:
    json.dump({
        "base_model": STUDENT_MODEL_ID,
        "num_labels": 2,
        "shared_tokenizer": TEACHER_MODEL_ID,
        "kd_temperature": KD_T,
        "alpha": KD_ALPHA,
        "gamma_hidden": GAMMA_HIDDEN
    }, f, indent=2, ensure_ascii=False)

with open(WORK_DIR / "metrics_sharedtok_hiddenKD.json", "w") as f:
    json.dump({
        "teacher_test": teacher_test,
        "student_test": student_test,
        "alignment": alignment
    }, f, indent=2, ensure_ascii=False)

print("‚úÖ Saved model + metrics to /kaggle/working")


[Teacher][Test]: {'accuracy': 0.9678238780694327, 'f1_macro': 0.9601959073041282, 'f1_weighted': 0.9678533866604009}
[Student][Test]: {'accuracy': 0.930567315834039, 'f1_macro': 0.9142639101365517, 'f1_weighted': 0.9306939442278058}

üß© Alignment Results (Test):
  üîπ Logit Cosine Similarity : 0.8694
  üîπ Probability Correlation : 0.8713
  üîπ Prediction Agreement    : 0.9356

‚úÖ Saved model + metrics to /kaggle/working


In [10]:
from scipy.special import softmax
import inspect

label_names = {0: "NEGATIVE", 1: "POSITIVE"}

def safe_forward_student(model, enc):
    """
    Safely forwards inputs through student even if tokenizer outputs token_type_ids.
    Automatically filters unsupported kwargs.
    """
    sig = inspect.signature(model.encoder.forward)
    allowed = set(sig.parameters.keys())
    valid_enc = {k: v for k, v in enc.items() if k in allowed or k in ["input_ids", "attention_mask"]}
    return model(**valid_enc)

def test_one(text_bn: str):
    # tokenize and move to GPU
    enc = shared_tok(text_bn, truncation=True, padding=True, max_length=MAX_LEN, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        # teacher forward (BanglaBERT)
        t_out = teacher(**enc)
        # student forward (DistilBERT) ‚Äî safely handles extra fields
        s_out = safe_forward_student(student, enc)

    # compute softmax
    t_prob = softmax(t_out.logits.detach().cpu().numpy(), axis=-1)[0]
    s_prob = softmax(s_out["logits"].detach().cpu().numpy(), axis=-1)[0]
    t_pred = int(t_prob.argmax())
    s_pred = int(s_prob.argmax())

    print("üìù Text:", text_bn)
    print(f"üß† Teacher ‚Üí {label_names[t_pred]} (NEG={t_prob[0]:.4f}, POS={t_prob[1]:.4f})")
    print(f"üéì Student ‚Üí {label_names[s_pred]} (NEG={s_prob[0]:.4f}, POS={s_prob[1]:.4f})")
    print("üîÅ Alignment:", "‚úÖ MATCH" if t_pred == s_pred else "‚ö†Ô∏è DIFFERENT")
    print()

# üîç Try a few examples
examples = [
    "‡¶Ü‡¶Æ‡¶ø ‡¶Ü‡¶ú ‡¶ñ‡ßÅ‡¶¨ ‡¶ñ‡ßÅ‡¶∂‡¶ø‡•§",          # positive
    "‡¶è‡¶á ‡¶∏‡¶ø‡¶®‡ßá‡¶Æ‡¶æ‡¶ü‡¶æ ‡¶è‡¶ï‡¶¶‡¶Æ ‡¶¨‡¶æ‡¶ú‡ßá‡•§",     # negative
    "‡¶Ü‡¶ú‡¶ï‡ßá ‡¶¶‡¶ø‡¶®‡¶ü‡¶æ ‡¶∏‡¶§‡ßç‡¶Ø‡¶ø‡¶á ‡¶∏‡ßÅ‡¶®‡ßç‡¶¶‡¶∞ ‡¶õ‡¶ø‡¶≤‡•§",
    "‡¶è‡¶á ‡¶∞‡ßá‡¶∏‡ßç‡¶ü‡ßÅ‡¶∞‡ßá‡¶®‡ßç‡¶ü‡ßá‡¶∞ ‡¶ñ‡¶æ‡¶¨‡¶æ‡¶∞‡¶ü‡¶æ ‡¶≠‡¶æ‡¶≤‡ßã ‡¶®‡¶æ‡•§"
]

for tx in examples:
    print("-" * 60)
    test_one(tx)



------------------------------------------------------------
üìù Text: ‡¶Ü‡¶Æ‡¶ø ‡¶Ü‡¶ú ‡¶ñ‡ßÅ‡¶¨ ‡¶ñ‡ßÅ‡¶∂‡¶ø‡•§
üß† Teacher ‚Üí POSITIVE (NEG=0.0007, POS=0.9993)
üéì Student ‚Üí POSITIVE (NEG=0.0007, POS=0.9993)
üîÅ Alignment: ‚úÖ MATCH

------------------------------------------------------------
üìù Text: ‡¶è‡¶á ‡¶∏‡¶ø‡¶®‡ßá‡¶Æ‡¶æ‡¶ü‡¶æ ‡¶è‡¶ï‡¶¶‡¶Æ ‡¶¨‡¶æ‡¶ú‡ßá‡•§
üß† Teacher ‚Üí NEGATIVE (NEG=0.9945, POS=0.0055)
üéì Student ‚Üí NEGATIVE (NEG=0.9962, POS=0.0038)
üîÅ Alignment: ‚úÖ MATCH

------------------------------------------------------------
üìù Text: ‡¶Ü‡¶ú‡¶ï‡ßá ‡¶¶‡¶ø‡¶®‡¶ü‡¶æ ‡¶∏‡¶§‡ßç‡¶Ø‡¶ø‡¶á ‡¶∏‡ßÅ‡¶®‡ßç‡¶¶‡¶∞ ‡¶õ‡¶ø‡¶≤‡•§
üß† Teacher ‚Üí POSITIVE (NEG=0.0005, POS=0.9995)
üéì Student ‚Üí POSITIVE (NEG=0.0004, POS=0.9996)
üîÅ Alignment: ‚úÖ MATCH

------------------------------------------------------------
üìù Text: ‡¶è‡¶á ‡¶∞‡ßá‡¶∏‡ßç‡¶ü‡ßÅ‡¶∞‡ßá‡¶®‡ßç‡¶ü‡ßá‡¶∞ ‡¶ñ‡¶æ‡¶¨‡¶æ‡¶∞‡¶ü‡¶æ ‡¶≠‡¶æ‡¶≤‡ßã ‡¶®‡¶æ‡•§
üß† Teacher ‚Üí NEGATIVE (NEG=0.9940, POS=0.0060