In [1]:
!pip install --upgrade --no-cache-dir transformers==4.45.2 -q
!pip install --no-cache-dir scikit-learn pandas tqdm matplotlib sentencepiece -q

import os, json, random, numpy as np, torch
from pathlib import Path

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
WORK_DIR = Path("/kaggle/working"); WORK_DIR.mkdir(parents=True, exist_ok=True)

# --- Model config
TEACHER_MODEL_ID = "csebuetnlp/banglabert"
STUDENT_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
MAX_LEN = 128
BATCH_SIZE = 16

# --- Fine-tuning + KD parameters
EPOCHS_TEACHER = 3
LR_TEACHER = 2e-5
EPOCHS_STUDENT = 5
LR_STUDENT = 3e-5
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
PATIENCE = 2

KD_T = 3.0
KD_ALPHA = 0.5
GAMMA_HIDDEN = 1.0

print("‚úÖ Device:", DEVICE)


[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.4/44.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.9/9.9 MB[0m [31m114.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m566.1/566.1 kB[0m [31m311.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.0/3.0 MB[0m [31m299.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
da

# Imports, Config

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path

DATA_DIR = Path("/kaggle/input/dataaaaaa")
POS_FILE = DATA_DIR / "all_positive_8500.txt"
NEG_FILE = DATA_DIR / "all_negative_3307.txt"
assert POS_FILE.exists() and NEG_FILE.exists(), f"Missing files!"

def read_txt(p):
    with open(p, encoding="utf-8") as f:
        return [ln.strip() for ln in f if ln.strip()]

pos = read_txt(POS_FILE)
neg = read_txt(NEG_FILE)

df = pd.DataFrame({"text": pos + neg, "label": [1]*len(pos) + [0]*len(neg)}).sample(frac=1, random_state=SEED)
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=SEED)
print(f"Train={len(train_df)} | Val={len(val_df)} | Test={len(test_df)}")


Train=9445 | Val=1181 | Test=1181


# Fine-tune Teacher (BanglaBERT)

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm

class TxtClsDataset(torch.utils.data.Dataset):
    def __init__(self, df, tok, max_len):
        self.texts = df.text.tolist(); self.labels = df.label.tolist()
        self.tok, self.max_len = tok, max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, i):
        enc = self.tok(self.texts[i], truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[i], dtype=torch.long)
        }

teacher_tok = AutoTokenizer.from_pretrained(TEACHER_MODEL_ID)
teacher = AutoModelForSequenceClassification.from_pretrained(TEACHER_MODEL_ID, num_labels=2).to(DEVICE)

train_loader = DataLoader(TxtClsDataset(train_df, teacher_tok, MAX_LEN), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(TxtClsDataset(val_df, teacher_tok, MAX_LEN), batch_size=BATCH_SIZE)
test_loader = DataLoader(TxtClsDataset(test_df, teacher_tok, MAX_LEN), batch_size=BATCH_SIZE)

opt = AdamW(teacher.parameters(), lr=LR_TEACHER, weight_decay=WEIGHT_DECAY)
steps = len(train_loader) * EPOCHS_TEACHER
sch = get_linear_schedule_with_warmup(opt, int(WARMUP_RATIO * steps), steps)

best_f1 = -1.0
for ep in range(1, EPOCHS_TEACHER+1):
    teacher.train(); run = 0.0
    for b in tqdm(train_loader, desc=f"Teacher Epoch {ep}/{EPOCHS_TEACHER}"):
        b = {k:v.to(DEVICE) for k,v in b.items()}
        out = teacher(**b)
        loss = out.loss
        loss.backward()
        opt.step(); sch.step(); opt.zero_grad()
        run += loss.item()

    teacher.eval(); preds, gold = [], []
    with torch.no_grad():
        for b in val_loader:
            b = {k:v.to(DEVICE) for k,v in b.items()}
            out = teacher(**b)
            preds += out.logits.argmax(-1).cpu().tolist()
            gold  += b["labels"].cpu().tolist()
    acc = accuracy_score(gold, preds)
    f1m = f1_score(gold, preds, average="macro")
    print(f"[Val] Acc={acc:.4f} | F1_macro={f1m:.4f}")
    if f1m > best_f1:
        best_f1 = f1m
        save_dir = WORK_DIR / "finetuned_banglabert"
        save_dir.mkdir(parents=True, exist_ok=True)
        teacher.save_pretrained(save_dir)
        teacher_tok.save_pretrained(save_dir)
        print("üíæ Saved best teacher.")


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Teacher Epoch 1/3:   0%|          | 0/591 [00:00<?, ?it/s]

[Val] Acc=0.9577 | F1_macro=0.9489
üíæ Saved best teacher.


Teacher Epoch 2/3:   0%|          | 0/591 [00:00<?, ?it/s]

[Val] Acc=0.9670 | F1_macro=0.9586
üíæ Saved best teacher.


Teacher Epoch 3/3:   0%|          | 0/591 [00:00<?, ?it/s]

[Val] Acc=0.9678 | F1_macro=0.9601
üíæ Saved best teacher.


# Transliteration KD Dataset

In [4]:
from torch.utils.data import Dataset, DataLoader

class KDDataset(Dataset):
    def __init__(self, df, teacher_tok, student_tok, max_len=128):
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.ttok = teacher_tok
        self.stok = student_tok
        self.max_len = max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        t_enc = self.ttok(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        s_enc = self.stok(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        return {
            "t_input_ids": t_enc["input_ids"].squeeze(),
            "t_attention_mask": t_enc["attention_mask"].squeeze(),
            "s_input_ids": s_enc["input_ids"].squeeze(),
            "s_attention_mask": s_enc["attention_mask"].squeeze(),
            "labels": torch.tensor(label, dtype=torch.long)
        }

from transformers import AutoTokenizer
student_tok = AutoTokenizer.from_pretrained(STUDENT_MODEL_ID)

train_loader = DataLoader(KDDataset(train_df, teacher_tok, student_tok, MAX_LEN), batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(KDDataset(val_df, teacher_tok, student_tok, MAX_LEN), batch_size=BATCH_SIZE)
test_loader  = DataLoader(KDDataset(test_df, teacher_tok, student_tok, MAX_LEN), batch_size=BATCH_SIZE)

print("‚úÖ KD Dataloaders ready (BanglaBERT ‚Üí MiniLM)")


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

‚úÖ KD Dataloaders ready (BanglaBERT ‚Üí MiniLM)


In [5]:
from transformers import AutoModel
import torch.nn.functional as F
import torch.nn as nn

class StudentClassifier(nn.Module):
    def __init__(self, base_model, num_labels=2, dropout=0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model)
        hidden = self.encoder.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden, num_labels)
    def forward(self, input_ids=None, attention_mask=None, output_hidden_states=True):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask,
                           output_hidden_states=output_hidden_states, return_dict=True)
        cls = out.last_hidden_state[:, 0, :]
        logits = self.fc(self.dropout(cls))
        return {"logits": logits, "hidden_states": out.hidden_states}

student = StudentClassifier(STUDENT_MODEL_ID).to(DEVICE)
print("‚úÖ Student loaded:", STUDENT_MODEL_ID)


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

‚úÖ Student loaded: sentence-transformers/all-MiniLM-L6-v2


# Student model + Logit + Hidden 

In [8]:
import torch.nn.functional as F
import torch.nn as nn

# üéØ Logit-based KD Loss (no hidden-state matching)
class KDLossLogitsOnly(nn.Module):
    def __init__(self, T=3.0, alpha=0.5):
        super().__init__()
        self.T = T
        self.alpha = alpha
        self.ce = nn.CrossEntropyLoss()
        self.kld = nn.KLDivLoss(reduction="batchmean")

    def forward(self, s_pack, t_pack, labels):
        logits_s, logits_t = s_pack["logits"], t_pack["logits"]

        # Standard cross-entropy loss (hard labels)
        hard_loss = self.ce(logits_s, labels)

        # KL divergence loss (soft labels from teacher)
        soft_loss = self.kld(
            F.log_softmax(logits_s / self.T, dim=-1),
            F.softmax(logits_t / self.T, dim=-1)
        ) * (self.T ** 2)

        # Weighted sum
        total_loss = (1 - self.alpha) * hard_loss + self.alpha * soft_loss
        return total_loss

criterion = KDLossLogitsOnly(T=KD_T, alpha=KD_ALPHA)
print("‚úÖ KD Loss ready (logits only ‚Äî no hidden KD)")


‚úÖ KD Loss ready (logits only ‚Äî no hidden KD)


# KD training loop enabling attentions

In [9]:
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score

# Freeze teacher
teacher.eval()
for p in teacher.parameters(): 
    p.requires_grad = False

# Optimizer + Scheduler
opt = AdamW(student.parameters(), lr=LR_STUDENT, weight_decay=WEIGHT_DECAY)
steps = len(train_loader) * EPOCHS_STUDENT
sch = get_linear_schedule_with_warmup(opt, int(WARMUP_RATIO * steps), steps)

# Helper metrics
def compute_metrics(preds, gold):
    return {
        "accuracy": accuracy_score(gold, preds),
        "f1_macro": f1_score(gold, preds, average="macro"),
        "f1_weighted": f1_score(gold, preds, average="weighted")
    }

@torch.no_grad()
def eval_student(loader):
    student.eval()
    preds, gold = [], []
    for b in loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        out = student(input_ids=b["s_input_ids"], attention_mask=b["s_attention_mask"])
        preds += out["logits"].argmax(-1).cpu().tolist()
        gold  += b["labels"].cpu().tolist()
    return compute_metrics(np.array(preds), np.array(gold))

best_f1, wait = -1, 0
for ep in range(1, EPOCHS_STUDENT + 1):
    student.train(); run_loss = 0.0
    for b in tqdm(train_loader, desc=f"[KD Epoch {ep}/{EPOCHS_STUDENT}]"):
        labels = b["labels"].to(DEVICE)

        # Student forward
        s_out = student(
            input_ids=b["s_input_ids"].to(DEVICE),
            attention_mask=b["s_attention_mask"].to(DEVICE),
            output_hidden_states=False
        )

        # Teacher forward (frozen)
        with torch.no_grad():
            t_raw = teacher(
                input_ids=b["t_input_ids"].to(DEVICE),
                attention_mask=b["t_attention_mask"].to(DEVICE),
                output_hidden_states=False,
                return_dict=True
            )
            t_out = {"logits": t_raw.logits}

        # Compute KD loss
        loss = criterion(s_out, t_out, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(student.parameters(), 1.0)
        opt.step(); sch.step(); opt.zero_grad()
        run_loss += loss.item()

    # Validation
    val = eval_student(val_loader)
    print(f"[KD Epoch {ep}] loss={run_loss/len(train_loader):.4f} | "
          f"Val Acc={val['accuracy']:.4f} | F1m={val['f1_macro']:.4f} | F1w={val['f1_weighted']:.4f}")

    if val["f1_macro"] > best_f1:
        best_f1, wait = val["f1_macro"], 0
        torch.save(student.state_dict(), WORK_DIR / "student_best_logitsKD.pt")
        print("üíæ Saved best student checkpoint.")
    else:
        wait += 1
        if wait >= PATIENCE:
            print("‚è∏Ô∏è Early stopping.")
            break

student.load_state_dict(torch.load(WORK_DIR / "student_best_logitsKD.pt", map_location=DEVICE))
student.eval()
print("‚úÖ KD training complete (logits-only).")


[KD Epoch 1/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD Epoch 1] loss=1.1971 | Val Acc=0.8679 | F1m=0.8246 | F1w=0.8629
üíæ Saved best student checkpoint.


[KD Epoch 2/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD Epoch 2] loss=0.7433 | Val Acc=0.8840 | F1m=0.8608 | F1w=0.8858
üíæ Saved best student checkpoint.


[KD Epoch 3/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD Epoch 3] loss=0.6146 | Val Acc=0.8848 | F1m=0.8624 | F1w=0.8868
üíæ Saved best student checkpoint.


[KD Epoch 4/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD Epoch 4] loss=0.5398 | Val Acc=0.8975 | F1m=0.8745 | F1w=0.8981
üíæ Saved best student checkpoint.


[KD Epoch 5/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD Epoch 5] loss=0.4751 | Val Acc=0.9018 | F1m=0.8796 | F1w=0.9023
üíæ Saved best student checkpoint.
‚úÖ KD training complete (logits-only).


In [10]:
from scipy.special import softmax
from scipy.spatial.distance import cosine
import json
import numpy as np
import torch

# --------------------------------------------------------------
# üßæ Evaluate classification metrics
# --------------------------------------------------------------
@torch.no_grad()
def evaluate_model(model, loader, mode="student"):
    preds, gold = [], []
    for b in loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        inp = {
            "input_ids": b["t_input_ids"] if mode == "teacher" else b["s_input_ids"],
            "attention_mask": b["t_attention_mask"] if mode == "teacher" else b["s_attention_mask"],
        }
        out = model(**inp)
        logits = out["logits"] if isinstance(out, dict) else out.logits
        preds += logits.argmax(-1).cpu().tolist()
        gold  += b["labels"].cpu().tolist()
    return {
        "accuracy": accuracy_score(gold, preds),
        "f1_macro": f1_score(gold, preds, average="macro"),
        "f1_weighted": f1_score(gold, preds, average="weighted"),
    }

print("Evaluating Teacher (BanglaBERT) ...")
teacher_test = evaluate_model(teacher, test_loader, mode="teacher")
print("Evaluating Student (MiniLM) ...")
student_test = evaluate_model(student, test_loader, mode="student")

print("\n===== üìà Test Metrics =====")
print(f"üß† Teacher:  Acc={teacher_test['accuracy']:.4f} | "
      f"F1_macro={teacher_test['f1_macro']:.4f} | "
      f"F1_weighted={teacher_test['f1_weighted']:.4f}")
print(f"üéì Student:  Acc={student_test['accuracy']:.4f} | "
      f"F1_macro={student_test['f1_macro']:.4f} | "
      f"F1_weighted={student_test['f1_weighted']:.4f}")

# --------------------------------------------------------------
# üîó Alignment metrics (cosine, correlation, agreement)
# --------------------------------------------------------------
@torch.no_grad()
def evaluate_alignment(teacher, student, loader):
    cosine_list, corr_list, agree_list = [], [], []
    for b in loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        t_out = teacher(b["t_input_ids"], b["t_attention_mask"])
        s_out = student(b["s_input_ids"], b["s_attention_mask"])
        t_logits = t_out.logits.detach().cpu().numpy()
        s_logits = s_out["logits"].detach().cpu().numpy()
        t_probs = softmax(t_logits, axis=-1)
        s_probs = softmax(s_logits, axis=-1)

        for tl, sl, tp, sp in zip(t_logits, s_logits, t_probs, s_probs):
            cosine_list.append(1 - cosine(tl, sl))
            corr_list.append(np.corrcoef(tp, sp)[0, 1])
            agree_list.append(np.argmax(tp) == np.argmax(sp))

    return {
        "logit_cosine": float(np.nanmean(cosine_list)),
        "prob_corr": float(np.nanmean(corr_list)),
        "pred_alignment": float(np.mean(agree_list)),
    }

alignment = evaluate_alignment(teacher, student, test_loader)

print(f"""
===== üîó Alignment Metrics =====
üîπ Logit Cosine Similarity : {alignment['logit_cosine']:.4f}
üîπ Probability Correlation : {alignment['prob_corr']:.4f}
üîπ Prediction Agreement    : {alignment['pred_alignment']:.4f}
""")

# --------------------------------------------------------------
# üíæ Save model + metrics
# --------------------------------------------------------------
SAVE_DIR = WORK_DIR / "student_minilm_logitsKD"
SAVE_DIR.mkdir(parents=True, exist_ok=True)
torch.save(student.state_dict(), SAVE_DIR / "pytorch_model.bin")
student_tok.save_pretrained(SAVE_DIR)

metrics = {
    "teacher_test": teacher_test,
    "student_test": student_test,
    "alignment": alignment,
}
with open(WORK_DIR / "metrics_minilm_logitsKD.json", "w") as f:
    json.dump(metrics, f, indent=2, ensure_ascii=False)

print("‚úÖ Saved model + metrics to", SAVE_DIR)


Evaluating Teacher (BanglaBERT) ...
Evaluating Student (MiniLM) ...

===== üìà Test Metrics =====
üß† Teacher:  Acc=0.9687 | F1_macro=0.9611 | F1_weighted=0.9687
üéì Student:  Acc=0.8984 | F1_macro=0.8759 | F1_weighted=0.8991

===== üîó Alignment Metrics =====
üîπ Logit Cosine Similarity : 0.8138
üîπ Probability Correlation : 0.8188
üîπ Prediction Agreement    : 0.9094

‚úÖ Saved model + metrics to /kaggle/working/student_minilm_logitsKD
