In [1]:
!pip install --upgrade --no-cache-dir transformers==4.45.2 -q
!pip install --no-cache-dir scikit-learn pandas tqdm matplotlib sentencepiece -q

import os, json, random, numpy as np, torch
from pathlib import Path

# Repro & device
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
WORK_DIR = Path("/kaggle/working"); WORK_DIR.mkdir(parents=True, exist_ok=True)

# Models
TEACHER_MODEL_ID = "xlm-roberta-base"
STUDENT_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"  # student head will be freshly initialized

# Data / training config
MAX_LEN   = 128
BATCH_SIZE = 16

# Teacher FT
EPOCHS_TEACHER   = 3
LR_TEACHER       = 2e-5
WARMUP_RATIO_T   = 0.1
WEIGHT_DECAY_T   = 0.01

# KD (logits-only)
EPOCHS_STUDENT   = 5
LR_STUDENT       = 3e-5
WARMUP_RATIO_S   = 0.1
WEIGHT_DECAY_S   = 0.01
PATIENCE         = 2

KD_T      = 3.0
KD_ALPHA  = 0.5

print("‚úÖ Device:", DEVICE)


[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.4/44.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.9/9.9 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m566.1/566.1 kB[0m [31m139.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.0/3.0 MB[0m [31m75.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependen

# Imports, Config

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

DATA_DIR = Path("/kaggle/input/dataaaaaa")
POS_FILE = DATA_DIR / "all_positive_8500.txt"
NEG_FILE = DATA_DIR / "all_negative_3307.txt"
assert POS_FILE.exists() and NEG_FILE.exists(), f"Missing files:\n{POS_FILE}\n{NEG_FILE}"

def read_txt(p: Path):
    with open(p, encoding="utf-8") as f:
        return [ln.strip() for ln in f if ln.strip()]

pos = read_txt(POS_FILE)
neg = read_txt(NEG_FILE)

df = pd.DataFrame({
    "text":  pos + neg,
    "label": [1]*len(pos) + [0]*len(neg)
}).sample(frac=1, random_state=SEED).reset_index(drop=True)

train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=SEED)
val_df,   test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=SEED)

print(f"Train={len(train_df)} | Val={len(val_df)} | Test={len(test_df)}")
train_df.head()


Train=9445 | Val=1181 | Test=1181


Unnamed: 0,text,label
11010,‡¶Ü‡¶™‡¶®‡¶æ‡¶∞‡ßá ‡¶†‡ßç‡¶Ø‡¶æ‡¶Ç ‡¶Ü‡¶¨‡¶æ‡¶∞‡¶ì ‡¶†‡ßç‡¶Ø‡¶æ‡¶Ç,1
2147,‡¶Æ‡¶æ‡¶® ‡¶∏‡¶Æ‡ßç‡¶Æ‡¶§ ‡¶®‡¶æ‡¶ü‡¶ï ‡¶§‡ßà‡¶∞‡¶ø ‡¶ï‡¶∞‡ßÅ‡¶®‡•§‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶á‡¶Æ‡ßá‡¶ú ‡¶®‡¶∑‡ßç‡¶ü ‡¶ï‡¶∞‡¶¨‡ßá‡¶®...,0
2509,‡¶Ö‡¶®‡ßá‡¶ï ‡¶Ö‡¶®‡ßá‡¶ï ‡¶∏‡ßÅ‡¶®‡ßç‡¶¶‡¶∞ ‡¶π‡ßü‡¶õ‡¶ø,1
8380,‡¶è‡¶ï ‡¶ï‡¶•‡¶æ‡ßü ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶® ‡¶è‡¶ï‡¶ü‡¶æ ‡¶®‡¶æ‡¶ü‡¶ï...‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶¶‡ßá‡¶ñ‡¶æ best ‡¶®‡¶æ‡¶ü...,1
3082,‡¶®‡¶æ‡¶ü‡¶ï‡ßá‡¶∞ ‡¶Æ‡¶æ‡¶ù‡ßá ‡¶è‡¶° ‡¶¶‡ßá‡ßü ‡¶ï‡ßá‡¶® ‡¶¨‡¶æ‡¶≤,0


# Fine-tune Teacher xlmr

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score

class TxtClsDataset(Dataset):
    def __init__(self, df, tok, max_len):
        self.texts = df.text.tolist()
        self.labels = df.label.tolist()
        self.tok = tok; self.max_len = max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, i):
        enc = self.tok(self.texts[i], truncation=True, padding="max_length",
                       max_length=self.max_len, return_tensors="pt")
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[i], dtype=torch.long)
        }

# Load teacher
teacher_tok = AutoTokenizer.from_pretrained(TEACHER_MODEL_ID)
teacher = AutoModelForSequenceClassification.from_pretrained(TEACHER_MODEL_ID, num_labels=2).to(DEVICE)

# Dataloaders
train_teacher_loader = DataLoader(TxtClsDataset(train_df, teacher_tok, MAX_LEN), batch_size=BATCH_SIZE, shuffle=True)
val_teacher_loader   = DataLoader(TxtClsDataset(val_df,   teacher_tok, MAX_LEN), batch_size=BATCH_SIZE)
test_teacher_loader  = DataLoader(TxtClsDataset(test_df,  teacher_tok, MAX_LEN), batch_size=BATCH_SIZE)

# Optimizer & scheduler (PyTorch AdamW = no deprecation warning)
opt_t = torch.optim.AdamW(teacher.parameters(), lr=LR_TEACHER, weight_decay=WEIGHT_DECAY_T)
steps_t = len(train_teacher_loader) * EPOCHS_TEACHER
sch_t   = get_linear_schedule_with_warmup(opt_t, int(WARMUP_RATIO_T*steps_t), steps_t)

best_f1 = -1.0
for ep in range(1, EPOCHS_TEACHER+1):
    teacher.train(); run = 0.0
    for b in tqdm(train_teacher_loader, desc=f"Teacher Epoch {ep}/{EPOCHS_TEACHER}"):
        b = {k:v.to(DEVICE) for k,v in b.items()}
        out = teacher(**b)
        loss = out.loss
        loss.backward()
        opt_t.step(); sch_t.step(); opt_t.zero_grad()
        run += loss.item()

    # Validate
    teacher.eval(); preds, gold = [], []
    with torch.no_grad():
        for b in val_teacher_loader:
            b = {k:v.to(DEVICE) for k,v in b.items()}
            logits = teacher(**b).logits
            preds += logits.argmax(-1).cpu().tolist()
            gold  += b["labels"].cpu().tolist()
    acc = accuracy_score(gold, preds)
    f1m = f1_score(gold, preds, average="macro")
    print(f"[Val] Acc={acc:.4f} | F1_macro={f1m:.4f}")

    if f1m > best_f1:
        best_f1 = f1m
        save_dir = WORK_DIR / "finetuned_xlmr_teacher"
        save_dir.mkdir(parents=True, exist_ok=True)
        teacher.save_pretrained(save_dir); teacher_tok.save_pretrained(save_dir)
        print("üíæ Saved best teacher to", save_dir)

# Quick test
teacher.eval(); preds, gold = [], []
with torch.no_grad():
    for b in test_teacher_loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        logits = teacher(**b).logits
        preds += logits.argmax(-1).cpu().tolist()
        gold  += b["labels"].cpu().tolist()
print("‚úÖ Teacher[Test]: Acc={:.4f} | F1_macro={:.4f}".format(
    accuracy_score(gold, preds), f1_score(gold, preds, average="macro")))


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Teacher Epoch 1/3:   0%|          | 0/591 [00:00<?, ?it/s]

[Val] Acc=0.9433 | F1_macro=0.9318
üíæ Saved best teacher to /kaggle/working/finetuned_xlmr_teacher


Teacher Epoch 2/3:   0%|          | 0/591 [00:00<?, ?it/s]

[Val] Acc=0.9610 | F1_macro=0.9518
üíæ Saved best teacher to /kaggle/working/finetuned_xlmr_teacher


Teacher Epoch 3/3:   0%|          | 0/591 [00:00<?, ?it/s]

[Val] Acc=0.9644 | F1_macro=0.9562
üíæ Saved best teacher to /kaggle/working/finetuned_xlmr_teacher
‚úÖ Teacher[Test]: Acc=0.9543 | F1_macro=0.9438


# Transliteration KD Dataset

In [4]:
from torch.utils.data import Dataset, DataLoader

# Student tokenizer
student_tok = AutoTokenizer.from_pretrained(STUDENT_MODEL_ID)

class KDDataset(Dataset):
    def __init__(self, df, teacher_tok, student_tok, max_len=128):
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.ttok = teacher_tok; self.stok = student_tok; self.max_len = max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        txt = self.texts[idx]; label = self.labels[idx]
        t_enc = self.ttok(txt, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        s_enc = self.stok(txt, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        return {
            "t_input_ids": t_enc["input_ids"].squeeze(0),
            "t_attention_mask": t_enc["attention_mask"].squeeze(0),
            "s_input_ids": s_enc["input_ids"].squeeze(0),
            "s_attention_mask": s_enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

train_loader = DataLoader(KDDataset(train_df, teacher_tok, student_tok, MAX_LEN), batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(KDDataset(val_df,   teacher_tok, student_tok, MAX_LEN), batch_size=BATCH_SIZE)
test_loader  = DataLoader(KDDataset(test_df,  teacher_tok, student_tok, MAX_LEN), batch_size=BATCH_SIZE)

print("‚úÖ KD dataloaders ready (XLM-R ‚Üí MiniLM, logits-only)")


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

‚úÖ KD dataloaders ready (XLM-R ‚Üí MiniLM, logits-only)


# Student model + Logit + Hidden 

In [5]:
from transformers import AutoModelForSequenceClassification

# This will initialize a fresh classification head for 2 labels
student = AutoModelForSequenceClassification.from_pretrained(
    STUDENT_MODEL_ID, num_labels=2
).to(DEVICE)

print("‚úÖ Student ready:", STUDENT_MODEL_ID)


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Student ready: sentence-transformers/all-MiniLM-L6-v2


# KD training loop enabling attentions

In [6]:
import torch.nn as nn
import torch.nn.functional as F

class KDLossLogitsOnly(nn.Module):
    def __init__(self, T=3.0, alpha=0.5):
        super().__init__()
        self.T = T
        self.alpha = alpha
        self.ce = nn.CrossEntropyLoss()
        self.kld = nn.KLDivLoss(reduction="batchmean")

    def forward(self, s_pack, t_pack, labels):
        logits_s = s_pack["logits"] if isinstance(s_pack, dict) else s_pack.logits
        logits_t = t_pack["logits"] if isinstance(t_pack, dict) else t_pack.logits

        hard = self.ce(logits_s, labels)
        soft = self.kld(
            F.log_softmax(logits_s / self.T, dim=-1),
            F.softmax(logits_t / self.T, dim=-1)
        ) * (self.T ** 2)

        return (1 - self.alpha) * hard + self.alpha * soft

criterion = KDLossLogitsOnly(T=KD_T, alpha=KD_ALPHA)
print("‚úÖ KD criterion (logits-only) ready.")


‚úÖ KD criterion (logits-only) ready.


In [7]:
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score

# Freeze teacher
teacher.eval()
for p in teacher.parameters(): 
    p.requires_grad = False

# Optimizer & scheduler for student (PyTorch AdamW)
opt_s = torch.optim.AdamW(student.parameters(), lr=LR_STUDENT, weight_decay=WEIGHT_DECAY_S)
steps_s = len(train_loader) * EPOCHS_STUDENT
sch_s   = get_linear_schedule_with_warmup(opt_s, int(WARMUP_RATIO_S*steps_s), steps_s)

def compute_metrics(preds, gold):
    return {
        "accuracy": accuracy_score(gold, preds),
        "f1_macro": f1_score(gold, preds, average="macro"),
        "f1_weighted": f1_score(gold, preds, average="weighted"),
    }

@torch.no_grad()
def eval_student(loader):
    student.eval()
    preds, gold = [], []
    for b in loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        out = student(input_ids=b["s_input_ids"], attention_mask=b["s_attention_mask"])
        preds += out.logits.argmax(-1).cpu().tolist()
        gold  += b["labels"].cpu().tolist()
    return compute_metrics(np.array(preds), np.array(gold))

best_f1, wait = -1.0, 0
for ep in range(1, EPOCHS_STUDENT+1):
    student.train(); run = 0.0
    for b in tqdm(train_loader, desc=f"[KD Epoch {ep}/{EPOCHS_STUDENT}]"):
        labels = b["labels"].to(DEVICE)

        # Student forward
        s_out = student(input_ids=b["s_input_ids"].to(DEVICE),
                        attention_mask=b["s_attention_mask"].to(DEVICE),
                        output_hidden_states=False)

        # Teacher forward (frozen)
        with torch.no_grad():
            t_out = teacher(input_ids=b["t_input_ids"].to(DEVICE),
                            attention_mask=b["t_attention_mask"].to(DEVICE),
                            output_hidden_states=False,
                            return_dict=True)

        loss = criterion(s_out, t_out, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(student.parameters(), 1.0)
        opt_s.step(); sch_s.step(); opt_s.zero_grad()
        run += loss.item()

    val = eval_student(val_loader)
    print(f"[KD] loss={run/len(train_loader):.4f} | "
          f"Val Acc={val['accuracy']:.4f} | F1m={val['f1_macro']:.4f} | F1w={val['f1_weighted']:.4f}")

    if val["f1_macro"] > best_f1:
        best_f1, wait = val["f1_macro"], 0
        torch.save(student.state_dict(), WORK_DIR / "student_best_logitsKD.pt")
        print("üíæ Saved best student.")
    else:
        wait += 1
        if wait >= PATIENCE:
            print("‚è∏Ô∏è Early stopping.")
            break

# Reload best
student.load_state_dict(torch.load(WORK_DIR / "student_best_logitsKD.pt", map_location=DEVICE))
student.eval()
print("‚úÖ KD training complete (logits-only).")


[KD Epoch 1/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=1.2559 | Val Acc=0.8349 | F1m=0.7748 | F1w=0.8259
üíæ Saved best student.


[KD Epoch 2/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=0.8311 | Val Acc=0.8645 | F1m=0.8159 | F1w=0.8575
üíæ Saved best student.


[KD Epoch 3/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=0.6040 | Val Acc=0.8831 | F1m=0.8491 | F1w=0.8806
üíæ Saved best student.


[KD Epoch 4/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=0.5084 | Val Acc=0.8967 | F1m=0.8700 | F1w=0.8959
üíæ Saved best student.


[KD Epoch 5/5]:   0%|          | 0/591 [00:00<?, ?it/s]

[KD] loss=0.4545 | Val Acc=0.8950 | F1m=0.8706 | F1w=0.8953
üíæ Saved best student.
‚úÖ KD training complete (logits-only).


In [8]:
from scipy.special import softmax
from scipy.spatial.distance import cosine

@torch.no_grad()
def evaluate_model(model, loader, mode="student"):
    preds, gold = [], []
    for b in loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        inp = {"input_ids": b["t_input_ids"], "attention_mask": b["t_attention_mask"]} if mode=="teacher" else \
              {"input_ids": b["s_input_ids"], "attention_mask": b["s_attention_mask"]}
        out = model(**inp)
        logits = out.logits if hasattr(out, "logits") else out["logits"]
        preds += logits.argmax(-1).cpu().tolist()
        gold  += b["labels"].cpu().tolist()
    return {
        "accuracy": accuracy_score(gold, preds),
        "f1_macro": f1_score(gold, preds, average="macro"),
        "f1_weighted": f1_score(gold, preds, average="weighted"),
    }

print("üß™ Evaluating on test‚Ä¶")
teacher_test = evaluate_model(teacher, test_loader, mode="teacher")
student_test = evaluate_model(student, test_loader, mode="student")
print("[Teacher][Test]:", teacher_test)
print("[Student][Test]:", student_test)

@torch.no_grad()
def evaluate_alignment(teacher, student, loader):
    cos_list, corr_list, agree_list = [], [], []
    for b in loader:
        b = {k:v.to(DEVICE) for k,v in b.items()}
        t_out = teacher(b["t_input_ids"], b["t_attention_mask"])
        s_out = student(b["s_input_ids"], b["s_attention_mask"])
        t_logits = t_out.logits.detach().cpu().numpy()
        s_logits = s_out.logits.detach().cpu().numpy()
        t_probs  = softmax(t_logits, axis=-1)
        s_probs  = softmax(s_logits, axis=-1)
        for tl, sl, tp, sp in zip(t_logits, s_logits, t_probs, s_probs):
            cos_list.append(1 - cosine(tl, sl))
            corr_list.append(np.corrcoef(tp, sp)[0, 1])
            agree_list.append(np.argmax(tp) == np.argmax(sp))
    return {
        "logit_cosine": float(np.nanmean(cos_list)),
        "prob_corr": float(np.nanmean(corr_list)),
        "pred_alignment": float(np.mean(agree_list))
    }

alignment = evaluate_alignment(teacher, student, test_loader)
print(f"""
===== üîó Alignment (Test) =====
üîπ Logit Cosine Similarity : {alignment['logit_cosine']:.4f}
üîπ Probability Correlation : {alignment['prob_corr']:.4f}
üîπ Prediction Agreement    : {alignment['pred_alignment']:.4f}
""")

# Save artifacts
SAVE_DIR = WORK_DIR / "student_minilm_logitsKD"
SAVE_DIR.mkdir(parents=True, exist_ok=True)
torch.save(student.state_dict(), SAVE_DIR / "pytorch_model.bin")
student_tok.save_pretrained(SAVE_DIR)

# serialize to pure Python types
def to_py(obj):
    if isinstance(obj, dict): return {k: to_py(v) for k,v in obj.items()}
    if hasattr(obj, "item"): return obj.item()
    return obj

with open(WORK_DIR / "metrics_minilm_logitsKD.json", "w") as f:
    json.dump({"teacher_test": to_py(teacher_test),
               "student_test": to_py(student_test),
               "alignment": to_py(alignment)}, f, indent=2, ensure_ascii=False)

print("‚úÖ Saved model to", SAVE_DIR, "and metrics JSON to", WORK_DIR / "metrics_minilm_logitsKD.json")


üß™ Evaluating on test‚Ä¶
[Teacher][Test]: {'accuracy': 0.9542760372565622, 'f1_macro': 0.9438450972104819, 'f1_weighted': 0.9544809673164066}
[Student][Test]: {'accuracy': 0.88653683319221, 'f1_macro': 0.8604018912529552, 'f1_weighted': 0.8869459907959556}

===== üîó Alignment (Test) =====
üîπ Logit Cosine Similarity : 0.8091
üîπ Probability Correlation : 0.8103
üîπ Prediction Agreement    : 0.9052

‚úÖ Saved model to /kaggle/working/student_minilm_logitsKD and metrics JSON to /kaggle/working/metrics_minilm_logitsKD.json
