In [None]:
!pip install -U transformers==4.43.3 accelerate==0.33.0 peft==0.11.1 trl==0.9.6 datasets==2.20.0 pandas==2.2.2
!pip install -U bitsandbytes==0.43.1
!pip install -U triton==2.3.0


# Klue -Roberta - small

In [None]:
# -*- coding: utf-8 -*-
import os, random
import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    set_seed,
)
from google.colab import drive
drive.mount('/content/drive')

# ===============================
# 0) ÏãúÎìú Í≥†Ï†ï
# ===============================
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
set_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] device: {device}")

# ===============================
# 1) Îç∞Ïù¥ÌÑ∞ Î°úÎî©
# ===============================
CSV_PATH = r"/content/drive/MyDrive/KDH/dataset/llm_datasets_0817_non_utf8.csv"
df = pd.read_csv(CSV_PATH)

df = df.dropna(subset=["label", "text"])
df = df[df["label"].isin([0, 1])]
df = df.reset_index(drop=True)
df["label"] = df["label"].astype(int)
df["text"]  = df["text"].astype(str)

print("[INFO] Dataset size:", len(df))
print(df["label"].value_counts())

# ===============================
# 2) Îç∞Ïù¥ÌÑ∞ Î∂ÑÌï†
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=SEED
)

# ===============================
# 3) Tokenizer & Dataset
# ===============================
MODEL_NAME = "klue/roberta-small"   # ‚úÖ Roberta-small ÏÇ¨Ïö©
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.model_max_length = 512

def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True)

train_ds = Dataset.from_dict({"text": X_train.tolist(), "labels": y_train.tolist()})
test_ds  = Dataset.from_dict({"text": X_test.tolist(),  "labels": y_test.tolist()})

train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
test_ds  = test_ds.map(tokenize_fn, batched=True, remove_columns=["text"])

train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ===============================
# 4) Î™®Îç∏ Î°úÎî©
# ===============================
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2
).to(device)

# ===============================
# 5) ÌïôÏäµ ÏÑ§Ï†ï
# ===============================
training_args = TrainingArguments(
    output_dir="./roberta-small-classifier",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    warmup_ratio=0.1,
    report_to="none"
)

from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ===============================
# 6) ÌïôÏäµ
# ===============================
trainer.train()

# ===============================
# 7) ÏòàÏ∏° ÌõÑ Confusion Matrix Ï∂úÎ†•
# ===============================
out = trainer.predict(test_ds)
logits = out.predictions
probs = torch.softmax(torch.tensor(logits), dim=1).numpy()
y_true = out.label_ids
y_pred = probs.argmax(axis=-1)

cm = confusion_matrix(y_true, y_pred)
print("=== Confusion Matrix ===")
print(cm)

# ============================
# Ï∂îÎ°† ÏÜçÎèÑ + ÌååÎùºÎØ∏ÌÑ∞ Ïàò + Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ
# -*- coding: utf-8 -*-
import os, random, time, psutil
import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    set_seed,
)
from google.colab import drive
drive.mount('/content/drive')

# ===============================
# 0) ÏãúÎìú Í≥†Ï†ï
# ===============================
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
set_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] device: {device}")

# ===============================
# 1) Îç∞Ïù¥ÌÑ∞ Î°úÎî©
# ===============================
CSV_PATH = r"/content/drive/MyDrive/KDH/dataset/llm_datasets_0817_non_utf8.csv"
df = pd.read_csv(CSV_PATH)

df = df.dropna(subset=["label", "text"])
df = df[df["label"].isin([0, 1])]
df = df.reset_index(drop=True)
df["label"] = df["label"].astype(int)
df["text"]  = df["text"].astype(str)

print("[INFO] Dataset size:", len(df))
print(df["label"].value_counts())

# ===============================
# 2) Îç∞Ïù¥ÌÑ∞ Î∂ÑÌï†
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=SEED
)

# ===============================
# 3) Tokenizer & Dataset
# ===============================
MODEL_NAME = "klue/roberta-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.model_max_length = 512

def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True)

train_ds = Dataset.from_dict({"text": X_train.tolist(), "labels": y_train.tolist()})
test_ds  = Dataset.from_dict({"text": X_test.tolist(),  "labels": y_test.tolist()})

train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
test_ds  = test_ds.map(tokenize_fn, batched=True, remove_columns=["text"])

train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ===============================
# 4) Î™®Îç∏ Î°úÎî©
# ===============================
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2
).to(device)

# ===============================
# 5) ÌïôÏäµ ÏÑ§Ï†ï
# ===============================
training_args = TrainingArguments(
    output_dir="./roberta-small-classifier",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    warmup_ratio=0.1,
    report_to="none"
)

from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ===============================
# 6) ÌïôÏäµ
# ===============================
trainer.train()

# ===============================
# 7) ÏòàÏ∏° ÌõÑ Confusion Matrix Ï∂úÎ†•
# ===============================
out = trainer.predict(test_ds)
logits = out.predictions
probs = torch.softmax(torch.tensor(logits), dim=1).numpy()
y_true = out.label_ids
y_pred = probs.argmax(axis=-1)

cm = confusion_matrix(y_true, y_pred)
print("=== Confusion Matrix ===")
print(cm)

# ===============================
# 8) Ï∂îÍ∞Ä ÏÑ±Îä•/Ìö®Ïú® ÏßÄÌëú
# ===============================
# ===============================
# 8) Ï∂îÍ∞Ä ÏÑ±Îä•/Ìö®Ïú® ÏßÄÌëú  (ÍµêÏ≤¥Î≥∏)
# ===============================
from torch.utils.data import DataLoader

# (1) Ï∂îÎ°† ÏÜçÎèÑ Ï∏°Ï†ï ‚Äî DataLoader + data_collator ÏÇ¨Ïö©(Ìå®Îî©/ÌÉÄÏûÖ ÏïàÏ†Ñ)
eval_loader = DataLoader(test_ds, batch_size=32, collate_fn=data_collator)

batch = next(iter(eval_loader))              # {'input_ids': [B,L], 'attention_mask': [B,L], 'labels': [B]}
batch.pop("labels", None)                    # ÎùºÎ≤® Ï†úÍ±∞
batch = {k: v.to(device) for k, v in batch.items()}  # ÎîîÎ∞îÏù¥Ïä§ Ïù¥Îèô

model.eval()
with torch.no_grad():
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.reset_peak_memory_stats()  # ‚¨ÖÔ∏è Ï∂îÎ°† ÌîºÌÅ¨Î©îÎ™®Î¶¨Îßå Î≥¥Î†§Î©¥ Î¶¨ÏÖã
    t0 = time.time()
    _ = model(**batch)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t1 = time.time()

B = batch["input_ids"].size(0)
latency = (t1 - t0) / B
throughput = B / (t1 - t0)

print(f"\n=== Ï∂îÎ°† ÏÜçÎèÑ ===")
print(f"ÌèâÍ∑† Latency: {latency*1000:.2f} ms/ÏÉòÌîå")
print(f"Throughput : {throughput:.2f} ÏÉòÌîå/Ï¥à")

# (2) ÌååÎùºÎØ∏ÌÑ∞ Ïàò
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\n=== ÌååÎùºÎØ∏ÌÑ∞ Ïàò ===")
print(f"Ï†ÑÏ≤¥ ÌååÎùºÎØ∏ÌÑ∞ Ïàò: {total_params/1e6:.2f} M")
print(f"ÌïôÏäµÍ∞ÄÎä• ÌååÎùºÎØ∏ÌÑ∞ Ïàò: {trainable_params/1e6:.2f} M")

# (3) Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ
print(f"\n=== Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ ===")
if torch.cuda.is_available():
    allocated = torch.cuda.max_memory_allocated(device) / (1024**2)
    reserved  = torch.cuda.max_memory_reserved(device)  / (1024**2)
    print(f"GPU Î©îÎ™®Î¶¨ (Ìï†Îãπ): {allocated:.2f} MB")
    print(f"GPU Î©îÎ™®Î¶¨ (ÏòàÏïΩ): {reserved:.2f} MB")
else:
    print(f"CPU Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ: {psutil.Process(os.getpid()).memory_info().rss/(1024**2):.2f} MB")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[INFO] device: cuda
[INFO] Dataset size: 1033
label
1    702
0    331
Name: count, dtype: int64


Map:   0%|          | 0/826 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2353,0.18216,0.917874,0.939502,0.942857,0.93617
2,0.2357,0.250313,0.937198,0.953737,0.957143,0.950355
3,0.0898,0.14999,0.961353,0.971831,0.965035,0.978723
4,0.0153,0.299058,0.937198,0.953737,0.957143,0.950355
5,0.0005,0.308642,0.942029,0.957143,0.964029,0.950355


=== Confusion Matrix ===
[[ 61   5]
 [  3 138]]
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[INFO] device: cuda
[INFO] Dataset size: 1033
label
1    702
0    331
Name: count, dtype: int64


Map:   0%|          | 0/826 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2353,0.18216,0.917874,0.939502,0.942857,0.93617
2,0.2357,0.250313,0.937198,0.953737,0.957143,0.950355
3,0.0898,0.14999,0.961353,0.971831,0.965035,0.978723
4,0.0153,0.299058,0.937198,0.953737,0.957143,0.950355
5,0.0005,0.308642,0.942029,0.957143,0.964029,0.950355


=== Confusion Matrix ===
[[ 61   5]
 [  3 138]]

=== Ï∂îÎ°† ÏÜçÎèÑ ===
ÌèâÍ∑† Latency: 0.40 ms/ÏÉòÌîå
Throughput : 2499.26 ÏÉòÌîå/Ï¥à

=== ÌååÎùºÎØ∏ÌÑ∞ Ïàò ===
Ï†ÑÏ≤¥ ÌååÎùºÎØ∏ÌÑ∞ Ïàò: 68.09 M
ÌïôÏäµÍ∞ÄÎä• ÌååÎùºÎØ∏ÌÑ∞ Ïàò: 68.09 M

=== Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ ===
GPU Î©îÎ™®Î¶¨ (Ìï†Îãπ): 1152.30 MB
GPU Î©îÎ™®Î¶¨ (ÏòàÏïΩ): 1874.00 MB


# Klue-Roberta-small + base - Knowledge Distillation

In [None]:
# -*- coding: utf-8 -*-
import os, random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
)
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    set_seed,
)
from google.colab import drive
drive.mount('/content/drive')

# ===============================
# 0) ÏãúÎìú Í≥†Ï†ï
# ===============================
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] device: {device}")

# ===============================
# 1) Îç∞Ïù¥ÌÑ∞ Î°úÎî©
# ===============================
CSV_PATH = r"/content/drive/MyDrive/KDH/dataset/llm_datasets_0817_non_utf8.csv"
df = pd.read_csv(CSV_PATH)

df = df.dropna(subset=["label", "text"]).copy()
df = df[df["label"].isin([0, 1])].copy()
df["label"] = df["label"].astype(np.int64)   # torch.long

df["text"]  = df["text"].astype(str)
df = df.reset_index(drop=True)

print("[INFO] unique labels:", sorted(df["label"].unique()))

# ===============================
# 2) Îç∞Ïù¥ÌÑ∞ Î∂ÑÌï†
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=SEED
)

train_ds = Dataset.from_dict({
    "text": X_train.tolist(),
    "labels": y_train.astype(np.int64).tolist()
})

test_ds  = Dataset.from_dict({"text": X_test.tolist(),
                              "labels": y_test.astype(np.int64).tolist()})

# ===============================
# 3) Tokenizer (ÌïôÏÉù Í∏∞Ï§Ä)
# ===============================
teacher_model_name = "klue/roberta-base"
student_model_name = "klue/roberta-small"

# ‚úÖ Roberta tokenizer ÌïòÎÇòÎßå ÏÇ¨Ïö©
tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)

def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

train_enc = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
test_enc  = test_ds.map(tokenize_fn, batched=True, remove_columns=["text"])

train_enc.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_enc.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# ‚úÖ collatorÎèÑ Í∞ôÏùÄ tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# ===============================
# 4) Î™®Îç∏ Î°úÎî©
# ===============================
teacher = AutoModelForSequenceClassification.from_pretrained(
    teacher_model_name, num_labels=2
).to(device)

student = AutoModelForSequenceClassification.from_pretrained(
    student_model_name, num_labels=2
).to(device)

teacher.eval()  # ÍµêÏÇ¨Îäî Í≥†Ï†ï(freeze)

# ===============================
# 5) Distillation Trainer Ï†ïÏùò
# ===============================
class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, temperature=2.0, alpha=0.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model.eval()
        self.temperature = temperature
        self.alpha = alpha
        self.ce_loss = nn.CrossEntropyLoss()

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs["labels"].view(-1).to(torch.long)


        # ÎùºÎ≤® shape/dtype Î≥¥Ï†ï
        labels = labels.view(-1).to(torch.long)

        student_inputs = {k: v for k, v in inputs.items() if k != "labels"}
        outputs_student = model(**student_inputs)
        student_logits = (
            outputs_student.logits if hasattr(outputs_student, "logits") else outputs_student[0]
        )
        loss_ce = self.ce_loss(student_logits, labels)

        # üëâ ÍµêÏÇ¨ ÏûÖÎ†• ÎßåÎì§Í∏∞
        teacher_inputs = {k: v for k, v in inputs.items() if k != "labels"}
        with torch.no_grad():
            outputs_teacher = self.teacher(**teacher_inputs)
            teacher_logits = (
                outputs_teacher.logits if hasattr(outputs_teacher, "logits") else outputs_teacher[0]
            )


        # NaN/Inf guard
        if torch.isnan(student_logits).any() or torch.isnan(teacher_logits).any():
            raise FloatingPointError("NaN detected in logits")

        # KD loss
        T = self.temperature
        loss_kd = F.kl_div(
            F.log_softmax(student_logits / T, dim=-1),
            F.softmax(teacher_logits / T, dim=-1),
            reduction="batchmean"
        ) * (T * T)

        loss = self.alpha * loss_ce + (1 - self.alpha) * loss_kd
        return (loss, {"logits": student_logits}) if return_outputs else loss




# ===============================
# 6) ÌïôÏäµ ÏÑ§Ï†ï
# ===============================
training_args = TrainingArguments(
    output_dir="./distilled-klue/roberta-small",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    fp16=False,              # ColabÏóêÏÑú ÏïàÏ†ïÏÑ± ÏúÑÌï¥ ÎÅî
    warmup_ratio=0.1,
    report_to="none",
    save_safetensors=False   # ‚úÖ contiguous ÏóêÎü¨ Î∞©ÏßÄ
)

# ===============================
# 7) ÌèâÍ∞ÄÏßÄÌëú
# ===============================
def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    labels = eval_pred.label_ids

    # ‚úÖ predictionsÍ∞Ä tupleÏùº Îïå Ï≤òÎ¶¨
    if isinstance(logits, tuple):
        if len(logits) > 0:
            logits = logits[0]
        else:
            raise ValueError("logits is an empty tuple!")

    preds = logits.argmax(axis=-1)
    p, r, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": p, "recall": r, "f1": f1}




# ===============================
# 8) Trainer Ïã§Ìñâ
# ===============================
trainer = DistillationTrainer(
    model=student,
    teacher_model=teacher,
    args=training_args,
    train_dataset=train_enc,
    eval_dataset=test_enc,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# ===============================
# 9) Confusion Matrix Ï∂úÎ†•
# ===============================
out = trainer.predict(test_enc)
logits = out.predictions
probs = torch.softmax(torch.from_numpy(logits), dim=1).numpy()
y_true = out.label_ids
y_pred = probs.argmax(axis=-1)

cm = confusion_matrix(y_true, y_pred)
print("=== Confusion Matrix ===")
print(cm)

print("=== Classification Report ===")
print(classification_report(y_true, y_pred, digits=4))

# ===============================
# 10) Ï∂îÍ∞Ä ÏÑ±Îä•/Ìö®Ïú® ÏßÄÌëú  (ÏàòÏ†ïÎ≥∏)
# ===============================
import time, psutil
from torch.utils.data import DataLoader

# ‚úÖ Î∞òÎìúÏãú ÌÜ†ÌÅ¨ÎÇòÏù¥Ï¶àÎêú ÌèâÍ∞ÄÏÖã ÏÇ¨Ïö©
eval_loader = DataLoader(test_enc, batch_size=32, collate_fn=data_collator)

# Î∞∞Ïπò ÌïòÎÇò Í∞ÄÏ†∏Ïò§Í∏∞
batch = next(iter(eval_loader))  # dict_keys(['input_ids','attention_mask','labels'])
# ÎùºÎ≤® Ï†úÍ±∞ + ÎîîÎ∞îÏù¥Ïä§ Ïù¥Îèô
batch_no_label = {k: v.to(device) for k, v in batch.items() if k != "labels"}

# ‚úÖ Î≥ÄÏàòÎ™Ö ÏùºÏπò: student Î™®Îç∏Î°ú Ï∂îÎ°†
student.eval()
with torch.no_grad():
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.reset_peak_memory_stats()  # ÌîºÌÅ¨ Î©îÎ™®Î¶¨ Î¶¨ÏÖã
    t0 = time.time()
    _ = student(**batch_no_label)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t1 = time.time()

B = batch_no_label["input_ids"].size(0)
latency = (t1 - t0) / B
throughput = B / (t1 - t0)

print(f"\n=== Ï∂îÎ°† ÏÜçÎèÑ ===")
print(f"ÌèâÍ∑† Latency: {latency*1000:.2f} ms/ÏÉòÌîå")
print(f"Throughput : {throughput:.2f} ÏÉòÌîå/Ï¥à")

# (2) ÌååÎùºÎØ∏ÌÑ∞ Ïàò ‚Äî student Í∏∞Ï§Ä
total_params = sum(p.numel() for p in student.parameters())
trainable_params = sum(p.numel() for p in student.parameters() if p.requires_grad)
print(f"\n=== ÌååÎùºÎØ∏ÌÑ∞ Ïàò ===")
print(f"Ï†ÑÏ≤¥ ÌååÎùºÎØ∏ÌÑ∞ Ïàò: {total_params/1e6:.2f} M")
print(f"ÌïôÏäµÍ∞ÄÎä• ÌååÎùºÎØ∏ÌÑ∞ Ïàò: {trainable_params/1e6:.2f} M")

# (3) Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ
print(f"\n=== Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ ===")
if torch.cuda.is_available():
    allocated = torch.cuda.max_memory_allocated(device) / (1024**2)
    reserved  = torch.cuda.max_memory_reserved(device)  / (1024**2)
    print(f"GPU Î©îÎ™®Î¶¨ (Ìï†Îãπ): {allocated:.2f} MB")
    print(f"GPU Î©îÎ™®Î¶¨ (ÏòàÏïΩ): {reserved:.2f} MB")
else:
    print(f"CPU Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ: {psutil.Process(os.getpid()).memory_info().rss/(1024**2):.2f} MB")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[INFO] device: cuda
[INFO] unique labels: [0, 1]


Map:   0%|          | 0/826 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.245622,0.927536,0.925676,0.971631,0.948097
2,No log,0.233949,0.937198,0.950704,0.957447,0.954064
3,No log,0.230061,0.961353,0.958621,0.985816,0.972028
4,No log,0.234607,0.932367,0.937931,0.964539,0.951049
5,No log,0.2338,0.94686,0.985075,0.93617,0.96


=== Confusion Matrix ===
[[ 60   6]
 [  2 139]]
=== Classification Report ===
              precision    recall  f1-score   support

           0     0.9677    0.9091    0.9375        66
           1     0.9586    0.9858    0.9720       141

    accuracy                         0.9614       207
   macro avg     0.9632    0.9475    0.9548       207
weighted avg     0.9615    0.9614    0.9610       207


=== Ï∂îÎ°† ÏÜçÎèÑ ===
ÌèâÍ∑† Latency: 0.28 ms/ÏÉòÌîå
Throughput : 3578.28 ÏÉòÌîå/Ï¥à

=== ÌååÎùºÎØ∏ÌÑ∞ Ïàò ===
Ï†ÑÏ≤¥ ÌååÎùºÎØ∏ÌÑ∞ Ïàò: 68.09 M
ÌïôÏäµÍ∞ÄÎä• ÌååÎùºÎØ∏ÌÑ∞ Ïàò: 68.09 M

=== Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ ===
GPU Î©îÎ™®Î¶¨ (Ìï†Îãπ): 1698.07 MB
GPU Î©îÎ™®Î¶¨ (ÏòàÏïΩ): 2400.00 MB


# torch Pruning

In [None]:
# -*- coding: utf-8 -*-
import os, random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.prune as prune

from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
)
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    set_seed,
)
from google.colab import drive
drive.mount('/content/drive')

# ===============================
# 0) ÏãúÎìú Í≥†Ï†ï
# ===============================
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] device: {device}")

# ===============================
# 1) Îç∞Ïù¥ÌÑ∞ Î°úÎî©
# ===============================
CSV_PATH = r"/content/drive/MyDrive/KDH/dataset/llm_datasets_0817_non_utf8.csv"
df = pd.read_csv(CSV_PATH)

df = df.dropna(subset=["label", "text"]).copy()
df = df[df["label"].isin([0, 1])].copy()
df["label"] = df["label"].astype(np.int64)   # torch.long
df["text"]  = df["text"].astype(str)
df = df.reset_index(drop=True)

print("[INFO] unique labels:", sorted(df["label"].unique()))

# ===============================
# 2) Îç∞Ïù¥ÌÑ∞ Î∂ÑÌï†
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=SEED
)

train_ds = Dataset.from_dict({
    "text": X_train.tolist(),
    "labels": y_train.astype(np.int64).tolist()
})

test_ds  = Dataset.from_dict({
    "text": X_test.tolist(),
    "labels": y_test.astype(np.int64).tolist()
})

# ===============================
# 3) Tokenizer
# ===============================
teacher_model_name = "klue/roberta-base"
student_model_name = "klue/roberta-small"

tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)

def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

train_enc = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
test_enc  = test_ds.map(tokenize_fn, batched=True, remove_columns=["text"])

train_enc.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_enc.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ===============================
# 4) Î™®Îç∏ Î°úÎî©
# ===============================
teacher = AutoModelForSequenceClassification.from_pretrained(
    teacher_model_name, num_labels=2
).to(device)

student = AutoModelForSequenceClassification.from_pretrained(
    student_model_name, num_labels=2
).to(device)

teacher.eval()

# ===============================
# 5) Distillation Trainer
# ===============================
class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, temperature=2.0, alpha=0.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model.eval()
        self.temperature = temperature
        self.alpha = alpha
        self.ce_loss = nn.CrossEntropyLoss()

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs["labels"].view(-1).to(torch.long)

        student_inputs = {k: v for k, v in inputs.items() if k != "labels"}
        outputs_student = model(**student_inputs)

        # ‚úÖ logits ÏïàÏ†Ñ Ï≤òÎ¶¨
        student_logits = outputs_student.logits if hasattr(outputs_student, "logits") else outputs_student[0]

        loss_ce = self.ce_loss(student_logits, labels)

        # teacher logits
        with torch.no_grad():
            outputs_teacher = self.teacher(**student_inputs)
            teacher_logits = outputs_teacher.logits if hasattr(outputs_teacher, "logits") else outputs_teacher[0]

        # KD loss
        T = self.temperature
        loss_kd = F.kl_div(
            F.log_softmax(student_logits / T, dim=-1),
            F.softmax(teacher_logits / T, dim=-1),
            reduction="batchmean"
        ) * (T * T)

        loss = self.alpha * loss_ce + (1 - self.alpha) * loss_kd
        return (loss, {"logits": student_logits}) if return_outputs else loss

# ===============================
# 6) ÌïôÏäµ ÏÑ§Ï†ï
# ===============================
training_args = TrainingArguments(
    output_dir="./distilled-klue/roberta-small",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    fp16=False,
    warmup_ratio=0.1,
    report_to="none",
    save_safetensors=False
)

# ===============================
# 7) ÌèâÍ∞ÄÏßÄÌëú
# ===============================
def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    labels = eval_pred.label_ids

    if isinstance(logits, tuple):
        logits = logits[0]

    preds = logits.argmax(axis=-1)
    p, r, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": p, "recall": r, "f1": f1}

# ===============================
# 8) KD ÌïôÏäµ
# ===============================
trainer = DistillationTrainer(
    model=student,
    teacher_model=teacher,
    args=training_args,
    train_dataset=train_enc,
    eval_dataset=test_enc,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# ===============================
# 9) Pruning Ï†ÅÏö©
# ===============================
print("\n[INFO] Applying pruning ...")

for name, module in student.named_modules():
    if isinstance(module, torch.nn.Linear):
        prune.l1_unstructured(module, name="weight", amount=0.5)

# Sparsity ÌôïÏù∏
for name, module in student.named_modules():
    if isinstance(module, torch.nn.Linear) and hasattr(module, "weight_mask"):
        sparsity = 100. * float(torch.sum(module.weight == 0)) / module.weight.nelement()
        print(f"Layer {name} | Sparsity: {sparsity:.2f}%")

# (ÏÑ†ÌÉù) mask Î≥ëÌï©
for name, module in student.named_modules():
    if isinstance(module, torch.nn.Linear):
        prune.remove(module, "weight")

# ===============================
# 10) ÌèâÍ∞Ä
# ===============================
out = trainer.predict(test_enc)
logits = out.predictions
probs = torch.softmax(torch.from_numpy(logits), dim=1).numpy()
y_true = out.label_ids
y_pred = probs.argmax(axis=-1)

cm = confusion_matrix(y_true, y_pred)
print("=== Confusion Matrix ===")
print(cm)

print("=== Classification Report ===")
print(classification_report(y_true, y_pred, digits=4))

# ===============================
# 11) Ï†ÄÏû•
# ===============================
student.save_pretrained("./distilled-roberta-pruned")
tokenizer.save_pretrained("./distilled-roberta-pruned")

# 10) Ï∂îÍ∞Ä ÏÑ±Îä•/Ìö®Ïú® ÏßÄÌëú  (ÏàòÏ†ïÎ≥∏)
# ===============================
import time, psutil
from torch.utils.data import DataLoader

# ‚úÖ Î∞òÎìúÏãú ÌÜ†ÌÅ¨ÎÇòÏù¥Ï¶àÎêú ÌèâÍ∞ÄÏÖã ÏÇ¨Ïö©
eval_loader = DataLoader(test_enc, batch_size=32, collate_fn=data_collator)

# Î∞∞Ïπò ÌïòÎÇò Í∞ÄÏ†∏Ïò§Í∏∞
batch = next(iter(eval_loader))  # dict_keys(['input_ids','attention_mask','labels'])
# ÎùºÎ≤® Ï†úÍ±∞ + ÎîîÎ∞îÏù¥Ïä§ Ïù¥Îèô
batch_no_label = {k: v.to(device) for k, v in batch.items() if k != "labels"}

# ‚úÖ Î≥ÄÏàòÎ™Ö ÏùºÏπò: student Î™®Îç∏Î°ú Ï∂îÎ°†
student.eval()
with torch.no_grad():
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.reset_peak_memory_stats()  # ÌîºÌÅ¨ Î©îÎ™®Î¶¨ Î¶¨ÏÖã
    t0 = time.time()
    _ = student(**batch_no_label)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t1 = time.time()

B = batch_no_label["input_ids"].size(0)
latency = (t1 - t0) / B
throughput = B / (t1 - t0)

print(f"\n=== Ï∂îÎ°† ÏÜçÎèÑ ===")
print(f"ÌèâÍ∑† Latency: {latency*1000:.2f} ms/ÏÉòÌîå")
print(f"Throughput : {throughput:.2f} ÏÉòÌîå/Ï¥à")

# (2) ÌååÎùºÎØ∏ÌÑ∞ Ïàò ‚Äî student Í∏∞Ï§Ä
total_params = sum(p.numel() for p in student.parameters())
trainable_params = sum(p.numel() for p in student.parameters() if p.requires_grad)
print(f"\n=== ÌååÎùºÎØ∏ÌÑ∞ Ïàò ===")
print(f"Ï†ÑÏ≤¥ ÌååÎùºÎØ∏ÌÑ∞ Ïàò: {total_params/1e6:.2f} M")
print(f"ÌïôÏäµÍ∞ÄÎä• ÌååÎùºÎØ∏ÌÑ∞ Ïàò: {trainable_params/1e6:.2f} M")

# (3) Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ
print(f"\n=== Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ ===")
if torch.cuda.is_available():
    allocated = torch.cuda.max_memory_allocated(device) / (1024**2)
    reserved  = torch.cuda.max_memory_reserved(device)  / (1024**2)
    print(f"GPU Î©îÎ™®Î¶¨ (Ìï†Îãπ): {allocated:.2f} MB")
    print(f"GPU Î©îÎ™®Î¶¨ (ÏòàÏïΩ): {reserved:.2f} MB")
else:
    print(f"CPU Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ: {psutil.Process(os.getpid()).memory_info().rss/(1024**2):.2f} MB")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[INFO] device: cuda
[INFO] unique labels: [0, 1]


Map:   0%|          | 0/826 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.23921,0.951691,0.939597,0.992908,0.965517
2,No log,0.229788,0.956522,0.945946,0.992908,0.968858
3,No log,0.229716,0.961353,0.971631,0.971631,0.971631



[INFO] Applying pruning ...
Layer roberta.encoder.layer.0.attention.self.query | Sparsity: 50.00%
Layer roberta.encoder.layer.0.attention.self.key | Sparsity: 50.00%
Layer roberta.encoder.layer.0.attention.self.value | Sparsity: 50.00%
Layer roberta.encoder.layer.0.attention.output.dense | Sparsity: 50.00%
Layer roberta.encoder.layer.0.intermediate.dense | Sparsity: 50.00%
Layer roberta.encoder.layer.0.output.dense | Sparsity: 50.00%
Layer roberta.encoder.layer.1.attention.self.query | Sparsity: 50.00%
Layer roberta.encoder.layer.1.attention.self.key | Sparsity: 50.00%
Layer roberta.encoder.layer.1.attention.self.value | Sparsity: 50.00%
Layer roberta.encoder.layer.1.attention.output.dense | Sparsity: 50.00%
Layer roberta.encoder.layer.1.intermediate.dense | Sparsity: 50.00%
Layer roberta.encoder.layer.1.output.dense | Sparsity: 50.00%
Layer roberta.encoder.layer.2.attention.self.query | Sparsity: 50.00%
Layer roberta.encoder.layer.2.attention.self.key | Sparsity: 50.00%
Layer roberta

=== Confusion Matrix ===
[[ 62   4]
 [  7 134]]
=== Classification Report ===
              precision    recall  f1-score   support

           0     0.8986    0.9394    0.9185        66
           1     0.9710    0.9504    0.9606       141

    accuracy                         0.9469       207
   macro avg     0.9348    0.9449    0.9395       207
weighted avg     0.9479    0.9469    0.9472       207


=== Ï∂îÎ°† ÏÜçÎèÑ ===
ÌèâÍ∑† Latency: 0.32 ms/ÏÉòÌîå
Throughput : 3167.08 ÏÉòÌîå/Ï¥à

=== ÌååÎùºÎØ∏ÌÑ∞ Ïàò ===
Ï†ÑÏ≤¥ ÌååÎùºÎØ∏ÌÑ∞ Ïàò: 68.09 M
ÌïôÏäµÍ∞ÄÎä• ÌååÎùºÎØ∏ÌÑ∞ Ïàò: 68.09 M

=== Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ ===
GPU Î©îÎ™®Î¶¨ (Ìï†Îãπ): 1866.37 MB
GPU Î©îÎ™®Î¶¨ (ÏòàÏïΩ): 2422.00 MB


# QLoRA + KD

In [None]:
# -*- coding: utf-8 -*-
import os, random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
)

from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    set_seed,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# ===============================
# 0) ÏãúÎìú Í≥†Ï†ï
# ===============================
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] device: {device}")

# ===============================
# 1) Îç∞Ïù¥ÌÑ∞ Î°úÎî©
# ===============================
CSV_PATH = r"/home/piai/PoscoAIproject/dataset/llm_datasets_0817_non_utf8.csv"
df = pd.read_csv(CSV_PATH)

df = df.dropna(subset=["label", "text"]).copy()
df = df[df["label"].isin([0, 1])].copy()
df["label"] = df["label"].astype(np.int64)
df["text"]  = df["text"].astype(str)
df = df.reset_index(drop=True)
print("[INFO] unique labels:", sorted(df["label"].unique()))

# ===============================
# 2) Îç∞Ïù¥ÌÑ∞ Î∂ÑÌï†
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=SEED
)

train_ds = Dataset.from_dict({
    "text": X_train.tolist(),
    "labels": y_train.astype(np.int64).tolist()
})
test_ds  = Dataset.from_dict({
    "text": X_test.tolist(),
    "labels": y_test.astype(np.int64).tolist()
})

# ===============================
# 3) Tokenizer
# ===============================
teacher_model_name = "klue/roberta-base"
student_model_name = "klue/roberta-small"

tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)

def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

train_enc = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
test_enc  = test_ds.map(tokenize_fn, batched=True, remove_columns=["text"])

train_enc.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_enc.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ===============================
# 4) Î™®Îç∏ Î°úÎî© (Teacher / Student QLoRA)
# ===============================
# Teacher (float32 Ïú†ÏßÄ)
teacher = AutoModelForSequenceClassification.from_pretrained(
    teacher_model_name, num_labels=2,
    torch_dtype=torch.float32,
    device_map="auto"
)
teacher.eval()

# Student (BaseÎäî 4bit, HeadÎäî float)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# 1) Base encoderÎßå 4bitÎ°ú Î°úÎìú
base_model = AutoModel.from_pretrained(
    student_model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# 2) Classifier headÎäî floatÎ°ú Î°úÎìú
student = AutoModelForSequenceClassification.from_pretrained(
    student_model_name,
    quantization_config=None,   # classifierÎäî ÏñëÏûêÌôî Ïïà Ìï®
    num_labels=2,
    device_map="auto"
)

# 3) Base encoder ÍµêÏ≤¥ + classifierÎäî float Î≥¥Ïû•
student.roberta = base_model
student.classifier = student.classifier.to(torch.float32)

# --- ÌïµÏã¨: Ìï≠ÏÉÅ dict Ï∂úÎ†• Í∞ïÏ†ú ---
if hasattr(teacher, "config"): teacher.config.return_dict = True
if hasattr(student, "config"): student.config.return_dict = True
if hasattr(base_model, "config"): base_model.config.return_dict = True

# 4) QLoRA Ï§ÄÎπÑ
student.gradient_checkpointing_enable()
student.config.use_cache = False
student = prepare_model_for_kbit_training(student)

# 5) LoRA config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "key", "value", "dense"],  # RoBERTa Íµ¨Ï°∞ Í∏∞Ï§Ä
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)
student = get_peft_model(student, lora_config)

# ===============================
# 5) Distillation Trainer Ï†ïÏùò
# ===============================
def _get_logits(outputs):
    # ModelOutput(dict-Ïú†ÏÇ¨) ÎòêÎäî tuple Î™®Îëê ÏïàÏ†Ñ Ï≤òÎ¶¨
    if hasattr(outputs, "logits"):
        return outputs.logits
    if isinstance(outputs, (tuple, list)):
        return outputs[0]
    raise TypeError("Model outputs do not contain logits.")

class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, temperature=2.0, alpha=0.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model.eval()
        self.temperature = temperature
        self.alpha = alpha
        self.ce_loss = nn.CrossEntropyLoss()

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs["labels"].view(-1).to(torch.long)

        # Student forward (dict Í∞ïÏ†ú + ÏïàÏ†Ñ Ï∂îÏ∂ú)
        student_inputs = {k: v for k, v in inputs.items() if k != "labels"}
        outputs_student = model(**student_inputs, return_dict=True)
        student_logits = _get_logits(outputs_student).to(torch.float32)
        loss_ce = self.ce_loss(student_logits, labels)

        # Teacher forward (dict Í∞ïÏ†ú + ÏïàÏ†Ñ Ï∂îÏ∂ú)
        teacher_device = next(self.teacher.parameters()).device
        teacher_inputs = {k: v.to(teacher_device) for k, v in student_inputs.items()}
        with torch.no_grad():
            outputs_teacher = self.teacher(**teacher_inputs, return_dict=True)
            teacher_logits = _get_logits(outputs_teacher).to(torch.float32)

        # KD loss
        T = self.temperature
        loss_kd = F.kl_div(
            F.log_softmax(student_logits / T, dim=-1),
            F.softmax(teacher_logits / T, dim=-1),
            reduction="batchmean"
        ) * (T * T)

        loss = self.alpha * loss_ce + (1 - self.alpha) * loss_kd
        return (loss, {"logits": student_logits}) if return_outputs else loss

# ===============================
# 6) ÌïôÏäµ ÏÑ§Ï†ï
# ===============================
training_args = TrainingArguments(
    output_dir="./qlora-distilled-roberta-small",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # <- Î≥¥ÌÜµ 'f1'Î°ú Îë°ÎãàÎã§
    greater_is_better=True,
    save_total_limit=2,
    fp16=True,
    gradient_checkpointing=True,
    warmup_ratio=0.1,
    report_to="none",
    save_safetensors=False
)

# ===============================
# 7) ÌèâÍ∞ÄÏßÄÌëú
# ===============================
def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    labels = eval_pred.label_ids
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    preds = logits.argmax(axis=-1)
    p, r, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1, zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": p, "recall": r, "f1": f1}

# ===============================
# 8) Trainer Ïã§Ìñâ
# ===============================
trainer = DistillationTrainer(
    model=student,
    teacher_model=teacher,
    args=training_args,
    train_dataset=train_enc,
    eval_dataset=test_enc,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# ÌïôÏäµ ÎÅùÎÇú Îí§
trainer.model.save_pretrained("/home/piai/PoscoAIproject/qlora-distilled-roberta-small")
tokenizer.save_pretrained("/home/piai/PoscoAIproject/qlora-distilled-roberta-small")

# ===============================
# 9) Confusion Matrix Ï∂úÎ†•
# ===============================
out = trainer.predict(test_enc)
logits = out.predictions if not isinstance(out.predictions, (tuple, list)) else out.predictions[0]
probs = torch.softmax(torch.from_numpy(logits), dim=1).numpy()
y_true = out.label_ids
y_pred = probs.argmax(axis=-1)

cm = confusion_matrix(y_true, y_pred)
print("=== Confusion Matrix ===")
print(cm)
print("=== Classification Report ===")
print(classification_report(y_true, y_pred, digits=4))

# 10) Ï∂îÍ∞Ä ÏÑ±Îä•/Ìö®Ïú® ÏßÄÌëú  (ÏàòÏ†ïÎ≥∏)
# ===============================
import time, psutil
from torch.utils.data import DataLoader

# ‚úÖ Î∞òÎìúÏãú ÌÜ†ÌÅ¨ÎÇòÏù¥Ï¶àÎêú ÌèâÍ∞ÄÏÖã ÏÇ¨Ïö©
eval_loader = DataLoader(test_enc, batch_size=32, collate_fn=data_collator)

# Î∞∞Ïπò ÌïòÎÇò Í∞ÄÏ†∏Ïò§Í∏∞
batch = next(iter(eval_loader))  # dict_keys(['input_ids','attention_mask','labels'])
# ÎùºÎ≤® Ï†úÍ±∞ + ÎîîÎ∞îÏù¥Ïä§ Ïù¥Îèô
batch_no_label = {k: v.to(device) for k, v in batch.items() if k != "labels"}

# ‚úÖ Î≥ÄÏàòÎ™Ö ÏùºÏπò: student Î™®Îç∏Î°ú Ï∂îÎ°†
student.eval()
with torch.no_grad():
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.reset_peak_memory_stats()  # ÌîºÌÅ¨ Î©îÎ™®Î¶¨ Î¶¨ÏÖã
    t0 = time.time()
    _ = student(**batch_no_label)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t1 = time.time()

B = batch_no_label["input_ids"].size(0)
latency = (t1 - t0) / B
throughput = B / (t1 - t0)

print(f"\n=== Ï∂îÎ°† ÏÜçÎèÑ ===")
print(f"ÌèâÍ∑† Latency: {latency*1000:.2f} ms/ÏÉòÌîå")
print(f"Throughput : {throughput:.2f} ÏÉòÌîå/Ï¥à")

# (2) ÌååÎùºÎØ∏ÌÑ∞ Ïàò ‚Äî student Í∏∞Ï§Ä
total_params = sum(p.numel() for p in student.parameters())
trainable_params = sum(p.numel() for p in student.parameters() if p.requires_grad)
print(f"\n=== ÌååÎùºÎØ∏ÌÑ∞ Ïàò ===")
print(f"Ï†ÑÏ≤¥ ÌååÎùºÎØ∏ÌÑ∞ Ïàò: {total_params/1e6:.2f} M")
print(f"ÌïôÏäµÍ∞ÄÎä• ÌååÎùºÎØ∏ÌÑ∞ Ïàò: {trainable_params/1e6:.2f} M")

# (3) Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ
print(f"\n=== Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ ===")
if torch.cuda.is_available():
    allocated = torch.cuda.max_memory_allocated(device) / (1024**2)
    reserved  = torch.cuda.max_memory_reserved(device)  / (1024**2)
    print(f"GPU Î©îÎ™®Î¶¨ (Ìï†Îãπ): {allocated:.2f} MB")
    print(f"GPU Î©îÎ™®Î¶¨ (ÏòàÏïΩ): {reserved:.2f} MB")
else:
    print(f"CPU Î©îÎ™®Î¶¨ ÏÇ¨Ïö©Îüâ: {psutil.Process(os.getpid()).memory_info().rss/(1024**2):.2f} MB")



[INFO] device: cuda


FileNotFoundError: [Errno 2] No such file or directory: '/home/piai/PoscoAIproject/dataset/llm_datasets_0817_non_utf8.csv'