# Setup

In [1]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from pathlib import Path

# Path to the saved checkpoint
checkpoint_path = "./results/roberta/checkpoint-9900/"

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

# Load the teacher model from the checkpoint directory
teacher_model = RobertaForSequenceClassification.from_pretrained(checkpoint_path)

# Load the student model
student_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Load data

In [6]:
export_dir = Path(os.getcwd())
data_path = Path(export_dir, "data")
train_file = Path(data_path, 'train_data_only_text_and_labels.csv')
eval_file = Path(data_path, 'eval_data_only_text_and_labels.csv')
test_file = Path(data_path, 'test_data_only_text_and_labels.csv')


In [7]:
train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)
test_df = pd.read_csv(test_file)

# Convert to Dataset
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)
test_dataset = Dataset.from_pandas(test_df)


In [9]:

# Tokenize and prepare datasets
def tokenize_and_encode(df):
    encodings = tokenizer(list(df['text_combined']), truncation=True, padding=True, return_tensors='pt')
    labels = torch.tensor(df['label'].values)
    return encodings, labels

train_encodings, train_labels = tokenize_and_encode(train_df)
eval_encodings, eval_labels = tokenize_and_encode(eval_df)
test_encodings, test_labels = tokenize_and_encode(test_df)

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, train_labels)
eval_dataset = CustomDataset(eval_encodings, eval_labels)
test_dataset = CustomDataset(test_encodings, test_labels)


## Distillation Training

In [10]:
training_args = TrainingArguments(
    output_dir="./results/roberta/fine_tune/distilled_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./results/roberta/fine_tune/distilled_model/logs",
    load_best_model_at_end=True,
)

def compute_metrics(p):
    preds = torch.tensor(np.argmax(p.predictions, axis=1))
    labels = torch.tensor(p.label_ids)

    # Calculate metrics
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    
    # Calculate confusion matrix
    conf_matrix = confusion_matrix(labels, preds)
    
    # Extract TP, FP, TN, FN from confusion matrix
    TN, FP, FN, TP = conf_matrix.ravel() if conf_matrix.size == 4 else (0, 0, 0, 0)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'TP': TP,
        'FP': FP,
        'TN': TN,
        'FN': FN
    }

trainer = Trainer(
    model=student_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Tp,Fp,Tn,Fn
1,0.0735,0.064666,0.986437,0.986501,0.986437,0.986434,6813,129,6206,50
2,0.0695,0.063466,0.987953,0.988018,0.987953,0.987955,6747,43,6292,116
3,0.0202,0.031794,0.99265,0.99266,0.99265,0.992651,6801,35,6300,62


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=9900, training_loss=0.07432453906897342, metrics={'train_runtime': 4585.6838, 'train_samples_per_second': 34.536, 'train_steps_per_second': 2.159, 'total_flos': 4.16688978373632e+16, 'train_loss': 0.07432453906897342, 'epoch': 3.0})

## Save Metrics and Model Information

In [11]:
# Evaluate on test dataset
results = trainer.evaluate(test_dataset)
torch.save(student_model.state_dict(), "./results/roberta/fine_tune/distilled_model/student_model.pt")
# Get model size and number of parameters
model_size = sum(p.numel() for p in student_model.parameters())
model_size_mb = os.path.getsize("./results/roberta/fine_tune/distilled_model/student_model.pt") / (1024 * 1024)

# Prepare results and model info
metrics = results.copy()  # Create a copy to add additional columns
metrics["model_size_mb"] = model_size_mb
metrics["num_parameters"] = model_size

# Convert to DataFrame and save to CSV
metrics_df = pd.DataFrame([metrics])
metrics_df.to_csv("./results/roberta/fine_tune/distilled_model/metrics.csv", index=False)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
