In [17]:
from datasets import load_dataset   
from torch.utils.data import Dataset   
from torch.utils.data import DataLoader   
import torch   
from transformers import AutoModelForSequenceClassification   
from torch import nn, optim   
from torch.nn import functional as F   
from transformers import AutoTokenizer   
from tqdm import tqdm   
from time import perf_counter
from typing import Dict, List, Tuple, Optional, Any
import pandas as pd
import numpy as np
import warnings 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

warnings.filterwarnings('ignore')

# Prefer Apple's Metal backend (MPS), then CUDA, and fall back to the CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(f"Using device: {device}")

Using device: mps


In [6]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes every text once, up front.

    All texts are tokenized in ``__init__`` with truncation and
    ``padding='max_length'``; the resulting tensors are kept on the instance,
    so ``__getitem__`` is a plain tensor index.
    """

    def __init__(self, data: Any, tokenizer: Any, max_length: int = 150):
        """
        Tokenize ``data['text']`` and store it alongside ``data['label']``.

        Args:
            data: Object indexable by the keys 'label' and 'text', each
                yielding an iterable with one entry per example.
            tokenizer: Callable invoked as ``tokenizer(texts,
                return_tensors='pt', truncation=True, padding='max_length',
                max_length=max_length)``; its result must be indexable by
                'input_ids' and 'attention_mask'.
            max_length: Maximum sequence length for tokenization.

        Raises:
            ValueError: If the number of labels differs from the number of texts.
        """
        # Materialize both columns as plain lists before handing them on.
        # NOTE(review): some `datasets` releases appear to return a lazy column
        # object (not a list) from `data['text']`, which tokenizers may reject;
        # confirm against the installed version.
        texts = list(data['text'])
        labels = list(data['label'])
        if len(labels) != len(texts):
            raise ValueError(
                f"Got {len(labels)} labels for {len(texts)} texts; "
                "the two columns must have the same length."
            )

        self.targets = torch.tensor(labels)

        tokens = tokenizer(
            texts,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=max_length
        )

        self.input_ids = tokens['input_ids']
        self.attention_mask = tokens['attention_mask']
        self.length = len(texts)

    def __len__(self) -> int:
        """Return the number of examples."""
        return self.length

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return ``(input_ids, attention_mask, target)`` for one example."""
        return self.input_ids[index], self.attention_mask[index], self.targets[index]


class DataManager:
    """Manages data loading and preprocessing for knowledge distillation."""

    def __init__(self, dataset_name: str, tokenizer: Any, test_size: float = 0.2,
                 max_length: int = 150, batch_size: int = 32,
                 seed: Optional[int] = 42):
        """
        Store the configuration; no data is loaded until ``prepare_data``.

        Args:
            dataset_name: Name of the dataset to load
            tokenizer: Tokenizer for text processing
            test_size: Fraction of the 'train' split held out for validation
            max_length: Maximum sequence length
            batch_size: Batch size for data loaders
            seed: Seed for the train/validation split and for the training
                loader's shuffling. Pass ``None`` for a different split and
                shuffle order on every run.
        """
        self.dataset_name = dataset_name
        self.tokenizer = tokenizer
        self.test_size = test_size
        self.max_length = max_length
        self.batch_size = batch_size
        self.seed = seed

        # Populated by prepare_data().
        self.train_loader = None
        self.valid_loader = None
        self.test_loader = None

    def prepare_data(self) -> Tuple[DataLoader, DataLoader, DataLoader]:
        """
        Load the dataset, carve out a validation split and build the loaders.

        The dataset's 'train' split is divided into train/validation using
        ``test_size`` and ``seed``; its 'test' split is used unchanged. Only
        the training loader shuffles, so evaluation order stays stable.

        Returns:
            Tuple of (train_loader, valid_loader, test_loader)
        """
        # Load dataset
        data = load_dataset(self.dataset_name)

        # Seeded split so the validation set is the same on every run.
        train_test = data['train'].train_test_split(
            test_size=self.test_size, shuffle=True, seed=self.seed
        )
        train_data = train_test['train']
        valid_data = train_test['test']
        test_data = data['test']

        # Create custom datasets
        train_dataset = TextDataset(train_data, self.tokenizer, self.max_length)
        valid_dataset = TextDataset(valid_data, self.tokenizer, self.max_length)
        test_dataset = TextDataset(test_data, self.tokenizer, self.max_length)

        # A dedicated generator keeps the shuffle order reproducible without
        # touching torch's global RNG state.
        shuffle_rng = None
        if self.seed is not None:
            shuffle_rng = torch.Generator()
            shuffle_rng.manual_seed(self.seed)

        # Create data loaders: reshuffle training batches each epoch, keep
        # validation/test in dataset order.
        self.train_loader = DataLoader(
            train_dataset, batch_size=self.batch_size,
            shuffle=True, generator=shuffle_rng
        )
        self.valid_loader = DataLoader(valid_dataset, batch_size=self.batch_size)
        self.test_loader = DataLoader(test_dataset, batch_size=self.batch_size)

        return self.train_loader, self.valid_loader, self.test_loader


In [10]:
# Restore the saved student checkpoint and its tokenizer from disk.
save_path = "/Users/arsalsyed/Documents/student_model_distilled"
tokenizer = AutoTokenizer.from_pretrained(save_path)
student_model = AutoModelForSequenceClassification.from_pretrained(save_path)
student_model = student_model.to(device)

# Build the "ag_news" train / validation / test loaders with that tokenizer.
data_manager = DataManager(
    dataset_name="ag_news", tokenizer=tokenizer,
    test_size=0.2, max_length=150, batch_size=64,
)
train_loader, valid_loader, test_loader = data_manager.prepare_data()

In [15]:
# def accuracy_score(batch, model):
#   with torch.no_grad():
#     outputs = model(
#         batch[0].to(device),
#         batch[1].to(device)
#     )
#     logits = outputs.logits
#     probabilities = torch.softmax(logits, dim = 1)
#     class_predictions = torch.argmax(probabilities, dim = 1)
#     acc = torch.mean((class_predictions == batch[2].to(device)).to(torch.float)).data.item()
#     return acc

def get_predictions(batch, model):
    """Run ``model`` on one batch and return hard class predictions.

    Args:
        batch: Triple ``(input_ids, attention_mask, labels)`` of tensors.
        model: Module called as ``model(input_ids, attention_mask=...)``. Its
            result is either an object with a ``logits`` attribute or the
            logits tensor itself.

    Returns:
        Tuple ``(predictions, labels)`` of NumPy arrays, where ``predictions``
        is the argmax of the logits over dimension 1 and ``labels`` is the
        batch's label tensor converted unchanged.
    """
    input_ids, attention_mask, labels = batch

    # Run on whatever device the model's parameters live on.
    device = next(model.parameters()).device
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # Force eval mode for the forward pass so train-only layers (e.g. dropout)
    # cannot make predictions stochastic; the caller's mode is restored after.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            # Handle different model output formats
            logits = outputs.logits if hasattr(outputs, 'logits') else outputs
            predictions = torch.argmax(logits, dim=1)
    finally:
        model.train(was_training)

    # Labels are only returned, never fed to the model, so they skip the
    # round-trip through the model's device.
    return predictions.cpu().numpy(), labels.cpu().numpy()

In [18]:
# Running totals for the student model's pass over the test loader.
student_predictions, student_labels = [], []
accuracy_student = 0.0       # sum of per-batch accuracies
time_taken_student = 0.0     # seconds spent inside get_predictions

print("Evaluating student model on test dataset...")
for batch in tqdm(test_loader):
    # Time only the get_predictions call.
    start_time = perf_counter()
    student_preds, true_labels = get_predictions(batch, student_model)
    end_time = perf_counter()
    time_taken_student += end_time - start_time

    # Keep every prediction/label so dataset-level metrics can be computed below.
    student_predictions += list(student_preds)
    student_labels += list(true_labels)

    # Per-batch accuracy, accumulated as a running sum.
    batch_acc = accuracy_score(true_labels, student_preds)
    accuracy_student += batch_acc

# Flatten the collected per-example values into arrays.
student_predictions = np.asarray(student_predictions)
student_labels = np.asarray(student_labels)

# Display names for the four label ids used in the report.
class_names = ['World', 'Sports', 'Business', 'Sci/Tech']

# Dataset-level metrics: plain accuracy plus weighted and macro averages.
accuracy = accuracy_score(student_labels, student_predictions)
precision, recall, f1, support = precision_recall_fscore_support(
    student_labels, student_predictions, average='weighted'
)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
    student_labels, student_predictions, average='macro'
)

print(f"\n{'='*60}")
print("Overall Accuracy")
print(f"{'='*60}")
print(accuracy)

Evaluating student model on test dataset...


100%|███████████████████████████████████████████████| 119/119 [00:41<00:00,  2.86it/s]


Overall Accuracy
0.9469736842105263





In [19]:
# Banner, then per-class precision / recall / F1 printed to 4 decimal places.
print(f"\n{'='*60}")
print("DETAILED CLASSIFICATION REPORT")
print(f"{'='*60}")
report_text = classification_report(
    student_labels,
    student_predictions,
    target_names=class_names,
    digits=4,
)
print(report_text)


DETAILED CLASSIFICATION REPORT
              precision    recall  f1-score   support

       World     0.9644    0.9547    0.9595      1900
      Sports     0.9874    0.9884    0.9879      1900
    Business     0.9216    0.9153    0.9184      1900
    Sci/Tech     0.9150    0.9295    0.9222      1900

    accuracy                         0.9470      7600
   macro avg     0.9471    0.9470    0.9470      7600
weighted avg     0.9471    0.9470    0.9470      7600



In [23]:
# Checkpoint identifiers for the dataset, the teacher model and the student model.
dataset_ckpt = 'ag_news'
teacher_model_ckpt = "fabriceyhc/bert-base-uncased-ag_news" # our already fine-tuned teacher model
student_model_ckpt = "distilbert-base-uncased"
# Tokenizer loaded from the student checkpoint; MyData below reads this global.
tokenizer = AutoTokenizer.from_pretrained(student_model_ckpt)

In [24]:
class MyData(Dataset):
  """Map-style dataset that tokenizes all texts once in ``__init__``.

  Each item is the triple ``(input_ids, attention_mask, target)``.
  """

  def __init__(self, data, text_tokenizer=None, max_length=150):
    """Tokenize ``data['text']`` and store it with ``data['label']``.

    Args:
      data: Object indexable by the keys 'label' and 'text'.
      text_tokenizer: Tokenizer callable to use. When omitted, the
        notebook-level ``tokenizer`` variable is used, so ``MyData(data)``
        behaves as before.
      max_length: Length every sequence is truncated / padded to.
    """
    active_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer

    # Materialize the columns as plain lists before tokenizing.
    # NOTE(review): some `datasets` releases appear to return a lazy column
    # object here rather than a list; confirm against the installed version.
    texts = list(data['text'])
    targets = list(data['label'])

    tokens = active_tokenizer(
      texts,
      return_tensors = 'pt',
      truncation = True,
      padding = 'max_length',
      max_length = max_length,
    )
    self.input_ids = tokens['input_ids']
    self.attention_mask = tokens['attention_mask']
    self.targets = torch.tensor(targets)
    self.length = len(texts)

  def __len__(self):
    """Return the number of examples."""
    return self.length

  def __getitem__(self, index):
    """Return ``(input_ids, attention_mask, target)`` for one example."""
    return self.input_ids[index], self.attention_mask[index], self.targets[index]

# Fixed seed so the train/validation partition and the training shuffle order
# are the same on every run.
SPLIT_SEED = 42

data = load_dataset(dataset_ckpt)
# Hold out 20% of the 'train' split for validation; 'test' is used as shipped.
train_test = data['train'].train_test_split(test_size = 0.2, seed = SPLIT_SEED)
valid_data = train_test['test']#.select(range(100))
train_data = train_test['train']#.select(range(100))
test_data = data['test']#.select(range(100))

def get_num_rows(dataset):
  """Return the ``num_rows`` attribute of ``dataset``."""
  return dataset.num_rows

print(f'Train set has {get_num_rows(train_data)} samples')
print(f'Validation set has {get_num_rows(valid_data)} samples')
print(f'Test set has {get_num_rows(test_data)} samples')

# From here on the *_data names hold tokenized MyData objects, not raw splits.
train_data = MyData(train_data)
valid_data = MyData(valid_data)
test_data = MyData(test_data)

# now we build our loaders
batch_size = 64
# Only the training loader shuffles; validation/test stay in dataset order.
train_loader = DataLoader(
  train_data,
  batch_size = batch_size,
  shuffle = True,
  generator = torch.Generator().manual_seed(SPLIT_SEED),
)
valid_loader = DataLoader(valid_data, batch_size = batch_size)
test_loader = DataLoader(test_data, batch_size = batch_size)


Train set has 96000 samples
Validation set has 24000 samples
Test set has 7600 samples


In [25]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Pick the accelerator: Apple MPS first, then CUDA, otherwise the CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(f"Using device: {device}")

# Reload the saved student checkpoint and its tokenizer; this rebinds the
# notebook-level `tokenizer` to the one stored next to the model.
save_path = "/Users/arsalsyed/Documents/student_model_distilled"
tokenizer = AutoTokenizer.from_pretrained(save_path)
student_model = AutoModelForSequenceClassification.from_pretrained(save_path)
student_model = student_model.to(device)


Using device: mps


In [26]:
def accuracy_score(batch, model):
  """Return the fraction of examples in ``batch`` that ``model`` gets right.

  NOTE(review): ``accuracy_score`` is also imported from ``sklearn.metrics``
  elsewhere in this file with a different signature; whichever binding runs
  last is the one later cells see.

  Args:
    batch: Indexable as ``(input_ids, attention_mask, labels)`` tensors.
    model: Module called as ``model(input_ids, attention_mask=...)`` whose
      result exposes a ``logits`` attribute.

  Returns:
    Mean of ``argmax(logits, dim=1) == labels`` as a Python float.
  """
  # Use the model's own device instead of relying on a global `device`.
  model_device = next(model.parameters()).device
  input_ids = batch[0].to(model_device)
  attention_mask = batch[1].to(model_device)
  labels = batch[2].to(model_device)

  # Score in eval mode so train-only layers (e.g. dropout) are disabled, then
  # put the model back in whatever mode the caller had it in.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      outputs = model(input_ids, attention_mask = attention_mask)
      # argmax over raw logits; a softmax first would not change the ranking.
      class_predictions = torch.argmax(outputs.logits, dim = 1)
      hits = (class_predictions == labels).to(torch.float)
      return torch.mean(hits).item()
  finally:
    model.train(was_training)

In [27]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from time import perf_counter
import numpy as np
from tqdm import tqdm

def get_predictions(batch, model):
    """Run ``model`` on one batch and return hard class predictions.

    Args:
        batch: Triple ``(input_ids, attention_mask, labels)`` of tensors.
        model: Module called as ``model(input_ids, attention_mask=...)``. Its
            result is either an object with a ``logits`` attribute or the
            logits tensor itself.

    Returns:
        Tuple ``(predictions, labels)`` of NumPy arrays, where ``predictions``
        is the argmax of the logits over dimension 1 and ``labels`` is the
        batch's label tensor converted unchanged.
    """
    input_ids, attention_mask, labels = batch

    # Run on whatever device the model's parameters live on.
    device = next(model.parameters()).device
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # Force eval mode for the forward pass so train-only layers (e.g. dropout)
    # cannot make predictions stochastic; the caller's mode is restored after.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            # Handle different model output formats
            logits = outputs.logits if hasattr(outputs, 'logits') else outputs
            predictions = torch.argmax(logits, dim=1)
    finally:
        model.train(was_training)

    # Labels are only returned, never fed to the model, so they skip the
    # round-trip through the model's device.
    return predictions.cpu().numpy(), labels.cpu().numpy()

In [28]:
# Per-example outputs collected across the whole test loader.
student_predictions, student_labels = [], []

# Running totals: summed per-batch accuracy and seconds spent predicting.
accuracy_student = 0.0
time_taken_student = 0.0

print("Evaluating student model on test dataset...")

for batch in tqdm(test_loader):
    # Time only the get_predictions call.
    start_time = perf_counter()
    student_preds, true_labels = get_predictions(batch, student_model)
    end_time = perf_counter()
    time_taken_student += end_time - start_time

    # Keep every prediction/label so dataset-level metrics can be computed below.
    student_predictions += list(student_preds)
    student_labels += list(true_labels)

    # Per-batch accuracy, accumulated as a running sum.
    batch_acc = accuracy_score(true_labels, student_preds)
    accuracy_student += batch_acc

# Flatten the collected per-example values into arrays.
student_predictions = np.asarray(student_predictions)
student_labels = np.asarray(student_labels)

# Display names for the four label ids used in the report.
class_names = ['World', 'Sports', 'Business', 'Sci/Tech']

# Dataset-level metrics: plain accuracy plus weighted and macro averages.
accuracy = accuracy_score(student_labels, student_predictions)
precision, recall, f1, support = precision_recall_fscore_support(
    student_labels, student_predictions, average='weighted'
)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
    student_labels, student_predictions, average='macro'
)


Using device: mps
Evaluating student model on test dataset...


100%|███████████████████████████████████████████████| 119/119 [00:44<00:00,  2.70it/s]


In [29]:
# Banner, then per-class precision / recall / F1 printed to 4 decimal places.
print(f"\n{'='*60}")
print("DETAILED CLASSIFICATION REPORT")
print(f"{'='*60}")
report_text = classification_report(
    student_labels,
    student_predictions,
    target_names=class_names,
    digits=4,
)
print(report_text)


DETAILED CLASSIFICATION REPORT
              precision    recall  f1-score   support

       World     0.9644    0.9547    0.9595      1900
      Sports     0.9874    0.9884    0.9879      1900
    Business     0.9216    0.9153    0.9184      1900
    Sci/Tech     0.9150    0.9295    0.9222      1900

    accuracy                         0.9470      7600
   macro avg     0.9471    0.9470    0.9470      7600
weighted avg     0.9471    0.9470    0.9470      7600

