In [None]:
# Install required packages for notebook environment
!pip install pandas==2.1.3 numpy==1.26.3 matplotlib seaborn scikit-learn torch torchvision torchaudio transformers==4.44.2 safetensors==0.4.2 datasets optuna wandb sentencepiece accelerate==0.33.0 evaluate --quiet

Collecting transformers==4.39.3
  Using cached transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
Collecting safetensors==0.4.2
  Using cached safetensors-0.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.39.3)
  Using cached tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.39.3-py3-none-any.whl (8.8 MB)
Using cached safetensors-0.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Using cached tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Installing collected packages: safetensors, tokenizers, transformers
  Attempting uninstall: safetensors
    Found existing installation: safetensors 0.6.2
    Uninstalling safetensors-0.6.2:
      Successfully uninstalled safetensors-0.6.2
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.4
  

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import wandb
import os
import warnings
# NOTE: this silences *every* Python warning for the rest of the session,
# including deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Ask CUDA for synchronous kernel launches so device-side errors are reported at
# the call that caused them. The variable is set before the first CUDA query
# below so that it is already in place when the CUDA runtime is first touched.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Default device for the notebook; later cells rebind DEVICE when they need CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [3]:
# Quick test to verify PyTorch checkpoints load correctly
def test_checkpoints():
    """Smoke-test that both fine-tuned ``.pt`` checkpoints load into their backbones.

    For each model the backbone is instantiated with a 5-label head, the
    checkpoint is read on CPU and its ``model_state_dict`` entry is loaded into
    the model. A short report (model class, parameter count) is printed.

    The weights are loaded with ``strict=False``, which does not raise on key
    mismatches, so any missing or unexpected keys are reported explicitly
    instead of being hidden behind a success message.

    Any exception is caught and printed so that one broken checkpoint does not
    prevent the other from being tested. Returns ``None``.
    """
    # (display name, backbone id, checkpoint path), in reporting order
    checkpoint_specs = [
        (
            "DeBERTa",
            "agentlans/deberta-v3-base-tweet-sentiment",
            "/storage/yagel/ADL/checkpoints/deberta_hp_tuning_study/best_model/best_model.pt",
        ),
        (
            "RoBERTa",
            "cardiffnlp/twitter-roberta-base-sentiment-latest",
            "/storage/yagel/ADL/checkpoints/roberta_hp_tuning_study/best_model/best_model.pt",
        ),
    ]

    print("Testing PyTorch checkpoint loading...")

    for idx, (name, backbone, ckpt_path) in enumerate(checkpoint_specs, start=1):
        try:
            print(f"\n{idx}. Testing {name} checkpoint...")
            candidate = AutoModelForSequenceClassification.from_pretrained(
                backbone,
                num_labels=5,
                ignore_mismatched_sizes=True
            )
            ckpt = torch.load(ckpt_path, map_location="cpu")

            # load_state_dict returns the keys it could not match; with
            # strict=False this is the only place a mismatch becomes visible.
            load_result = candidate.load_state_dict(ckpt['model_state_dict'], strict=False)
            if load_result.missing_keys or load_result.unexpected_keys:
                print(f"⚠️ {name} checkpoint loaded with key mismatches!")
                print(f"   Missing keys: {list(load_result.missing_keys)}")
                print(f"   Unexpected keys: {list(load_result.unexpected_keys)}")
            else:
                print(f"✅ {name} checkpoint loaded successfully!")

            print(f"   Model type: {type(candidate).__name__}")
            print(f"   Number of parameters: {sum(p.numel() for p in candidate.parameters()):,}")
            # Drop the references before the next (large) model is built.
            del candidate, ckpt

        except Exception as e:
            print(f"❌ {name} checkpoint failed: {e}")

    print("\n" + "="*50)
    print("Checkpoint test completed!")

# Run the test
test_checkpoints()

Testing PyTorch checkpoint loading...

1. Testing DeBERTa checkpoint...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at agentlans/deberta-v3-base-tweet-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([1, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ DeBERTa checkpoint loaded successfully!
   Model type: DebertaV2ForSequenceClassification
   Number of parameters: 184,425,989

2. Testing RoBERTa checkpoint...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

✅ RoBERTa checkpoint loaded successfully!
   Model type: RobertaForSequenceClassification
   Number of parameters: 124,649,477

Checkpoint test completed!


In [None]:
# Test loading HuggingFace checkpoints for RoBERTa and DeBERTa

def test_hf_checkpoints():
    """Smoke-test the two local HuggingFace checkpoint directories.

    Each directory is loaded with ``AutoModelForSequenceClassification``. On
    success the model class and parameter count are printed; on failure the
    exception is printed and the next checkpoint is still attempted.
    Returns ``None``.
    """
    # (display name, checkpoint directory), in the order they are reported
    checkpoint_dirs = (
        ("DeBERTa", "/storage/yagel/ADL/deberta_results_HF/deberta_trial_4/checkpoint-4950"),
        ("RoBERTa", "/storage/yagel/ADL/roberta_results_HF/trial_7/checkpoint-18000"),
    )

    print("Testing HuggingFace checkpoints...")

    for position, (family, ckpt_dir) in enumerate(checkpoint_dirs, start=1):
        try:
            print(f"\n{position}. Testing {family} HF checkpoint...")
            loaded = AutoModelForSequenceClassification.from_pretrained(ckpt_dir)
            print(f"✅ {family} HF checkpoint loaded successfully!")
            print(f"   Model type: {type(loaded).__name__}")
            param_count = sum(weight.numel() for weight in loaded.parameters())
            print(f"   Number of parameters: {param_count:,}")
            # Release the model before the next one is loaded.
            del loaded
        except Exception as e:
            print(f"❌ {family} HF checkpoint failed: {e}")

    print("\n" + "="*50)
    print("HF Checkpoint test completed!")

# Run the test
test_hf_checkpoints()

In [4]:
def _load_pt_model(backbone, ckpt_path, label):
    """Build ``backbone`` with a 5-label head and load fine-tuned weights from ``ckpt_path``.

    Args:
        backbone: Hub id of the pretrained model that defines the architecture.
        ckpt_path: Path to a ``torch.save`` file containing a ``model_state_dict`` entry.
        label: Name used in the notice printed when checkpoint keys do not match.

    Returns:
        The model, moved to ``DEVICE``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        backbone,
        num_labels=5,
        ignore_mismatched_sizes=True
    )
    # The freshly built model is still on CPU, so read the checkpoint on CPU as
    # well; mapping it to the GPU would only hold a second copy of the weights
    # in device memory until the state dict is released.
    ckpt = torch.load(ckpt_path, map_location="cpu")
    load_result = model.load_state_dict(ckpt['model_state_dict'], strict=False)
    # strict=False never raises on mismatched keys, so report them here.
    if load_result.missing_keys or load_result.unexpected_keys:
        print(f"⚠️ {label}: checkpoint keys did not fully match the model")
        print(f"   Missing keys: {list(load_result.missing_keys)}")
        print(f"   Unexpected keys: {list(load_result.unexpected_keys)}")
    return model.to(DEVICE)


def load_models():
    """Load the four fine-tuned sentiment classifiers onto ``DEVICE``.

    Returns:
        dict mapping ``'hf_deberta'``, ``'hf_roberta'`` (HuggingFace checkpoint
        directories) and ``'pt_deberta'``, ``'pt_roberta'`` (backbone plus
        ``.pt`` state dict) to the loaded models, in that insertion order.
    """
    models = {}

    # Model 1: HuggingFace checkpoint 1 - DeBERTa (from checkpoint directory)
    print("Loading HF Model 1 - DeBERTa...")
    models['hf_deberta'] = AutoModelForSequenceClassification.from_pretrained(
        "/storage/yagel/ADL/deberta_results_HF/deberta_trial_4/checkpoint-4950"
    ).to(DEVICE)

    # Model 2: HuggingFace checkpoint 2 - RoBERTa (from checkpoint directory)
    print("Loading HF Model 2 - RoBERTa...")
    models['hf_roberta'] = AutoModelForSequenceClassification.from_pretrained(
        "/storage/yagel/ADL/roberta_results_HF/trial_7/checkpoint-18000"
    ).to(DEVICE)

    # Model 3: PyTorch checkpoint 1 - DeBERTa (fine-tuned)
    print("Loading PT Model 1 - DeBERTa (fine-tuned)...")
    models['pt_deberta'] = _load_pt_model(
        "agentlans/deberta-v3-base-tweet-sentiment",
        "/storage/yagel/ADL/checkpoints/deberta_hp_tuning_study/best_model/best_model.pt",
        "pt_deberta",
    )

    # Model 4: PyTorch checkpoint 2 - RoBERTa (fine-tuned)
    print("Loading PT Model 2 - RoBERTa (fine-tuned)...")
    models['pt_roberta'] = _load_pt_model(
        "cardiffnlp/twitter-roberta-base-sentiment-latest",
        "/storage/yagel/ADL/checkpoints/roberta_hp_tuning_study/best_model/best_model.pt",
        "pt_roberta",
    )

    return models

# Load all models
models = load_models()
print(f"Loaded {len(models)} models: {list(models.keys())}")

Loading HF Model 1 - DeBERTa...
Loading HF Model 2 - RoBERTa...
Loading PT Model 1 - DeBERTa (fine-tuned)...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at agentlans/deberta-v3-base-tweet-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([1, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading PT Model 2 - RoBERTa (fine-tuned)...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Loaded 4 models: ['hf_deberta', 'hf_roberta', 'pt_deberta', 'pt_roberta']


In [5]:
def _load_split(csv_path):
    """Read one data split and return a clean ``CleanTweet`` / ``label`` frame.

    Labels are converted to integers and must lie in ``[0, 4]``.

    Raises:
        ValueError: if any label is missing or non-numeric, or falls outside ``[0, 4]``.
    """
    raw = pd.read_csv(csv_path)

    # errors="coerce" turns unparsable labels into NaN; check for them explicitly
    # so the failure names the file instead of surfacing as an integer-cast error.
    labels = pd.to_numeric(raw["label"], errors="coerce")
    n_bad = int(labels.isna().sum())
    if n_bad:
        raise ValueError(f"{csv_path}: {n_bad} label(s) are missing or non-numeric")

    labels = labels.astype(int)
    if not labels.between(0, 4).all():
        raise ValueError(f"Labels out of range in {csv_path}: {sorted(set(labels))}")

    # Build a new frame (no in-place mutation) with only the model inputs.
    return raw.assign(label=labels)[["CleanTweet", "label"]].reset_index(drop=True)


# ensure int labels, keep only the text and label columns
train_df = _load_split("data/train_df.csv")
eval_df  = _load_split("data/eval_df.csv")
test_df  = _load_split("data/test_df.csv")

class TweetsDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Reads the ``CleanTweet`` and ``label`` columns of ``dataframe``. Each item
    is a dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors;
    the text is padded/truncated to ``max_length`` tokens.
    """

    def __init__(self, dataframe, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = dataframe["CleanTweet"].tolist()
        # Cast first so the stored labels are plain Python ints.
        self.labels = dataframe["label"].astype(int).tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenizer_options = dict(
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_options)

        # Drop only the leading batch axis of each encoded tensor.
        item = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# The DeBERTa backbone's slow (non-fast) tokenizer is shared by all three splits.
tokenizer = AutoTokenizer.from_pretrained(
    "agentlans/deberta-v3-base-tweet-sentiment",
    use_fast=False,
)

# One dataset per split, built in train / eval / test order.
train_dataset, val_dataset, test_dataset = (
    TweetsDataset(split_frame, tokenizer)
    for split_frame in (train_df, eval_df, test_df)
)

# Only the training loader shuffles; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=5)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=5)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=5)

print(
    "Dataset sizes - "
    + ", ".join(
        f"{split_name}: {len(split_dataset)}"
        for split_name, split_dataset in (
            ("Train", train_dataset),
            ("Val", val_dataset),
            ("Test", test_dataset),
        )
    )
)


Dataset sizes - Train: 28800, Val: 12343, Test: 3798


In [6]:
# Load the fine-tuned DeBERTa classifier from its local .pt checkpoint
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Build the backbone with a fresh 5-label head (its shape differs from the
# published head, hence ignore_mismatched_sizes); the trained weights are
# loaded on top of it below.
model = AutoModelForSequenceClassification.from_pretrained(
    'agentlans/deberta-v3-base-tweet-sentiment',
    ignore_mismatched_sizes=True,
    num_labels=5,
)

# Checkpoint written by the hyper-parameter tuning study
checkpoint_path = "checkpoints/deberta_hp_tuning_study/best_model/best_model.pt"
checkpoint = torch.load(checkpoint_path, map_location=DEVICE)

# A checkpoint is either a bare state dict or a dict carrying study metadata
# next to the weights.
if 'model_state_dict' not in checkpoint:
    model_weights = checkpoint
else:
    model_weights = checkpoint['model_state_dict']
    print(f"Loaded checkpoint metadata - Study: {checkpoint.get('study_name', 'N/A')}")
    print(f"Best trial: {checkpoint.get('study_best_trial', 'N/A')}, Best accuracy: {checkpoint.get('best_val_accuracy', 'N/A')}")

# Strict load: any key mismatch raises here.
model.load_state_dict(model_weights)
model.to(DEVICE)

print(
    "\n".join(
        (
            f"Model loaded from: {checkpoint_path}",
            f"Model loaded on {DEVICE}",
            f"Model type: {type(model).__name__}",
            f"Number of parameters: {sum(p.numel() for p in model.parameters()):,}",
        )
    )
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at agentlans/deberta-v3-base-tweet-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([1, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded checkpoint metadata - Study: deberta_hp_tuning_study
Best trial: 0, Best accuracy: 0.7409867941343271
Model loaded from: checkpoints/deberta_hp_tuning_study/best_model/best_model.pt
Model loaded on cuda
Model type: DebertaV2ForSequenceClassification
Number of parameters: 184,425,989


In [7]:
# Preview the first rows of the test split; as the cell's last expression the
# frame is shown with the notebook's rich DataFrame display.
test_df.head()

Unnamed: 0,CleanTweet,label
0,TRENDING: New Yorkers encounter empty supermar...,0
1,When I couldn't find hand sanitizer at Fred Me...,3
2,Find out how you can protect yourself and love...,4
3,Panic buying hits NewYork City as anxious shop...,1
4,toiletpaper dunnypaper coronavirus coronavirus...,2


# Quantization

In [None]:
# Function to measure model size in KB
def get_model_size(model, filename="temp.pth"):
    """Return the on-disk size of ``model.state_dict()`` in kilobytes (bytes / 1024).

    The state dict is saved to ``filename`` with ``torch.save``, measured, and
    the file is removed again.

    Args:
        model: Object exposing ``state_dict()``.
        filename: Scratch path used for the temporary file; an existing file at
            this path is overwritten and then deleted.

    Returns:
        Size of the serialized state dict as a float number of KB.
    """
    try:
        torch.save(model.state_dict(), filename)
        return os.path.getsize(filename) / 1024  # bytes -> KB
    finally:
        # Clean up even when saving or sizing fails part-way, so a failed
        # measurement does not leave a (possibly large) file behind.
        if os.path.exists(filename):
            os.remove(filename)

# Function to evaluate accuracy
def evaluate_accuracy(model, dataloader, device):
    """Return the fraction of examples in ``dataloader`` that ``model`` classifies correctly.

    Args:
        model: Classifier called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``. It is switched to eval mode; it is
            not moved, so it must already be on ``device``.
        dataloader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: Device every tensor of each batch is moved to.

    Returns:
        ``correct / total`` as a float.
    """
    model.eval()
    n_correct, n_seen = 0, 0

    with torch.no_grad():
        for raw_batch in dataloader:
            # Move every tensor of the batch to the evaluation device.
            on_device = {key: tensor.to(device) for key, tensor in raw_batch.items()}

            logits = model(
                input_ids=on_device["input_ids"],
                attention_mask=on_device["attention_mask"],
            ).logits
            targets = on_device["labels"]

            n_correct += logits.argmax(dim=1).eq(targets).sum().item()
            n_seen += targets.size(0)

    return n_correct / n_seen


# --- Main test loop ---
def quantization_sweep(model, test_loader):
    """Compare the FP32 model with a dynamically INT8-quantized copy.

    The FP32 baseline is measured on whatever device ``model`` currently lives
    on. The model is then moved to CPU, its ``nn.Linear`` layers are
    dynamically quantized to ``qint8``, and the quantized copy is measured on
    CPU. The original model is moved back to its starting device afterwards,
    even if quantization or evaluation fails.

    Args:
        model: Classifier accepted by ``evaluate_accuracy``.
        test_loader: Evaluation batches accepted by ``evaluate_accuracy``.

    Returns:
        List of ``(format_name, size_kb, accuracy, percent_size_saved)`` tuples:
        first the FP32 baseline, then the INT8 model.
    """
    # Imported here because no visible notebook cell imports it.
    from torch.quantization import quantize_dynamic

    results = []

    # Use the model's own device rather than the global DEVICE, which several
    # cells rebind; fall back to CPU for a parameter-free model.
    first_param = next(model.parameters(), None)
    original_device = first_param.device if first_param is not None else torch.device('cpu')

    # Original FP32 model, evaluated where it currently lives
    model.eval()
    fp32_size = get_model_size(model, "fp32.pth")
    fp32_acc = evaluate_accuracy(model, test_loader, original_device)
    results.append(("FP32 (baseline)", fp32_size, fp32_acc, 0.0))

    # INT8 Dynamic quantization (Linear layers only).
    # The model is moved to CPU for quantization; .cpu() moves it in place.
    model_cpu = model.cpu()
    try:
        int8_model = quantize_dynamic(model_cpu, {torch.nn.Linear}, dtype=torch.qint8)
        int8_size = get_model_size(int8_model, "int8.pth")

        # Evaluate quantized model on CPU (quantized models can't run on GPU)
        int8_acc = evaluate_accuracy(int8_model, test_loader, torch.device('cpu'))
    finally:
        # Put the caller's model back where it was found.
        model.to(original_device)
    results.append(("INT8 Linear", int8_size, int8_acc, 100*(1 - int8_size/fp32_size)))

    # Print results
    print(f"{'Format':<15} {'Size (KB)':<12} {'Accuracy':<10} {'% Size Saved':<12}")
    print("-" * 55)
    for name, size, acc, saving in results:
        print(f"{name:<15} {size:<12.1f} {acc:<10.4f} {saving:<12.1f}")

    return results

In [None]:
#DeBERTA manual code (pt checkpoint)
# `model` and `test_loader` are whatever the earlier cells last bound to those
# names. The model is put on DEVICE first because quantization_sweep evaluates
# its FP32 baseline on the global DEVICE.
model.to(DEVICE)
results = quantization_sweep(model, test_loader)
print("Quantization results:", results)

In [None]:
#RoBERTA manual (pt checkpoint)


# Paths & backbone
roberta_backbone = "cardiffnlp/twitter-roberta-base-sentiment-latest"
roberta_ckpt     = "checkpoints/roberta_hp_tuning_study/best_model/best_model.pt"

# The whole sweep runs on CPU. DEVICE is rebound up front so the checkpoint and
# model are placed on CPU directly instead of being moved to the GPU and
# straight back. NOTE: this rebinding stays in effect for later cells.
DEVICE = torch.device("cpu")

# Build loader with the **matching tokenizer**
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_backbone, use_fast=True)
test_dataset = TweetsDataset(test_df, roberta_tokenizer, max_length=256)
test_loader  = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Sanity check on the label range for 5 classes: show the offending rows
# before failing. (A single explicit check; unlike `assert` it is not skipped
# when Python runs with -O.)
invalid_labels = test_df[~test_df["label"].between(0, 4)]
if not invalid_labels.empty:
    print("❌ Invalid labels found:\n", invalid_labels)
    raise ValueError("Test set contains invalid labels.")

# Load model + weights (strict load: mismatched keys raise)
roberta_model = AutoModelForSequenceClassification.from_pretrained(
    roberta_backbone, num_labels=5, ignore_mismatched_sizes=True
)
state = torch.load(roberta_ckpt, map_location=DEVICE)
roberta_model.load_state_dict(state["model_state_dict"])
roberta_model = roberta_model.to(DEVICE)
roberta_model.eval()

# Sanity check: token IDs fit vocab
batch = next(iter(test_loader))
max_id = int(batch["input_ids"].max())
vocab_sz = roberta_model.roberta.embeddings.word_embeddings.num_embeddings
print(f"Max token id = {max_id} | vocab size = {vocab_sz}")
assert max_id < vocab_sz, "Token ID exceeds model vocab size – tokenizer/model mismatch."

results = quantization_sweep(roberta_model, test_loader)

print("RoBERTa (PT) Quantization results:", results)



In [None]:
# ===============================
# DeBERTa: HF checkpoint + sweep
# ===============================
# Runs the FP32-vs-INT8 comparison on the DeBERTa model stored in a local
# HuggingFace checkpoint directory. Rebinds the globals `test_dataset`,
# `test_loader`, `results` and `DEVICE`.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# DeBERTa paths and backbone
deberta_backbone = "agentlans/deberta-v3-base-tweet-sentiment"  # Or your custom one if different

# Tokenizer and loader (the tokenizer comes from the backbone, not the checkpoint dir)
deberta_tokenizer = AutoTokenizer.from_pretrained(deberta_backbone, use_fast=True)
test_dataset = TweetsDataset(test_df, deberta_tokenizer, max_length=256)
test_loader  = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Sanity check: label range
assert test_df["label"].between(0, 4).all(), "Labels must be in [0..4] for num_labels=5."

# Load model from the checkpoint directory (path is relative to the working directory)
deberta_model = AutoModelForSequenceClassification.from_pretrained(
    "deberta_results_HF/deberta_trial_4/checkpoint-4950", num_labels=5
)


# Run sweep on CPU
# DEVICE is rebound to CPU because quantization_sweep evaluates its FP32
# baseline on the global DEVICE; the rebinding persists for later cells.
DEVICE = torch.device("cpu")
deberta_model.to(DEVICE)
results = quantization_sweep(deberta_model, test_loader)
print("DeBERTa (HF) Quantization results:", results)


In [None]:
# ===============================
# RoBERTa: HF CHECKPOINT + SWEEP
# ===============================
# Runs the FP32-vs-INT8 comparison on the RoBERTa model stored in a local
# HuggingFace checkpoint directory. Rebinds the globals `roberta_ckpt`,
# `roberta_model`, `test_dataset`, `test_loader`, `results` and `DEVICE`.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer from base model
roberta_backbone = "cardiffnlp/twitter-roberta-base-sentiment-latest"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_backbone, use_fast=True)

# Loader
test_dataset = TweetsDataset(test_df, roberta_tokenizer, max_length=256)
test_loader  = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Sanity check
assert test_df["label"].between(0, 4).all(), "Labels must be in [0..4] for num_labels=5."

# Load model from local HF checkpoint (path is relative to the working directory)
roberta_ckpt = "roberta_results_HF/trial_7/checkpoint-18000"
roberta_model = AutoModelForSequenceClassification.from_pretrained(
    roberta_ckpt,
    num_labels=5,
    local_files_only=True  # only use files already on disk; never attempt a download
)

# Run sweep
# DEVICE is rebound to CPU because quantization_sweep evaluates its FP32
# baseline on the global DEVICE; the rebinding persists for later cells.
DEVICE = torch.device("cpu")
roberta_model.to(DEVICE)
results = quantization_sweep(roberta_model, test_loader)

print("RoBERTa (HF checkpoint) Quantization results:", results)


In [None]:
# Re-select the default device; the quantization cells above rebind DEVICE to
# CPU. Note that DEVICE is a plain string here, not a torch.device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Pruning

In [None]:
import torch
import torch.nn.utils.prune as prune
import copy
import matplotlib.pyplot as plt
import pandas as pd

def _prune_linear_layers(model, amount):
    """Globally L1-prune ``amount`` of all ``nn.Linear`` weights of ``model`` in place.

    Returns the list of ``nn.Linear`` modules that were considered for pruning.
    """
    linear_layers = [
        module for module in model.modules() if isinstance(module, torch.nn.Linear)
    ]
    print(f"Found {len(linear_layers)} Linear layers to prune")

    # Nothing to prune for a model without Linear layers; skip the call.
    if linear_layers:
        prune.global_unstructured(
            [(layer, 'weight') for layer in linear_layers],
            pruning_method=prune.L1Unstructured,
            amount=amount
        )
    return linear_layers


def _masked_sparsity(linear_layers):
    """Return the fraction of Linear weights zeroed by pruning masks (0 if none are masked)."""
    total_params = 0
    zero_params = 0
    for layer in linear_layers:
        if hasattr(layer, 'weight_mask'):
            total_params += layer.weight.numel()
            zero_params += (layer.weight_mask == 0).sum().item()
    return zero_params / total_params if total_params > 0 else 0


def _plot_pruning_tradeoff(results_df, amounts):
    """Plot accuracy against measured sparsity, with one x tick per requested amount."""
    plt.figure(figsize=(10, 6))
    plt.plot(results_df['actual_sparsity'] * 100, results_df['accuracy'], 'bo-', linewidth=2, markersize=8)
    plt.xlabel('Sparsity (%)', fontsize=12)
    plt.ylabel('Accuracy', fontsize=12)
    plt.title('Model Pruning: Sparsity vs Accuracy Trade-off', fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)
    # Ticks follow the amounts that were actually swept (rounded to hide
    # floating-point noise such as 0.3 * 100) rather than a fixed 10..50 grid.
    plt.xticks([round(amount * 100, 6) for amount in amounts])

    # Add accuracy values as annotations
    for _, row in results_df.iterrows():
        plt.annotate(f'{row["accuracy"]:.3f}',
                     (row['actual_sparsity']*100, row['accuracy']),
                     textcoords="offset points", xytext=(0,10), ha='center', fontsize=10)

    plt.tight_layout()
    plt.show()


def pruning_tradeoff_sweep_cpu(model, test_loader, amounts=None):
    """
    CPU-only pruning sweep with results table and plot

    For every pruning amount a deep copy of ``model`` is moved to CPU, all of
    its ``nn.Linear`` weights are pruned with global L1 unstructured pruning,
    and the copy is evaluated with ``evaluate_accuracy_cpu``. The input model
    itself is never modified.

    Args:
        model: Input model to prune
        test_loader: DataLoader for evaluation (batches are used as-is, i.e. on CPU)
        amounts: Iterable of pruning fractions (default: 10%, 20%, 30%, 40%, 50%).
            An empty iterable yields an empty result and no plot.

    Returns:
        results: List of dictionaries with keys ``pruning_amount``,
        ``actual_sparsity`` and ``accuracy``, one per amount
    """
    if amounts is None:
        amounts = [0.1, 0.2, 0.3, 0.4, 0.5]  # 10%, 20%, 30%, 40%, 50%
    # Materialise once: the amounts are needed for both the sweep and the plot.
    amounts = list(amounts)

    print("🔬 Starting CPU Pruning Sweep")
    print("=" * 40)

    results = []

    for amount in amounts:
        print(f"\n📊 Testing pruning amount: {amount*100:.0f}%")

        # Prune a private copy so every amount starts from the unpruned weights.
        # The copy is moved to CPU because the evaluation helper feeds it
        # batches without moving them to another device.
        model_to_prune = copy.deepcopy(model).cpu()

        pruned_layers = _prune_linear_layers(model_to_prune, amount)

        actual_sparsity = _masked_sparsity(pruned_layers)
        print(f"Actual sparsity: {actual_sparsity:.3f} ({actual_sparsity*100:.1f}%)")

        # Evaluate accuracy on CPU
        acc = evaluate_accuracy_cpu(model_to_prune, test_loader)

        results.append({
            'pruning_amount': amount,
            'actual_sparsity': actual_sparsity,
            'accuracy': acc
        })

        print(f"Accuracy: {acc:.4f}")

        # Release the copy before the next one is created
        del pruned_layers, model_to_prune

    # Print final results table
    print("\n📈 PRUNING RESULTS SUMMARY")
    print("=" * 50)
    print("Pruning | Actual  | Accuracy")
    print("Amount  | Sparsity|         ")
    print("-" * 50)
    for result in results:
        print(f"{result['pruning_amount']*100:6.0f}% | {result['actual_sparsity']*100:6.1f}%  | {result['accuracy']:.4f}")

    # Create plot (an empty sweep has no columns to plot)
    if results:
        print("\n📊 Generating plot...")
        _plot_pruning_tradeoff(pd.DataFrame(results), amounts)
    else:
        print("\nNo pruning amounts given - skipping plot.")

    print("✅ Pruning analysis complete!")

    return results

def evaluate_accuracy_cpu(model, test_loader):
    """
    Accuracy evaluation that uses every batch as-is (no device transfers)

    The model is switched to eval mode and called as
    ``model(input_ids=..., attention_mask=...)``; the arg-max over the last
    logits axis is compared with the batch labels.

    Args:
        model: Model to evaluate; its output must expose ``.logits``
        test_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels``

    Returns:
        accuracy: correct predictions divided by the number of examples
    """
    model.eval()
    hits = 0
    seen = 0

    with torch.no_grad():
        for batch in test_loader:
            token_ids, mask, targets = (
                batch[field] for field in ('input_ids', 'attention_mask', 'labels')
            )

            logits = model(input_ids=token_ids, attention_mask=mask).logits

            hits += logits.argmax(dim=-1).eq(targets).sum().item()
            seen += targets.size(0)

    return hits / seen

In [None]:
# ===============================
# RoBERTa: PT checkpoint + PRUNING (CPU)
# ===============================
# Rebuilds the RoBERTa classifier from backbone + .pt state dict on CPU and runs
# the pruning sweep. Rebinds the globals `DEVICE`, `test_dataset`, `test_loader`,
# `roberta_model` and `state`.
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

DEVICE = torch.device("cpu")

# Backbone + PT checkpoint path
roberta_backbone = "cardiffnlp/twitter-roberta-base-sentiment-latest"  # 3-class base head
roberta_pt_ckpt  = "checkpoints/roberta_hp_tuning_study/best_model/best_model.pt"

# Tokenizer & loader
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_backbone, use_fast=True)
test_dataset = TweetsDataset(test_df, roberta_tokenizer, max_length=256)
test_loader  = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Load base arch w/ new 5-class head, then load your fine-tuned PT weights
roberta_model = AutoModelForSequenceClassification.from_pretrained(
    roberta_backbone, num_labels=5, ignore_mismatched_sizes=True
)
state = torch.load(roberta_pt_ckpt, map_location=DEVICE)
# NOTE(review): strict=False does not raise on missing/unexpected keys; the
# quantization cell loads this same checkpoint path with the strict default.
roberta_model.load_state_dict(state["model_state_dict"], strict=False)
roberta_model.to(DEVICE).eval()

# Run CPU pruning sweep
pruning_results_roberta_pt = pruning_tradeoff_sweep_cpu(
    roberta_model, test_loader, amounts=[0.1, 0.2, 0.3, 0.4, 0.5]
)
# Last expression: display the list of per-amount result dicts
pruning_results_roberta_pt


In [None]:
# ===============================
# RoBERTa: HF checkpoint dir + PRUNING (CPU)
# ===============================
# Loads the RoBERTa classifier from a local HuggingFace checkpoint directory on
# CPU and runs the pruning sweep. Rebinds the globals `DEVICE`, `test_dataset`
# and `test_loader`.
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

DEVICE = torch.device("cpu")

# Your HF checkpoint directory
roberta_hf_ckpt = "roberta_results_HF/trial_7/checkpoint-18000"

# Use tokenizer from backbone (checkpoint dir may not have tokenizer files)
roberta_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest", use_fast=True)
test_dataset = TweetsDataset(test_df, roberta_tokenizer, max_length=256)
test_loader  = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Load model from local HF checkpoint dir (never attempts a download)
roberta_model_hf = AutoModelForSequenceClassification.from_pretrained(
    roberta_hf_ckpt, local_files_only=True
)
# (optional) enforce 5 labels if config differs
if roberta_model_hf.config.num_labels != 5:
    # The replacement layer is freshly initialised, i.e. untrained. Say so
    # loudly: accuracies measured with it do not reflect the fine-tuned model.
    # (print, not warnings.warn, because the notebook silences all warnings.)
    print(
        f"⚠️ Checkpoint config has num_labels={roberta_model_hf.config.num_labels}, expected 5; "
        "replacing the classifier output layer with an UNTRAINED 5-class layer."
    )
    roberta_model_hf.classifier.out_proj = torch.nn.Linear(
        roberta_model_hf.classifier.out_proj.in_features, 5
    )
    roberta_model_hf.config.num_labels = 5

roberta_model_hf.to(DEVICE).eval()

# Run CPU pruning sweep
pruning_results_roberta_hf = pruning_tradeoff_sweep_cpu(
    roberta_model_hf, test_loader, amounts=[0.1, 0.2, 0.3, 0.4, 0.5]
)
# Last expression: display the list of per-amount result dicts
pruning_results_roberta_hf


In [None]:
# ===============================
# DeBERTa: HF checkpoint dir + PRUNING (CPU)
# ===============================
# Loads the DeBERTa classifier from a local HuggingFace checkpoint directory on
# CPU and runs the pruning sweep. Rebinds the globals `DEVICE`, `test_dataset`
# and `test_loader`.
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

DEVICE = torch.device("cpu")

# HF checkpoint directory you trained with Trainer.save_model / checkpoint-*
deberta_hf_ckpt = "deberta_results_HF/deberta_trial_4/checkpoint-4950"

# Use tokenizer from backbone (checkpoint dir often lacks tokenizer files).
# Same tokenizer id as every other DeBERTa cell in this notebook, so this
# checkpoint is tokenized identically in the quantization and pruning runs.
deberta_tokenizer = AutoTokenizer.from_pretrained("agentlans/deberta-v3-base-tweet-sentiment", use_fast=True)
test_dataset = TweetsDataset(test_df, deberta_tokenizer, max_length=256)
test_loader  = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Load model weights from local HF checkpoint dir
deberta_model_hf = AutoModelForSequenceClassification.from_pretrained(
    deberta_hf_ckpt, local_files_only=True   # only use files on disk; never download
)
# (optional) enforce num_labels=5 if your config wasn’t saved with 5:
if deberta_model_hf.config.num_labels != 5:
    # The replacement layer is freshly initialised, i.e. untrained. Say so
    # loudly: accuracies measured with it do not reflect the fine-tuned model.
    # (print, not warnings.warn, because the notebook silences all warnings.)
    print(
        f"⚠️ Checkpoint config has num_labels={deberta_model_hf.config.num_labels}, expected 5; "
        "replacing the classifier with an UNTRAINED 5-class layer."
    )
    deberta_model_hf.classifier = torch.nn.Linear(deberta_model_hf.classifier.in_features, 5)
    deberta_model_hf.config.num_labels = 5

deberta_model_hf.to(DEVICE).eval()

# Run CPU pruning sweep
pruning_results_deberta_hf = pruning_tradeoff_sweep_cpu(
    deberta_model_hf, test_loader, amounts=[0.1, 0.2, 0.3, 0.4, 0.5]
)
# Last expression: display the list of per-amount result dicts
pruning_results_deberta_hf


In [None]:
# ===============================
# DeBERTa: PT checkpoint + PRUNING (CPU)
# ===============================
# Rebuilds the DeBERTa classifier from backbone + .pt state dict on CPU and runs
# the pruning sweep. Rebinds the globals `DEVICE`, `test_dataset`, `test_loader`,
# `deberta_model` and `state`.
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

DEVICE = torch.device("cpu")

# Backbone + PT checkpoint path
deberta_backbone = "agentlans/deberta-v3-base-tweet-sentiment"
deberta_pt_ckpt  = "checkpoints/deberta_hp_tuning_study/best_model/best_model.pt"

# Tokenizer & loader (use backbone tokenizer)
deberta_tokenizer = AutoTokenizer.from_pretrained(deberta_backbone, use_fast=True)
test_dataset = TweetsDataset(test_df, deberta_tokenizer, max_length=256)
test_loader  = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Load base arch then your fine-tuned PT weights
deberta_model = AutoModelForSequenceClassification.from_pretrained(
    deberta_backbone, num_labels=5, ignore_mismatched_sizes=True
)
state = torch.load(deberta_pt_ckpt, map_location=DEVICE)
# NOTE(review): strict=False does not raise on missing/unexpected keys; an
# earlier cell loads this same checkpoint path with the strict default.
deberta_model.load_state_dict(state["model_state_dict"], strict=False)
deberta_model.to(DEVICE).eval()

# Run CPU pruning sweep (10–50%)
pruning_results_deberta_pt = pruning_tradeoff_sweep_cpu(
    deberta_model, test_loader, amounts=[0.1, 0.2, 0.3, 0.4, 0.5]
)
# Last expression: display the list of per-amount result dicts
pruning_results_deberta_pt


# Knowledge Distillation

In [None]:
# set device
# Re-select the device after the CPU-only pruning cells rebound DEVICE to CPU.
# Here DEVICE is a torch.device object (the first cell binds a plain string).
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

✅ Knowledge distillation function loaded successfully!


In [None]:
# Extract all checkpoint files
# Each archive is unpacked into the current working directory by a `tar` shell
# command (-x extract, -v verbose listing, -z gzip, -f archive file).
# NOTE(review): the archive paths are under /content/drive, while the loading
# cells above use /storage/... and relative paths — confirm which environment
# this cell targets.
print("Extracting RoBERTa Trial 7 HuggingFace checkpoint...")
!tar -xvzf /content/drive/MyDrive/Checkpoints/roberta_trial_7_checkpoint_HF.tgz

print("Extracting DeBERTa Trial 4 HuggingFace checkpoint...")
!tar -xvzf /content/drive/MyDrive/Checkpoints/deberta_trial_4_checkpoint-HF.tgz

# NOTE(review): printed unconditionally — the exit status of the tar commands
# is not checked, so this message also appears when an extraction failed.
print("All checkpoints extracted successfully!")

In [None]:

# Metric objects used by compute_metrics (each is called via `.compute(...)`).
# NOTE(review): `load` is not imported in any visible cell; presumably it is
# `evaluate.load` (the `evaluate` package is installed by the first cell) —
# TODO confirm and add the import to the imports cell.
acc_metric = load("accuracy")
prec_metric = load("precision")
rec_metric = load("recall")
f1_metric = load("f1")


In [None]:
# Function to calculate size in MB
def get_model_size_mb(model):
    """Return the total size of the model's parameters in MB (bytes / 1024**2).

    Each parameter contributes ``numel() * element_size()`` bytes, so float32
    models give the same value as ``num_params * 4`` while models stored in
    other dtypes (e.g. float16) are not over-reported.

    Args:
        model: Object exposing ``parameters()`` that yields tensors.

    Returns:
        Parameter storage size as a float number of MB.
    """
    total_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    return total_bytes / (1024 ** 2)

def print_model_stats(name, model):
    """Print the parameter count and parameter memory (MB) of ``model``.

    The size is computed from each parameter's actual ``element_size`` so that
    half-precision or quantized models are not reported as if they were float32.

    Args:
        name: Label printed in front of the statistics.
        model: Any object exposing ``parameters()`` that yields torch tensors.
    """
    # Materialize once: parameters() is a generator and is needed for two sums.
    params = list(model.parameters())
    num_params = sum(p.numel() for p in params)
    size_mb = sum(p.numel() * p.element_size() for p in params) / (1024 ** 2)
    print(f"{name} model → Parameters: {num_params:,} | Size: {size_mb:.2f} MB")

In [None]:
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` evaluation pair into a dict of scores.

    Returns a dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``;
    the last three are computed with ``average="weighted"``. Relies on the
    module-level ``acc_metric``, ``prec_metric``, ``rec_metric`` and ``f1_metric``.
    """
    raw_logits, references = eval_pred
    # When the predictions arrive as a tuple, its first element is used as the logits.
    raw_logits = raw_logits[0] if isinstance(raw_logits, tuple) else raw_logits
    predicted = np.argmax(raw_logits, axis=-1)

    scores = {
        "accuracy": acc_metric.compute(predictions=predicted, references=references)["accuracy"],
    }
    weighted_metrics = (
        ("precision", prec_metric),
        ("recall", rec_metric),
        ("f1", f1_metric),
    )
    for key, metric in weighted_metrics:
        result = metric.compute(predictions=predicted, references=references, average="weighted")
        scores[key] = result[key]
    return scores


In [None]:
class DistillationTrainer(Trainer):
    """Trainer whose loss mixes hard-label cross-entropy with a teacher-matching KL term.

    loss = alpha * CE(student_logits, labels)
           + (1 - alpha) * T**2 * KL(softmax(teacher_logits / T) || softmax(student_logits / T))

    Args:
        teacher_model: Model that supplies the soft targets. It is called with the
            same batch as the student (minus ``labels``), so it must be able to
            consume the student's tokenization. When ``None``, the loss is plain
            cross-entropy.
        temperature: Softening temperature ``T`` applied to both sets of logits.
        alpha: Weight of the cross-entropy term; the KL term gets ``1 - alpha``.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, teacher_model=None, temperature=2.0, alpha=0.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        self.temperature = temperature
        self.alpha = alpha

        if self.teacher is not None:
            # The teacher only provides targets: keep it in eval mode so dropout
            # does not add noise to the soft labels.
            self.teacher.eval()

            # Ensure teacher and student models are on the same device
            if getattr(self, "model", None) is not None:
                device = next(self.model.parameters()).device
                self.teacher = self.teacher.to(device)
                print(f"Moved teacher model to device: {device}")

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the distillation loss for one batch.

        Args:
            model: The student model being trained.
            inputs: Batch mapping; must contain a ``labels`` tensor. Non-tensor
                entries are dropped before the forward passes.
            return_outputs: When true, return ``(loss, student_outputs)``.
            num_items_in_batch: Accepted for compatibility with the ``Trainer``
                signature; not used.

        Returns:
            The scalar loss, or ``(loss, student_outputs)`` if ``return_outputs``.
        """
        device = next(model.parameters()).device

        # Move inputs to the same device as the model (tensor entries only)
        inputs = {k: v.to(device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}

        outputs_student = model(**inputs)
        loss_ce = F.cross_entropy(outputs_student.logits, inputs["labels"])

        if self.teacher is None:
            # No teacher configured (the constructor default): plain supervised loss.
            loss = loss_ce
        else:
            # Ensure teacher model is on the same device as student model
            if self.teacher.device != device:
                self.teacher = self.teacher.to(device)

            # Only the teacher's logits are needed, so do not make it compute a loss.
            teacher_inputs = {k: v for k, v in inputs.items() if k != "labels"}
            with torch.no_grad():
                outputs_teacher = self.teacher(**teacher_inputs)

            temperature = self.temperature
            loss_kl = F.kl_div(
                F.log_softmax(outputs_student.logits / temperature, dim=-1),
                F.softmax(outputs_teacher.logits / temperature, dim=-1),
                reduction="batchmean",
            )
            # Soft-target gradients shrink by 1/T**2; rescale so that `alpha` keeps
            # the same meaning whatever temperature is used.
            loss_kl = loss_kl * (temperature ** 2)
            loss = self.alpha * loss_ce + (1 - self.alpha) * loss_kl

        return (loss, outputs_student) if return_outputs else loss

print("DistillationTrainer class defined successfully!")

In [None]:
# === Starting RoBERTa PyTorch -> DistilRoBERTa Distillation ===
print("=== Starting RoBERTa PyTorch -> DistilRoBERTa Distillation ===")

# Determine the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your CSV files into pandas DataFrames
# (train_df / test_df loaded here are reused by the later distillation cells)
train_df = pd.read_csv("data/train_df.csv")
test_df = pd.read_csv("data/test_df.csv")

# Ensure label column is int
train_df["label"] = train_df["label"].astype(int)
test_df["label"] = test_df["label"].astype(int)

# Convert to Hugging Face Datasets
# NOTE(review): `Dataset.from_pandas` needs the Hugging Face `datasets.Dataset`;
# confirm that is what `Dataset` is bound to when this cell runs.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load tokenizer for the student
roberta_pt_tokenizer_student = AutoTokenizer.from_pretrained("distilroberta-base")

# Tokenization function
def tokenize_roberta_pt(batch):
    """Tokenize the ``CleanTweet`` column, padding/truncating to 256 tokens."""
    return roberta_pt_tokenizer_student(batch["CleanTweet"], padding="max_length", truncation=True, max_length=256)

# Tokenize train/test datasets
tokenized_train = train_dataset.map(tokenize_roberta_pt, batched=True)
tokenized_test = test_dataset.map(tokenize_roberta_pt, batched=True)

# Set format for PyTorch
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Load student model
roberta_pt_student = AutoModelForSequenceClassification.from_pretrained("distilroberta-base", num_labels=5).to(device)

# Move teacher to device (assumes you've already loaded it as `roberta_pt_teacher`)
roberta_pt_teacher = roberta_pt_teacher.to(device)

print(f"Teacher model device: {next(roberta_pt_teacher.parameters()).device}")
print(f"Student model device: {next(roberta_pt_student.parameters()).device}")

# Training arguments
# `eval_strategy` is the current name of the deprecated `evaluation_strategy`
# and matches the spelling used by the other distillation cells.
args_roberta_pt = TrainingArguments(
    output_dir="./results_roberta_pt_distill",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    logging_dir="./logs_roberta_pt",
    logging_steps=10,
    save_strategy="no"
)

# Initialize the distillation trainer
trainer_roberta_pt_distill = DistillationTrainer(
    model=roberta_pt_student,
    teacher_model=roberta_pt_teacher,
    args=args_roberta_pt,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics
)

# Train the student model
trainer_roberta_pt_distill.train()

# Print model sizes
teacher_params = sum(p.numel() for p in roberta_pt_teacher.parameters())
student_params = sum(p.numel() for p in roberta_pt_student.parameters())
print("\nRoBERTa PyTorch distillation complete!")
print(f"Teacher model parameters: {teacher_params:,} ({get_model_size_mb(roberta_pt_teacher):.2f} MB)")
print(f"Student model parameters: {student_params:,} ({get_model_size_mb(roberta_pt_student):.2f} MB)")


In [None]:
# Distillation 2: DeBERTa PyTorch Teacher -> MiniLM Student
print("=== Starting DeBERTa PyTorch -> MiniLM Distillation ===")
# NOTE(review): `project` is not referenced again in this cell. If it is meant to be
# the W&B project name it still has to be handed to wandb — confirm.
project = "deberta-pt-distillation-minilm"

# Student model for DeBERTa distillation
minilm_ckpt = "nreimers/MiniLM-L6-H384-uncased"
deberta_pt_student = AutoModelForSequenceClassification.from_pretrained(minilm_ckpt, num_labels=5)

# Move both models to the same device
# (assumes `deberta_pt_teacher` and `device` were created by earlier cells)
deberta_pt_teacher = deberta_pt_teacher.to(device)
deberta_pt_student = deberta_pt_student.to(device)

# Load tokenizer for the student
deberta_pt_tokenizer_student = AutoTokenizer.from_pretrained(minilm_ckpt)

# NOTE(review): DistillationTrainer.compute_loss feeds this same batch to the teacher,
# so the teacher receives MiniLM token ids. If the teacher was trained with a different
# tokenizer/vocabulary, its logits will not reflect the tweet text — confirm.

# Tokenize train and test DataFrames.
# max_length is explicit (256, as in the other distillation cells): without it,
# padding="max_length" falls back to the tokenizer's own default maximum.
tokenized_train = deberta_pt_tokenizer_student(
    list(train_df["CleanTweet"]),
    padding="max_length",
    truncation=True,
    max_length=256,
    return_tensors="pt"
)
tokenized_test = deberta_pt_tokenizer_student(
    list(test_df["CleanTweet"]),
    padding="max_length",
    truncation=True,
    max_length=256,
    return_tensors="pt"
)

# Create Hugging Face Dataset objects and add labels
train_dataset = Dataset.from_dict({
    "input_ids": tokenized_train["input_ids"],
    "attention_mask": tokenized_train["attention_mask"],
    "labels": list(train_df["label"])
})
test_dataset = Dataset.from_dict({
    "input_ids": tokenized_test["input_ids"],
    "attention_mask": tokenized_test["attention_mask"],
    "labels": list(test_df["label"])
})

# Training arguments
# `eval_strategy` is the current name of the deprecated `evaluation_strategy`
# and matches the spelling used by the later distillation cells.
args_deberta_pt = TrainingArguments(
    output_dir="./results_deberta_pt_distill_minilm",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    logging_dir="./logs_deberta_pt_minilm",
    logging_steps=10,
    save_strategy="no",
    report_to="wandb",
    run_name="deberta-distill-pt-minilm"
)

# Initialize the distillation trainer
trainer_deberta_pt_distill = DistillationTrainer(
    model=deberta_pt_student,
    teacher_model=deberta_pt_teacher,
    args=args_deberta_pt,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the student model
trainer_deberta_pt_distill.train()

# Print summary of model sizes.
# print_model_stats comes from the model-size helper cell earlier in the notebook;
# it is deliberately not redefined here so there is a single definition.
print("\nDeBERTa PyTorch -> MiniLM distillation complete!")
print_model_stats("Teacher (DeBERTa PT)", deberta_pt_teacher)
print_model_stats("Student (MiniLM)", deberta_pt_student)


🔥 Distilling from DeBERTa HF (local checkpoint)

🎓 Starting knowledge distillation with teacher: DeBERTa_HF


Map: 100%|██████████| 28800/28800 [00:03<00:00, 8326.08 examples/s]
Map: 100%|██████████| 12343/12343 [00:01<00:00, 9105.13 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Training student model with DeBERTa_HF teacher...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.0938,0.93213,0.333873,0.304301,0.805102,0.333873
2,0.905,0.89795,0.340679,0.335706,0.787899,0.340679
3,0.8492,0.874573,0.475006,0.503715,0.813244,0.475006
4,0.8157,0.867868,0.518756,0.556934,0.809945,0.518756
5,0.7993,0.868927,0.513084,0.548842,0.808302,0.513084


Evaluating distilled model...


Results for DeBERTa_HF:
  eval_loss: 0.8689
  eval_accuracy: 0.5131
  eval_f1: 0.5488
  eval_precision: 0.8083
  eval_recall: 0.5131
  eval_runtime: 102.7847
  eval_samples_per_second: 120.0860
  eval_steps_per_second: 3.7550
💾 Results saved:
  - JSON: ./distillation_results/DeBERTa_HF_results.json
  - CSV: ./distillation_results/DeBERTa_HF_summary.csv
  - Text: ./distillation_results/DeBERTa_HF_metrics.txt
✅ Completed distillation with DeBERTa HF model


In [None]:
# Distillation 3: RoBERTa HuggingFace Teacher -> DistilRoBERTa Student
print("=== Starting RoBERTa HuggingFace -> DistilRoBERTa Distillation ===")

# Student model for RoBERTa HF distillation
roberta_hf_student = AutoModelForSequenceClassification.from_pretrained("distilroberta-base", num_labels=5)

# Move both models to the same device
# (assumes `roberta_hf_teacher` and `device` were created by earlier cells)
roberta_hf_teacher = roberta_hf_teacher.to(device)
roberta_hf_student = roberta_hf_student.to(device)

# Tokenize dataset with DistilRoBERTa tokenizer
roberta_hf_tokenizer_student = AutoTokenizer.from_pretrained("distilroberta-base")
def tokenize_roberta_hf(batch):
    """Tokenize the ``CleanTweet`` column, padding/truncating to 256 tokens.

    max_length is explicit so this run uses the same sequence length as the
    RoBERTa PyTorch distillation cell; without it, padding="max_length" falls
    back to the tokenizer's own default maximum.
    """
    return roberta_hf_tokenizer_student(batch["CleanTweet"], padding="max_length", truncation=True, max_length=256)

# train_df / test_df are the DataFrames loaded by the first distillation cell
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

tokenized_train = train_dataset.map(tokenize_roberta_hf, batched=True)
tokenized_test = test_dataset.map(tokenize_roberta_hf, batched=True)

tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])
# Training arguments for RoBERTa HF distillation
args_roberta_hf = TrainingArguments(
    output_dir="./results_roberta_hf_distill",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    logging_dir="./logs_roberta_hf",
    logging_steps=10,
    save_strategy="no"
)

# Create distillation trainer for RoBERTa HF
trainer_roberta_hf_distill = DistillationTrainer(
    model=roberta_hf_student,
    teacher_model=roberta_hf_teacher,
    args=args_roberta_hf,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics
)


# Train the student model
trainer_roberta_hf_distill.train()
print("\nRoBERTa HuggingFace distillation complete!")
print_model_stats("Teacher (RoBERTa HF)", roberta_hf_teacher)
print_model_stats("Student (DistilRoBERTa)", roberta_hf_student)

🔥 Distilling from: RoBERTa HF


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo


🎓 Starting knowledge distillation with teacher: RoBERTa_PT


Map: 100%|██████████| 28800/28800 [00:03<00:00, 8114.51 examples/s]
Map: 100%|██████████| 12343/12343 [00:01<00:00, 7195.86 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: Accelerator.__init__() got an unexpected keyword argument 'dispatch_batches'

In [None]:
# Distillation 4: DeBERTa HuggingFace Teacher -> MiniLM Student
print("=== Starting DeBERTa HuggingFace -> MiniLM Distillation ===")

# Student model: MiniLM
minilm_ckpt = "nreimers/MiniLM-L6-H384-uncased"
minilm_student = AutoModelForSequenceClassification.from_pretrained(minilm_ckpt, num_labels=5)

# Move both models to the same device
# (assumes `deberta_hf_teacher` and `device` were created by earlier cells)
deberta_hf_teacher = deberta_hf_teacher.to(device)
minilm_student = minilm_student.to(device)

# Tokenizer for MiniLM
minilm_tokenizer = AutoTokenizer.from_pretrained(minilm_ckpt)

# The comparison/saving cell refers to this student and its tokenizer as
# `deberta_hf_student` / `deberta_hf_tokenizer_student` (the naming used by the
# other three distillations). Bind those names too so that cell does not fail
# with a NameError; the `minilm_*` names stay available.
deberta_hf_student = minilm_student
deberta_hf_tokenizer_student = minilm_tokenizer

# NOTE(review): DistillationTrainer.compute_loss feeds the same batch to the teacher,
# so the teacher receives MiniLM token ids. If the teacher was trained with a different
# tokenizer/vocabulary, its logits will not reflect the tweet text — confirm.

def tokenize_minilm(batch):
    """Tokenize the ``CleanTweet`` column, padding/truncating to 256 tokens."""
    return minilm_tokenizer(
        batch["CleanTweet"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

# Convert pandas to HF datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset  = Dataset.from_pandas(test_df)

# Tokenize
tokenized_train = train_dataset.map(tokenize_minilm, batched=True)
tokenized_test  = test_dataset.map(tokenize_minilm, batched=True)

# Format for PyTorch
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Training arguments
args_deberta_hf = TrainingArguments(
    output_dir="./results_deberta_hf_distill_minilm",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    logging_dir="./logs_deberta_hf_minilm",
    logging_steps=10,
    save_strategy="no"
)

# Distillation trainer
trainer_deberta_hf_distill = DistillationTrainer(
    model=minilm_student,
    teacher_model=deberta_hf_teacher,
    args=args_deberta_hf,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics
)

# Train
trainer_deberta_hf_distill.train()
print("\nDeBERTa HuggingFace -> MiniLM distillation complete!")
print_model_stats("Teacher (DeBERTa HF)", deberta_hf_teacher)
print_model_stats("Student (MiniLM)", minilm_student)


In [None]:
# Model Comparison and Summary for All 4 Distillations
print("=== Comprehensive Model Size Comparison ===")


def _count_params(net):
    """Total number of elements across all parameters of ``net``."""
    return sum(tensor.numel() for tensor in net.parameters())


# Parameter counts for every teacher/student pair, in distillation order.
roberta_pt_teacher_size = _count_params(roberta_pt_teacher)   # 1. RoBERTa PyTorch
roberta_pt_student_size = _count_params(roberta_pt_student)
deberta_pt_teacher_size = _count_params(deberta_pt_teacher)   # 2. DeBERTa PyTorch
deberta_pt_student_size = _count_params(deberta_pt_student)
roberta_hf_teacher_size = _count_params(roberta_hf_teacher)   # 3. RoBERTa HuggingFace
roberta_hf_student_size = _count_params(roberta_hf_student)
deberta_hf_teacher_size = _count_params(deberta_hf_teacher)   # 4. DeBERTa HuggingFace
deberta_hf_student_size = _count_params(deberta_hf_student)

# One row per distillation: (heading, teacher parameter count, student parameter count).
_size_rows = [
    ("1. RoBERTa PyTorch Teacher -> DistilRoBERTa Student", roberta_pt_teacher_size, roberta_pt_student_size),
    ("\n2. DeBERTa PyTorch Teacher -> MiniLM  Student", deberta_pt_teacher_size, deberta_pt_student_size),
    ("\n3. RoBERTa HuggingFace Teacher -> DistilRoBERTa Student", roberta_hf_teacher_size, roberta_hf_student_size),
    ("\n4. DeBERTa HuggingFace Teacher -> MiniLM  Student", deberta_hf_teacher_size, deberta_hf_student_size),
]

_rule = "=" * 60
print(_rule)
for _heading, _n_teacher, _n_student in _size_rows:
    print(_heading)
    print(f"   Teacher size: {_n_teacher:,} parameters")
    print(f"   Student size: {_n_student:,} parameters")
    print(f"   Compression ratio: {_n_teacher/_n_student:.2f}x")
print(_rule)

# Save all distilled models
print("\n=== Saving All Distilled Models ===")

# Each student is written together with its tokenizer into one directory.
_exports = [
    (roberta_pt_student, roberta_pt_tokenizer_student, "./distilled_roberta_pt_student"),
    (deberta_pt_student, deberta_pt_tokenizer_student, "./distilled_deberta_pt_student"),
    (roberta_hf_student, roberta_hf_tokenizer_student, "./distilled_roberta_hf_student"),
    (deberta_hf_student, deberta_hf_tokenizer_student, "./distilled_deberta_hf_student"),
]
for _student, _tokenizer, _out_dir in _exports:
    _student.save_pretrained(_out_dir)
    _tokenizer.save_pretrained(_out_dir)

print("All 4 distilled models saved successfully!")

# Summary statistics
print("\n=== Summary Statistics ===")
total_teacher_params = sum(
    (roberta_pt_teacher_size, deberta_pt_teacher_size, roberta_hf_teacher_size, deberta_hf_teacher_size)
)
total_student_params = sum(
    (roberta_pt_student_size, deberta_pt_student_size, roberta_hf_student_size, deberta_hf_student_size)
)
overall_compression = total_teacher_params / total_student_params

print(f"Total teacher parameters: {total_teacher_params:,}")
print(f"Total student parameters: {total_student_params:,}")
print(f"Overall compression ratio: {overall_compression:.2f}x")

In [None]:
# End-of-notebook marker: prints "done" when execution reaches this cell.
print("done")