In [4]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

In [6]:
class ModelEvaluator:
    """Evaluate a saved sequence-classification model on a dataset's test split.

    The evaluator loads the model/tokenizer from ``model_path`` and the dataset
    named ``dataset_name``, runs a single inference pass over the ``"test"``
    split, and reports overall (weighted) and per-class metrics.

    Attributes:
        model_path (str): Directory the model and tokenizer were loaded from.
        dataset_name (str): Name passed to ``load_dataset``.
        tokenizer: Tokenizer loaded from ``model_path``.
        model: Sequence-classification model loaded from ``model_path``.
        dataset: Object returned by ``load_dataset(dataset_name)``.
        class_names (list[str]): Display name for each integer label id.
        confusion_matrix_: Confusion matrix from the most recent
            ``run_full_evaluation`` call, or ``None`` before the first call.
    """

    # Fallback label names (AG News) used when the dataset does not expose
    # its own label names.
    DEFAULT_CLASS_NAMES = ("World", "Sports", "Business", "Sci/Tech")

    def __init__(self, model_path, dataset_name="ag_news"):
        """
        Initialize the evaluator with model path and dataset

        Args:
            model_path (str): Path to your saved model directory
            dataset_name (str): Dataset name (default: "ag_news")
        """
        self.model_path = model_path
        self.dataset_name = dataset_name

        # Load model and tokenizer
        print("Loading model and tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)

        # Load dataset
        print("Loading dataset...")
        self.dataset = load_dataset(dataset_name)

        # Prefer the label names the dataset declares so the per-class report
        # stays correct when ``dataset_name`` is not AG News.
        self.class_names = self._resolve_class_names()

        # Filled in by run_full_evaluation().
        self.confusion_matrix_ = None

    def _resolve_class_names(self):
        """Return label names from the test split, or the AG News defaults.

        Reads ``self.dataset["test"].features["label"].names``. If any part of
        that lookup is missing, or the names are empty, the hard-coded
        ``DEFAULT_CLASS_NAMES`` are returned instead.

        Returns:
            list[str]: One display name per integer label id.
        """
        try:
            names = self.dataset["test"].features["label"].names
        except (KeyError, AttributeError, TypeError):
            return list(self.DEFAULT_CLASS_NAMES)
        return list(names) if names else list(self.DEFAULT_CLASS_NAMES)

    def preprocess_data(self, examples):
        """Tokenize the ``"text"`` field of a batch of examples.

        Args:
            examples: Mapping with a ``"text"`` entry (a batch when used with
                ``Dataset.map(..., batched=True)``).

        Returns:
            Whatever the tokenizer returns for the given text, called with
            truncation, padding and ``max_length=128``.
        """
        return self.tokenizer(
            examples["text"],
            truncation=True,
            padding=True,
            max_length=128
        )

    def compute_metrics(self, eval_pred):
        """Compute accuracy and weighted precision/recall/F1.

        Args:
            eval_pred: Pair ``(predictions, labels)`` where ``predictions``
                holds per-class scores and ``labels`` the true label ids.

        Returns:
            dict: Keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        """
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        # zero_division=0 yields the same values as the default but without
        # emitting a warning when a class is never predicted.
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predictions, average='weighted', zero_division=0
        )
        accuracy = accuracy_score(labels, predictions)

        return {
            'accuracy': accuracy,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

    def evaluate_model(self):
        """Run the model over the test split once and collect metrics.

        Returns:
            tuple: ``(eval_results, y_pred, y_true)`` where ``eval_results`` is
            the metrics dict (keys prefixed with ``eval_``), ``y_pred`` the
            arg-max predicted label ids and ``y_true`` the reference label ids.
        """
        # Prepare test dataset
        test_dataset = self.dataset["test"].map(
            self.preprocess_data,
            batched=True
        )

        # Set up trainer for evaluation
        training_args = TrainingArguments(
            output_dir="./eval_output",
            per_device_eval_batch_size=32,
            dataloader_drop_last=False,
        )

        # Pass the padding collator explicitly instead of the deprecated
        # ``tokenizer=`` argument of Trainer.
        trainer = Trainer(
            model=self.model,
            args=training_args,
            data_collator=DataCollatorWithPadding(self.tokenizer),
            compute_metrics=self.compute_metrics,
        )

        # A single predict() call returns both the logits and the metrics;
        # metric_key_prefix="eval" keeps the ``eval_*`` key names callers read.
        print("Running evaluation...")
        prediction_output = trainer.predict(test_dataset, metric_key_prefix="eval")

        eval_results = prediction_output.metrics
        y_pred = np.argmax(prediction_output.predictions, axis=1)
        y_true = prediction_output.label_ids

        return eval_results, y_pred, y_true

    def detailed_analysis(self, y_true, y_pred):
        """Build the confusion matrix and a per-class metrics table.

        Args:
            y_true: Reference label ids.
            y_pred: Predicted label ids.

        Returns:
            tuple: ``(cm, results_df)`` where ``cm`` is the confusion matrix and
            ``results_df`` has one row per entry of ``self.class_names`` with
            columns Class, Precision, Recall, F1-Score and Support.
        """
        # Fix the label set to the known classes so every array has exactly
        # len(class_names) entries even if a class is absent from the data.
        label_ids = list(range(len(self.class_names)))

        # Confusion Matrix
        cm = confusion_matrix(y_true, y_pred, labels=label_ids)

        # Per-class metrics
        precision, recall, f1, support = precision_recall_fscore_support(
            y_true, y_pred, labels=label_ids, average=None, zero_division=0
        )

        # Create results dataframe
        results_df = pd.DataFrame({
            'Class': self.class_names,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1,
            'Support': support
        })

        return cm, results_df

    def run_full_evaluation(self):
        """Run the complete evaluation pipeline and print a report.

        Side effects:
            Prints overall and per-class metrics and stores the confusion
            matrix on ``self.confusion_matrix_``.

        Returns:
            tuple: ``(eval_results, results_df)``: the overall metrics dict and
            the per-class metrics DataFrame.
        """
        print("Starting Model Evaluation...")
        print("=" * 50)

        # Main evaluation
        eval_results, y_pred, y_true = self.evaluate_model()

        # Print main metrics
        print("\n=== Overall Performance ===")
        print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
        print(f"F1-Score: {eval_results['eval_f1']:.4f}")
        print(f"Precision: {eval_results['eval_precision']:.4f}")
        print(f"Recall: {eval_results['eval_recall']:.4f}")

        # Detailed analysis; keep the confusion matrix instead of discarding it.
        self.confusion_matrix_, results_df = self.detailed_analysis(y_true, y_pred)

        print("\n=== Per-Class Performance ===")
        print(results_df.round(4))

        return eval_results, results_df

# Usage Example
if __name__ == "__main__":
    # Model directory: taken from the MODEL_PATH environment variable when it
    # is set, so the notebook can run on another machine without editing code;
    # otherwise the original local default is used.
    MODEL_PATH = os.environ.get("MODEL_PATH", "/Users/arsalsyed/distilbert-ag-news")

    # Initialize evaluator (loads the model, tokenizer and dataset)
    evaluator = ModelEvaluator(MODEL_PATH)

    # Run full evaluation: overall metrics dict and per-class metrics table
    results, per_class_results = evaluator.run_full_evaluation()

Loading model and tokenizer...
Loading dataset...
Starting Model Evaluation...


Map: 100%|██████████████████████████████| 7600/7600 [00:00<00:00, 25793.22 examples/s]
  trainer = Trainer(


Running evaluation...







=== Overall Performance ===
Accuracy: 0.9447
F1-Score: 0.9448
Precision: 0.9449
Recall: 0.9447

=== Per-Class Performance ===
      Class  Precision  Recall  F1-Score  Support
0     World     0.9617  0.9516    0.9566     1900
1    Sports     0.9874  0.9863    0.9868     1900
2  Business     0.9212  0.9111    0.9161     1900
3  Sci/Tech     0.9094  0.9300    0.9196     1900
