# LLM Evaluation and Visualization

This notebook provides comprehensive tools for evaluating and visualizing LLM performance.

## Topics Covered:
- Model evaluation metrics
- Performance visualization
- Model comparison
- Advanced evaluation techniques
- Report generation

In [None]:
# Import required libraries
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List, Dict, Any, Optional
import time
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style for every figure produced later in the notebook.
# NOTE(review): the seaborn palette is set after the matplotlib style sheet,
# presumably so the 'husl' colour cycle is not overwritten by the style —
# confirm before reordering these two calls.
# NOTE(review): the 'seaborn-v0_8' style name looks version-specific — verify
# it exists in the installed matplotlib.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Confirmation message so the cell shows visible output when it succeeds.
print('Libraries imported successfully!')

## 1. Evaluation Framework Setup

Create a comprehensive evaluation framework for LLMs.

In [None]:
class LLMEvaluator:
    """Evaluate a Hugging Face causal language model.

    Wraps model/tokenizer loading and three measurements: perplexity on raw
    text, sampled text generation, and generation throughput.

    Attributes:
        model_name: Identifier passed to ``from_pretrained``.
        device: Device string that inputs are moved to ('cuda', 'cpu', ...).
        tokenizer: Loaded tokenizer, or None before ``load_model()``.
        model: Loaded model, or None before ``load_model()``.
        evaluation_results: Free-form dict reserved for caller-stored results.
    """

    def __init__(self, model_name: str, device: str = 'auto'):
        self.model_name = model_name
        self.device = self._get_device(device)
        self.tokenizer = None
        self.model = None
        self.evaluation_results = {}

    def _get_device(self, device: str) -> str:
        """Resolve 'auto' to 'cuda' when available, else 'cpu'; pass others through."""
        if device == 'auto':
            return 'cuda' if torch.cuda.is_available() else 'cpu'
        return device

    def _require_loaded(self) -> None:
        """Raise ValueError unless both model and tokenizer have been loaded."""
        if self.model is None or self.tokenizer is None:
            raise ValueError('Model not loaded. Call load_model() first.')

    def _encode(self, text: str, **tokenizer_kwargs) -> Dict[str, Any]:
        """Tokenize one text into PyTorch tensors placed on ``self.device``."""
        encoded = self.tokenizer(text, return_tensors='pt', **tokenizer_kwargs)
        return {name: tensor.to(self.device) for name, tensor in encoded.items()}

    def load_model(self):
        """Load model and tokenizer"""
        print(f'Loading model: {self.model_name}')

        on_cuda = self.device.startswith('cuda')
        # Only the generic 'cuda' device delegates placement to device_map.
        use_device_map = self.device == 'cuda'

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16 if on_cuda else torch.float32,
            device_map='auto' if use_device_map else None
        )

        if not use_device_map:
            # Without device_map the weights stay on CPU; move them to the
            # device the inputs will be sent to (a no-op for 'cpu').
            self.model.to(self.device)

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print(f'Model loaded on {self.device}')

    def calculate_perplexity(self, texts: List[str]) -> float:
        """Calculate corpus-level perplexity on a list of texts.

        Each text is truncated to 512 tokens. The per-text loss returned by the
        model is a mean over the predicted positions (sequence length minus
        one, because labels are shifted), so it is weighted by that count.

        Args:
            texts: Raw strings to score.

        Returns:
            exp(mean negative log-likelihood per predicted token).

        Raises:
            ValueError: If the model is not loaded, or if no text contains at
                least two tokens (nothing to predict).
        """
        self._require_loaded()

        total_nll = 0.0
        total_predicted = 0

        self.model.eval()
        with torch.no_grad():
            for text in texts:
                inputs = self._encode(text, truncation=True, max_length=512)
                predicted = inputs['input_ids'].size(1) - 1
                if predicted < 1:
                    # A one-token text has no target and would yield a NaN loss.
                    continue

                outputs = self.model(**inputs, labels=inputs['input_ids'])
                total_nll += outputs.loss.item() * predicted
                total_predicted += predicted

        if total_predicted == 0:
            raise ValueError(
                'Perplexity is undefined: no text with at least two tokens was provided.'
            )

        avg_loss = total_nll / total_predicted
        return torch.exp(torch.tensor(avg_loss)).item()

    def generate_text(self, prompts: List[str], max_length: int = 100,
                     num_return_sequences: int = 1) -> List[str]:
        """Generate sampled continuations for each prompt.

        Args:
            prompts: Prompt strings.
            max_length: Maximum total length (prompt + continuation) in tokens.
            num_return_sequences: Samples drawn per prompt.

        Returns:
            Continuations only (prompt removed), ordered prompt by prompt, with
            ``num_return_sequences`` consecutive entries per prompt.

        Raises:
            ValueError: If the model is not loaded.
        """
        self._require_loaded()

        generations = []

        self.model.eval()
        for prompt in prompts:
            inputs = self._encode(prompt, truncation=True)
            prompt_tokens = inputs['input_ids'].size(1)

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_length=max_length,
                    num_return_sequences=num_return_sequences,
                    do_sample=True,
                    temperature=0.7,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            for output in outputs:
                # Drop the prompt by token position rather than by character
                # count: decoding does not always reproduce the prompt text
                # exactly, which made the character slice cut in the wrong place.
                continuation = self.tokenizer.decode(
                    output[prompt_tokens:], skip_special_tokens=True
                )
                generations.append(continuation.strip())

        return generations

    def measure_inference_speed(self, prompts: List[str],
                                max_length: int = 100) -> Dict[str, float]:
        """Measure generation latency and throughput.

        Args:
            prompts: Prompt strings; one sampled generation is timed per prompt.
            max_length: Maximum total length (prompt + continuation) in tokens.

        Returns:
            Dict with 'avg_time_per_sample' (seconds), 'avg_tokens_per_second',
            'total_time' (seconds) and 'total_tokens'. Token figures count only
            newly generated tokens, not the prompt.

        Raises:
            ValueError: If the model is not loaded.
        """
        self._require_loaded()

        times = []
        token_counts = []
        sync_cuda = self.device.startswith('cuda')

        self.model.eval()
        for prompt in prompts:
            inputs = self._encode(prompt, truncation=True)
            prompt_tokens = inputs['input_ids'].size(1)

            if sync_cuda:
                # Make sure pending GPU work is not billed to this sample.
                torch.cuda.synchronize()
            start_time = time.perf_counter()

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_length=max_length,
                    do_sample=True,
                    temperature=0.7,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            if sync_cuda:
                torch.cuda.synchronize()
            elapsed = time.perf_counter() - start_time

            times.append(elapsed)
            # Throughput is about produced tokens; exclude the prompt.
            token_counts.append(outputs[0].size(0) - prompt_tokens)

        return {
            'avg_time_per_sample': float(np.mean(times)),
            'avg_tokens_per_second': float(np.mean(token_counts) / np.mean(times)),
            'total_time': sum(times),
            'total_tokens': sum(token_counts)
        }

print('LLMEvaluator class defined successfully!')

## 2. Sample Data and Model Setup

Set up sample data and load a model for evaluation.

In [None]:
# Short declarative sentences used as the perplexity corpus.
sample_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Machine learning is transforming how we process data.",
    "Natural language processing enables computers to understand text.",
    "Deep learning models require large amounts of training data.",
    "Artificial intelligence will shape the future of technology.",
]

# Open-ended prompts used for generation and speed measurements.
sample_prompts = [
    "Explain machine learning in simple terms:",
    "What is the future of AI?",
    "How does natural language processing work?",
    "Describe the benefits of deep learning:",
]

# 'gpt2' is small enough to keep the demonstration quick.
evaluator = LLMEvaluator('gpt2')

# Report what was prepared, one line per message.
print(
    'Sample data prepared!',
    f'Sample texts: {len(sample_texts)}',
    f'Sample prompts: {len(sample_prompts)}',
    sep='\n',
)

In [None]:
# Load the tokenizer and model weights for the evaluator created earlier
# (LLMEvaluator.load_model prints progress messages while it works).
evaluator.load_model()

## 3. Model Evaluation

Perform comprehensive model evaluation.

In [None]:
# --- Perplexity over the sample corpus ---
print('Calculating perplexity...')
perplexity = evaluator.calculate_perplexity(sample_texts)
print(f'Perplexity: {perplexity:.2f}')

# --- Sampled continuations, one per prompt ---
print('\nGenerating text samples...')
generations = evaluator.generate_text(sample_prompts, max_length=80)

for number, (prompt, generation) in enumerate(zip(sample_prompts, generations), start=1):
    print(f'\nPrompt {number}: {prompt}')
    print(f'Generation: {generation}')

# --- Throughput, on the first two prompts only to keep the cell fast ---
print('\nMeasuring inference speed...')
speed_metrics = evaluator.measure_inference_speed(sample_prompts[:2])

for name in speed_metrics:
    print(f'{name}: {speed_metrics[name]:.3f}')

## 4. Visualization Dashboard

Create visualizations of model performance with matplotlib and seaborn.

In [None]:
class EvaluationVisualizer:
    """Create matplotlib visualizations for model evaluation results.

    Attributes:
        figsize: Default (width, height) in inches for the overview figure.
    """

    def __init__(self, figsize: tuple = (12, 8)):
        self.figsize = figsize
        # Kept for backward compatibility: also sets the process-wide default
        # figure size, so figures created elsewhere are affected too.
        plt.rcParams['figure.figsize'] = figsize

    def plot_metrics_overview(self, metrics: Dict[str, float],
                              title: str = 'Model Performance Metrics',
                              texts: Optional[List[str]] = None):
        """Plot a 2x2 overview: perplexity, speed, text lengths and a radar.

        Args:
            metrics: May contain 'perplexity', 'avg_tokens_per_second' and
                'avg_time_per_sample'; missing keys are plotted as 0.
            title: Figure title.
            texts: Texts whose whitespace-split word counts fill the histogram.
                When omitted, a module-level ``sample_texts`` is used if one
                exists (the previous, implicit behaviour); otherwise the
                histogram is left empty.
        """
        if texts is None:
            texts = globals().get('sample_texts') or []

        fig, axes = plt.subplots(2, 2, figsize=self.figsize)
        (ax1, ax2), (ax3, cartesian_ax4) = axes
        fig.suptitle(title, fontsize=16, fontweight='bold')

        # Perplexity gauge
        perplexity = metrics.get('perplexity', 0)
        ax1.bar(['Perplexity'], [perplexity], color='skyblue')
        ax1.set_title('Perplexity (Lower is Better)')
        ax1.set_ylabel('Score')

        # Speed metrics
        speed_data = {
            'Tokens/sec': metrics.get('avg_tokens_per_second', 0),
            'Time/sample': metrics.get('avg_time_per_sample', 0)
        }
        ax2.bar(list(speed_data.keys()), list(speed_data.values()),
                color=['lightcoral', 'lightgreen'])
        ax2.set_title('Inference Speed')
        ax2.set_ylabel('Value')

        # Token distribution (whitespace-separated words per text)
        token_counts = [len(text.split()) for text in texts]
        if token_counts:
            ax3.hist(token_counts, bins=10, color='gold', alpha=0.7)
        ax3.set_title('Token Count Distribution')
        ax3.set_xlabel('Token Count')
        ax3.set_ylabel('Frequency')

        # Performance radar (placeholder with fixed example values)
        categories = ['Speed', 'Quality', 'Efficiency', 'Consistency']
        values = [0.8, 0.7, 0.9, 0.6]  # Example values

        angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False)
        values = values + values[:1]  # Complete the circle
        angles = np.concatenate((angles, [angles[0]]))

        # Replace the cartesian axes in the last slot with a polar one instead
        # of stacking a second axes on top of it.
        cartesian_ax4.remove()
        ax4 = fig.add_subplot(2, 2, 4, projection='polar')
        ax4.plot(angles, values, 'o-', linewidth=2, color='purple')
        ax4.fill(angles, values, alpha=0.25, color='purple')
        ax4.set_xticks(angles[:-1])
        ax4.set_xticklabels(categories)
        ax4.set_title('Performance Radar')

        plt.tight_layout()
        plt.show()

    def plot_generation_analysis(self, prompts: List[str], generations: List[str]):
        """Analyze generated text characteristics in three side-by-side plots.

        Args:
            prompts: Prompt strings.
            generations: Generated strings. May hold several consecutive
                generations per prompt (prompt-major order), in which case each
                prompt length is repeated accordingly in the scatter plot.

        Raises:
            ValueError: If the number of generations is not a multiple of the
                number of prompts.
        """
        if len(prompts) == len(generations):
            repeats = 1
        elif prompts and len(generations) % len(prompts) == 0:
            repeats = len(generations) // len(prompts)
        else:
            raise ValueError(
                'Number of generations must be a multiple of the number of prompts.'
            )

        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

        # Generation length distribution
        gen_lengths = [len(gen.split()) for gen in generations]
        if gen_lengths:
            ax1.hist(gen_lengths, bins=10, color='lightblue', alpha=0.7)
        ax1.set_title('Generation Length Distribution')
        ax1.set_xlabel('Word Count')
        ax1.set_ylabel('Frequency')

        # Prompt vs Generation length
        prompt_lengths = [len(prompt.split()) for prompt in prompts
                          for _ in range(repeats)]
        ax2.scatter(prompt_lengths, gen_lengths, color='red', alpha=0.6)
        ax2.set_title('Prompt vs Generation Length')
        ax2.set_xlabel('Prompt Length (words)')
        ax2.set_ylabel('Generation Length (words)')

        # Average word length in generations; an empty generation counts as 0
        # rather than producing NaN from the mean of an empty list.
        avg_word_lengths = []
        for gen in generations:
            words = gen.split()
            avg_word_lengths.append(
                float(np.mean([len(word) for word in words])) if words else 0.0
            )
        ax3.bar(range(len(avg_word_lengths)), avg_word_lengths, color='green', alpha=0.7)
        ax3.set_title('Average Word Length per Generation')
        ax3.set_xlabel('Generation Index')
        ax3.set_ylabel('Average Word Length')

        plt.tight_layout()
        plt.show()

    def plot_training_dynamics(self, epochs: List[int], train_loss: List[float],
                              val_loss: Optional[List[float]] = None):
        """Plot training dynamics (for fine-tuning scenarios).

        Args:
            epochs: X-axis values.
            train_loss: Training loss per epoch.
            val_loss: Optional validation loss per epoch; skipped when None or
                empty. Sequences such as numpy arrays are accepted.
        """
        plt.figure(figsize=(10, 6))

        plt.plot(epochs, train_loss, 'b-', label='Training Loss', linewidth=2)

        # Explicit None/length test: truth-testing a numpy array raises.
        if val_loss is not None and len(val_loss) > 0:
            plt.plot(epochs, val_loss, 'r-', label='Validation Loss', linewidth=2)

        plt.title('Training Dynamics', fontsize=14, fontweight='bold')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

# Build the plotting helper with its default figure size.
visualizer = EvaluationVisualizer()

# Merge perplexity and the speed measurements into a single metrics mapping.
metrics_for_viz = {'perplexity': perplexity}
metrics_for_viz.update(speed_metrics)

# Overview dashboard, then the per-generation analysis.
visualizer.plot_metrics_overview(metrics_for_viz, 'GPT-2 Model Performance')
visualizer.plot_generation_analysis(sample_prompts, generations)

# Synthetic, noisy, decreasing loss curves to demonstrate the training plot.
# The training curve is drawn before the validation curve so the random
# numbers are consumed in the same order as before.
example_epochs = [*range(1, 11)]
example_train_loss = list(
    map(lambda i: 2.5 - 0.2*i + 0.1*np.random.randn(), example_epochs)
)
example_val_loss = list(
    map(lambda i: 2.7 - 0.15*i + 0.15*np.random.randn(), example_epochs)
)

visualizer.plot_training_dynamics(example_epochs, example_train_loss, example_val_loss)

## 5. Model Comparison Framework

Compare multiple models side by side.

In [None]:
class ModelComparator:
    """Compare multiple models across various metrics.

    Attributes:
        models: Unused placeholder dict, kept for backward compatibility.
        results: Maps model name to its dict of metric values.
    """

    # Substrings that mark a metric as "lower is better" when the caller does
    # not say so explicitly.
    _LOWER_IS_BETTER_HINTS = ('perplexity', 'loss', 'time', 'latency', 'error')

    def __init__(self):
        self.models = {}
        self.results = {}

    def add_model_results(self, model_name: str, results: Dict[str, Any]):
        """Add (or replace) the evaluation results for a model."""
        self.results[model_name] = results

    def compare_metrics(self, metrics: List[str]) -> pd.DataFrame:
        """Create a comparison dataframe for the specified metrics.

        Returns:
            DataFrame indexed by metric with one column per model; a metric a
            model does not report is filled with 0.
        """
        comparison_data = {
            model_name: [results.get(metric, 0) for metric in metrics]
            for model_name, results in self.results.items()
        }
        return pd.DataFrame(comparison_data, index=metrics)

    def plot_comparison(self, metrics: List[str], title: str = 'Model Comparison'):
        """Plot a grouped bar chart and a heatmap; return the compared dataframe."""
        df = self.compare_metrics(metrics)

        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        # Bar plot
        df.plot(kind='bar', ax=axes[0], width=0.8)
        axes[0].set_title(f'{title} - Bar Chart')
        axes[0].set_ylabel('Score')
        axes[0].legend(title='Models')
        axes[0].tick_params(axis='x', rotation=45)

        # Heatmap
        sns.heatmap(df, annot=True, fmt='.3f', cmap='RdYlBu_r', ax=axes[1])
        axes[1].set_title(f'{title} - Heatmap')

        plt.tight_layout()
        plt.show()

        return df

    def _is_lower_better(self, metric: str,
                         lower_is_better: Optional[List[str]]) -> bool:
        """Decide a metric's direction from an explicit list or name hints."""
        if lower_is_better is not None:
            return metric in lower_is_better
        name = metric.lower()
        return any(hint in name for hint in self._LOWER_IS_BETTER_HINTS)

    def rank_models(self, metrics: List[str], weights: Optional[List[float]] = None,
                    lower_is_better: Optional[List[str]] = None) -> pd.DataFrame:
        """Rank models by a weighted average of min-max normalised metrics.

        Every metric is scaled to [0, 1] across models with 1 being best.
        Lower-is-better metrics are flipped linearly (1 - scaled), which stays
        correct for zero or negative values. A metric on which all models tie
        contributes 0 to every model.

        Args:
            metrics: Metric names to include.
            weights: One weight per metric; defaults to equal weights.
            lower_is_better: Exact metric names where smaller is better. When
                None, names containing 'perplexity', 'loss', 'time', 'latency'
                or 'error' are treated as lower-is-better.

        Returns:
            DataFrame with columns 'Rank', 'Model', 'Weighted Score', best first.

        Raises:
            ValueError: If ``weights`` and ``metrics`` differ in length, or the
                weights sum to zero.
        """
        if weights is None:
            weights = [1.0] * len(metrics)
        if len(weights) != len(metrics):
            raise ValueError('weights must have the same length as metrics.')
        total_weight = float(sum(weights))
        if metrics and total_weight == 0:
            raise ValueError('weights must not sum to zero.')

        # Work in float so normalised values are not coerced into int columns.
        raw = self.compare_metrics(metrics).astype(float)
        normalized_df = raw.copy()

        for metric in metrics:
            row = raw.loc[metric]
            min_val = row.min()
            max_val = row.max()
            if max_val > min_val:
                scaled = (row - min_val) / (max_val - min_val)
                if self._is_lower_better(metric, lower_is_better):
                    scaled = 1.0 - scaled
            else:
                # No spread: the metric cannot distinguish the models.
                scaled = pd.Series(0.0, index=row.index)
            normalized_df.loc[metric] = scaled

        # Calculate weighted scores
        weighted_scores = {}
        for model in raw.columns:
            score = sum(normalized_df.loc[metric, model] * weight
                        for metric, weight in zip(metrics, weights))
            weighted_scores[model] = score / total_weight if total_weight else score

        # Create ranking dataframe (stable sort keeps insertion order for ties)
        ranking_df = pd.DataFrame(list(weighted_scores.items()),
                                  columns=['Model', 'Weighted Score'])
        ranking_df = ranking_df.sort_values('Weighted Score', ascending=False,
                                            kind='stable')
        ranking_df['Rank'] = range(1, len(ranking_df) + 1)

        return ranking_df[['Rank', 'Model', 'Weighted Score']]

# Example model comparison
comparator = ModelComparator()

# Measured results for the model evaluated above.
comparator.add_model_results('GPT-2', {
    'perplexity': perplexity,
    'avg_tokens_per_second': speed_metrics['avg_tokens_per_second'],
    'avg_time_per_sample': speed_metrics['avg_time_per_sample'],
})

# Simulated results for other models, derived by scaling the measured values:
# (label, perplexity factor, tokens/sec factor, time/sample factor)
for sim_name, ppl_factor, tps_factor, time_factor in (
    ('GPT-2-Medium', 0.8, 0.7, 1.3),
    ('DistilGPT-2', 1.2, 1.5, 0.6),
):
    comparator.add_model_results(sim_name, {
        'perplexity': perplexity * ppl_factor,
        'avg_tokens_per_second': speed_metrics['avg_tokens_per_second'] * tps_factor,
        'avg_time_per_sample': speed_metrics['avg_time_per_sample'] * time_factor,
    })

# Side-by-side plots; the compared table is returned for printing.
metrics_to_compare = ['perplexity', 'avg_tokens_per_second', 'avg_time_per_sample']
comparison_df = comparator.plot_comparison(metrics_to_compare, 'GPT Model Comparison')

print('\nComparison DataFrame:')
print(comparison_df)

# Weighted ranking: perplexity counts 40%, each speed metric 30%.
ranking = comparator.rank_models(metrics_to_compare, weights=[0.4, 0.3, 0.3])
print('\nModel Ranking:')
print(ranking)

## 6. Advanced Evaluation Metrics

Implement advanced metrics for comprehensive evaluation.

In [None]:
class AdvancedMetrics:
    """Lightweight, dependency-free text metrics for LLM outputs.

    All methods are static, tokenize by lower-casing and splitting on
    whitespace, and return plain Python floats.
    """

    @staticmethod
    def calculate_bleu_score(reference: str, candidate: str, n: int = 4) -> float:
        """Calculate a simplified sentence-level BLEU score.

        Uses clipped n-gram precision for orders 1..min(n, candidate length),
        their geometric mean, and the standard brevity penalty. No smoothing is
        applied, so any order without a match yields 0.0.

        Args:
            reference: Reference text.
            candidate: Text being scored.
            n: Highest n-gram order.

        Returns:
            Score in [0, 1].
        """
        from collections import Counter

        ref_words = reference.lower().split()
        cand_words = candidate.lower().split()

        if not cand_words or not ref_words:
            return 0.0

        log_precisions = []
        for order in range(1, min(n, len(cand_words)) + 1):
            ref_counts = Counter(
                tuple(ref_words[j:j + order])
                for j in range(len(ref_words) - order + 1)
            )
            cand_counts = Counter(
                tuple(cand_words[j:j + order])
                for j in range(len(cand_words) - order + 1)
            )

            # Clip each candidate n-gram by how often the reference contains
            # it, so repeating a matching word cannot inflate precision.
            matches = sum(min(count, ref_counts[ngram])
                          for ngram, count in cand_counts.items())
            if matches == 0:
                # A zero precision drives the geometric mean to zero.
                return 0.0
            log_precisions.append(np.log(matches / sum(cand_counts.values())))

        if not log_precisions:
            return 0.0

        # Geometric mean of precisions
        bleu = np.exp(np.mean(log_precisions))

        # Brevity penalty
        bp = min(1.0, np.exp(1 - len(ref_words) / len(cand_words)))

        return float(bp * bleu)

    @staticmethod
    def calculate_rouge_l(reference: str, candidate: str) -> float:
        """Calculate ROUGE-L as the F1 of the longest common subsequence.

        Returns:
            Score in [0, 1]; 0.0 when either text has no words.
        """
        ref_words = reference.lower().split()
        cand_words = candidate.lower().split()

        if not ref_words or not cand_words:
            return 0.0

        # LCS length by dynamic programming, keeping only the previous row.
        previous = [0] * (len(cand_words) + 1)
        for ref_word in ref_words:
            current = [0]
            for j, cand_word in enumerate(cand_words, start=1):
                if ref_word == cand_word:
                    current.append(previous[j - 1] + 1)
                else:
                    current.append(max(previous[j], current[j - 1]))
            previous = current
        lcs_len = previous[-1]

        if lcs_len == 0:
            return 0.0

        precision = lcs_len / len(cand_words)
        recall = lcs_len / len(ref_words)
        return 2 * precision * recall / (precision + recall)

    @staticmethod
    def calculate_semantic_similarity(text1: str, text2: str) -> float:
        """Jaccard overlap of the two texts' word sets (a lexical proxy only).

        Returns:
            |intersection| / |union| in [0, 1]; 0.0 when either text is empty.
        """
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())

        if not words1 or not words2:
            return 0.0

        return len(words1 & words2) / len(words1 | words2)

    @staticmethod
    def calculate_readability_score(text: str) -> float:
        """Simple readability score based on sentence and word length.

        A Flesch-style formula in which average word length (relative to 4.7
        characters) stands in for syllables per word. Sentences end at '.',
        '!' or '?'.

        Returns:
            Score clamped to [0, 1]; higher means easier to read. 0.0 for text
            without any sentence content.
        """
        import re

        sentences = [part.strip() for part in re.split(r'[.!?]+', text)]
        sentences = [s for s in sentences if s]
        words = text.split()

        if not sentences or not words:
            return 0.0

        avg_sentence_length = len(words) / len(sentences)
        avg_word_length = sum(len(word) for word in words) / len(words)

        # Flesch-style formula (higher is more readable)
        readability = 206.835 - 1.015 * avg_sentence_length - 84.6 * (avg_word_length / 4.7)

        # Normalize to 0-1 range
        return float(max(0.0, min(1.0, readability / 100)))

# Smoke-test the advanced metrics on one reference/candidate pair.
print('Testing advanced evaluation metrics...')

# A reference sentence and a paraphrase of it.
reference_text = "Machine learning is a powerful tool for data analysis and prediction."
candidate_text = "Machine learning provides powerful tools for analyzing data and making predictions."

metrics = AdvancedMetrics()

# Pairwise metrics compare candidate against reference; readability looks at
# the candidate alone.
bleu = metrics.calculate_bleu_score(reference_text, candidate_text)
rouge = metrics.calculate_rouge_l(reference_text, candidate_text)
similarity = metrics.calculate_semantic_similarity(reference_text, candidate_text)
readability = metrics.calculate_readability_score(candidate_text)

print(
    f'BLEU Score: {bleu:.3f}',
    f'ROUGE-L Score: {rouge:.3f}',
    f'Semantic Similarity: {similarity:.3f}',
    f'Readability Score: {readability:.3f}',
    sep='\n',
)

## 7. Evaluation Report Generation

Generate comprehensive evaluation reports.

In [None]:
class EvaluationReporter:
    """Generate HTML or Markdown evaluation reports from a results dict.

    Attributes:
        model_name: Name shown in the report and used in the output filename.
        results: Metric name -> value. The key 'generations' (list of strings)
            is rendered as sample text; 'retrieved_docs' and 'context_used'
            are never listed as metrics.
        timestamp: Creation time of the reporter, formatted for display.
    """

    # Result keys that are not scalar metrics.
    _NON_METRIC_KEYS = ('generations', 'retrieved_docs', 'context_used')

    def __init__(self, model_name: str, results: Dict[str, Any]):
        self.model_name = model_name
        self.results = results
        self.timestamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    def _metric_rows(self):
        """Yield (display name, formatted value) for every scalar metric."""
        for metric, value in self.results.items():
            if metric in self._NON_METRIC_KEYS:
                continue
            label = metric.replace('_', ' ').title()
            text = f'{value:.3f}' if isinstance(value, float) else f'{value}'
            yield label, text

    def generate_html_report(self) -> str:
        """Generate the HTML evaluation report.

        The model name, metric names/values and generations are HTML-escaped,
        because generations are model output and may contain markup.

        Returns:
            A complete HTML document as a string; at most five generations are
            included.
        """
        from html import escape

        parts = [f"""
        <!DOCTYPE html>
        <html>
        <head>
            <title>Model Evaluation Report - {escape(self.model_name)}</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 40px; }}
                .header {{ background-color: #f0f0f0; padding: 20px; border-radius: 5px; }}
                .metric {{ margin: 10px 0; padding: 10px; background-color: #f9f9f9; border-left: 4px solid #007acc; }}
                .generation {{ margin: 10px 0; padding: 15px; background-color: #fff; border: 1px solid #ddd; border-radius: 5px; }}
                .prompt {{ font-weight: bold; color: #333; }}
                .response {{ margin-top: 10px; color: #666; }}
                table {{ border-collapse: collapse; width: 100%; }}
                th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
                th {{ background-color: #f2f2f2; }}
            </style>
        </head>
        <body>
            <div class="header">
                <h1>Model Evaluation Report</h1>
                <p><strong>Model:</strong> {escape(self.model_name)}</p>
                <p><strong>Evaluation Date:</strong> {self.timestamp}</p>
            </div>
            
            <h2>Performance Metrics</h2>
        """]

        # Add metrics
        for label, text in self._metric_rows():
            parts.append(
                f'<div class="metric"><strong>{escape(label)}:</strong> {escape(text)}</div>\n'
            )

        # Add generations if available
        if 'generations' in self.results:
            parts.append("<h2>Sample Generations</h2>\n")
            for i, generation in enumerate(self.results['generations'][:5], 1):
                parts.append(f"""
                <div class="generation">
                    <div class="prompt">Generation {i}:</div>
                    <div class="response">{escape(str(generation))}</div>
                </div>
                """)

        parts.append("""
        </body>
        </html>
        """)

        return ''.join(parts)

    def generate_markdown_report(self) -> str:
        """Generate the Markdown evaluation report.

        Returns:
            Markdown text with a metrics table and at most three generations.
        """
        lines = [f"""# Model Evaluation Report

**Model:** {self.model_name}
**Evaluation Date:** {self.timestamp}

## Performance Metrics

"""]

        # Add metrics table
        lines.append("| Metric | Value |\n|--------|-------|\n")
        for label, text in self._metric_rows():
            lines.append(f"| {label} | {text} |\n")

        # Add generations
        if 'generations' in self.results:
            lines.append("\n## Sample Generations\n\n")
            for i, generation in enumerate(self.results['generations'][:3], 1):
                lines.append(f"### Generation {i}\n\n{generation}\n\n")

        return ''.join(lines)

    def save_report(self, format_type: str = 'markdown', output_dir: str = '../logs') -> str:
        """Save the evaluation report to a timestamped file.

        Args:
            format_type: 'html' for HTML; any other value writes Markdown.
            output_dir: Target directory, created if missing.

        Returns:
            Path of the written file as a string.
        """
        import re

        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        timestamp_str = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')

        # Model names such as 'org/model' contain path separators; replace
        # anything outside a conservative filename alphabet so the report
        # always lands directly inside output_dir.
        safe_name = re.sub(r'[^A-Za-z0-9._-]+', '_', self.model_name)

        if format_type == 'html':
            filename = f"evaluation_report_{safe_name}_{timestamp_str}.html"
            content = self.generate_html_report()
        else:
            filename = f"evaluation_report_{safe_name}_{timestamp_str}.md"
            content = self.generate_markdown_report()

        file_path = output_path / filename

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

        return str(file_path)

# Build a demonstration report and write it in both supported formats.
print('Generating evaluation reports...')

# Hand-written example results (not measured in this notebook).
sample_results = dict(
    perplexity=15.2,
    bleu_score=0.65,
    rouge_l=0.72,
    semantic_similarity=0.78,
    response_time_ms=245,
    total_tokens=1250,
    generations=[
        "Machine learning enables computers to learn patterns from data without explicit programming.",
        "Deep learning uses neural networks with multiple layers to process complex information.",
        "Natural language processing helps computers understand and generate human language.",
    ],
)

reporter = EvaluationReporter("gpt2-medium", sample_results)

# Markdown first, then HTML; each path is printed right after its file is written.
md_path = reporter.save_report('markdown')
print(f'Markdown report saved to: {md_path}')

html_path = reporter.save_report('html')
print(f'HTML report saved to: {html_path}')

# Closing summary of what the notebook covered.
print(
    '\nEvaluation and visualization tutorial completed!',
    'You now have tools for:',
    '- Model evaluation with multiple metrics',
    '- Interactive visualization dashboards',
    '- Model comparison frameworks',
    '- Advanced evaluation metrics',
    '- Comprehensive report generation',
    sep='\n',
)