# AI Papers RAG - Evaluation Analysis

This notebook analyzes the performance of the complete RAG system, evaluates retrieval quality, and provides insights for system optimization.

## Setup and Imports

In [None]:
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime, timedelta
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Add the src directory to the Python path so project modules can be imported.
# NOTE(review): Path() is ".", and the parent of "." is still ".", so this
# appends "src" relative to the current working directory rather than the
# src/ folder of the parent directory -- confirm which location is intended.
sys.path.append(str(Path().parent / "src"))

# Confirm the environment is ready and record when the analysis was run.
print("Setup complete!")
print(f"Analysis timestamp: {datetime.now()}")

## Load Previous Experiment Results

In [None]:
# Load results from previous notebooks
# NOTE(review): Path().parent evaluates to ".", so this resolves to
# ./data/processed relative to the working directory -- confirm that this is
# where the earlier notebooks wrote their output.
processed_dir = Path().parent / "data" / "processed"

# Result files produced by the data-exploration and embedding notebooks
data_exploration_file = processed_dir / "data_exploration_results.json"
embedding_experiments_file = processed_dir / "embedding_experiments_results.json"

# Maps a result name to the parsed JSON content of the corresponding file;
# files that are missing are simply left out.
results = {}

for result_key, result_file, label in (
    ("data_exploration", data_exploration_file, "data exploration"),
    ("embedding_experiments", embedding_experiments_file, "embedding experiments"),
):
    try:
        # JSON is read as UTF-8 explicitly instead of relying on the
        # platform's default text encoding.
        with open(result_file, "r", encoding="utf-8") as f:
            results[result_key] = json.load(f)
    except FileNotFoundError:
        # A missing file is reported and skipped, not treated as fatal.
        print(f"⚠️ {label.capitalize()} results not found")
    else:
        print(f"✅ Loaded {label} results")

print(f"\nLoaded {len(results)} result files")

## Generate Simulated RAG Evaluation Data

In [None]:
# Create comprehensive evaluation dataset
# Seed NumPy's global random state so the np.random draws that follow repeat
# from run to run.
np.random.seed(42)

# Field names in the order they appear in every query record; this order is
# also the column order of the DataFrame built below.
_QUERY_FIELDS = (
    "query_id",
    "query",
    "category",
    "difficulty",
    "expected_papers",
    "ground_truth_answer",
)

# One row per evaluation query, positionally aligned with _QUERY_FIELDS.
_QUERY_ROWS = [
    (
        "q001",
        "What is the transformer architecture?",
        "architecture",
        "basic",
        ["attention_is_all_you_need", "transformer_tutorial"],
        "The transformer is a neural network architecture based solely on attention mechanisms.",
    ),
    (
        "q002",
        "How does BERT differ from GPT in training methodology?",
        "comparison",
        "intermediate",
        ["bert_paper", "gpt_paper"],
        "BERT uses bidirectional training while GPT uses autoregressive training.",
    ),
    (
        "q003",
        "What are the computational complexity implications of self-attention?",
        "technical",
        "advanced",
        ["attention_analysis", "efficiency_study"],
        "Self-attention has quadratic complexity with respect to sequence length.",
    ),
    (
        "q004",
        "What are the main applications of large language models?",
        "applications",
        "basic",
        ["llm_applications", "gpt3_paper"],
        "LLMs are used for text generation, translation, summarization, and question answering.",
    ),
    (
        "q005",
        "How do positional encodings work in transformers?",
        "technical",
        "intermediate",
        ["attention_is_all_you_need", "positional_encoding_study"],
        "Positional encodings provide sequence order information using sinusoidal functions.",
    ),
    (
        "q006",
        "What is masked language modeling?",
        "concepts",
        "basic",
        ["bert_paper", "masked_lm_study"],
        "Masked language modeling predicts masked tokens using bidirectional context.",
    ),
    (
        "q007",
        "How does multi-head attention improve model performance?",
        "mechanisms",
        "intermediate",
        ["attention_is_all_you_need", "multihead_analysis"],
        "Multi-head attention allows the model to attend to different representation subspaces.",
    ),
    (
        "q008",
        "What are the scaling laws for transformer models?",
        "scaling",
        "advanced",
        ["scaling_laws_paper", "gpt3_paper"],
        "Model performance scales predictably with size, data, and compute resources.",
    ),
]

evaluation_queries = [dict(zip(_QUERY_FIELDS, row)) for row in _QUERY_ROWS]

print(f"Created evaluation dataset with {len(evaluation_queries)} queries")
print("\nQuery distribution:")
df_queries = pd.DataFrame(evaluation_queries)
# Count of queries per (category, difficulty); absent combinations show as 0.
query_distribution = (
    df_queries.groupby(['category', 'difficulty']).size().unstack(fill_value=0)
)
print(query_distribution)

## Simulate RAG System Responses

In [None]:
# Simulate RAG system responses with realistic performance characteristics
def simulate_rag_response(query_data, model_quality=0.8):
    """Simulate a RAG system response with realistic metrics.

    All randomness comes from NumPy's global random state (``np.random``), so
    seeding it beforehand makes the output reproducible.

    Args:
        query_data: Mapping with the keys ``'query_id'``, ``'difficulty'``
            (``'basic'``, ``'intermediate'`` or ``'advanced'``) and
            ``'expected_papers'`` (a list of paper ids, possibly empty).
        model_quality: Multiplier applied when computing ``'answer_quality'``.
            It does not influence the retrieval metrics, which are drawn
            independently of it.

    Returns:
        A dict with the keys ``query_id``, ``response_time``,
        ``num_retrieved``, ``relevant_retrieved``, ``precision``, ``recall``,
        ``f1_score``, ``answer_quality``, ``answer_length``,
        ``citation_quality``, ``retrieved_docs`` (sorted by descending score)
        and ``top_score``.

    Raises:
        KeyError: If ``query_data['difficulty']`` is not one of the three
            known difficulty levels.
    """
    # Response time varies by query complexity
    base_time = 1.5  # seconds
    difficulty_multiplier = {
        'basic': 1.0,
        'intermediate': 1.3,
        'advanced': 1.8
    }

    response_time = base_time * difficulty_multiplier[query_data['difficulty']] + np.random.normal(0, 0.3)
    response_time = max(0.5, response_time)  # Minimum 0.5 seconds

    # Number of retrieved documents (3 to 7 inclusive)
    num_retrieved = np.random.randint(3, 8)

    expected_papers = query_data['expected_papers']

    # Simulate retrieved documents with scores
    retrieved_docs = []
    for i in range(num_retrieved):
        # Only the first len(expected_papers) slots can hold a relevant paper,
        # and each of those is a hit with probability 0.8. The short-circuit
        # means no random number is drawn for the remaining slots.
        is_relevant = i < len(expected_papers) and np.random.random() > 0.2

        if is_relevant:
            score = np.random.beta(8, 2)  # Skewed towards high scores
            paper_id = expected_papers[i]  # i is in range whenever is_relevant holds
        else:
            score = np.random.beta(2, 5)  # Skewed towards low scores
            paper_id = f"unrelated_paper_{i}"

        retrieved_docs.append({
            'paper_id': paper_id,
            'score': score,
            'relevant': is_relevant
        })

    # Sort by score, best first
    retrieved_docs.sort(key=lambda x: x['score'], reverse=True)

    # Answer quality depends on retrieved document relevance and model
    # capability. When nothing relevant was retrieved, fall back to 0.3
    # explicitly rather than taking the mean of an empty list (which yields
    # NaN together with a RuntimeWarning).
    relevant_scores = [doc['score'] for doc in retrieved_docs if doc['relevant']]
    relevant_docs_score = np.mean(relevant_scores) if relevant_scores else 0.3

    answer_quality = model_quality * relevant_docs_score * np.random.uniform(0.8, 1.2)
    answer_quality = np.clip(answer_quality, 0, 1)

    # Calculate retrieval metrics
    relevant_retrieved = len(relevant_scores)
    total_relevant = len(expected_papers)

    precision = relevant_retrieved / num_retrieved if num_retrieved > 0 else 0
    recall = relevant_retrieved / total_relevant if total_relevant > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    # Simulate generated answer
    answer_length = np.random.randint(50, 300)  # words

    # Citation quality grows by 0.3 per relevant document, capped at 1.0
    citation_quality = min(1.0, relevant_retrieved * 0.3)

    return {
        'query_id': query_data['query_id'],
        'response_time': response_time,
        'num_retrieved': num_retrieved,
        'relevant_retrieved': relevant_retrieved,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'answer_quality': answer_quality,
        'answer_length': answer_length,
        'citation_quality': citation_quality,
        'retrieved_docs': retrieved_docs,
        # The list is sorted by descending score, so the first entry is the maximum.
        'top_score': retrieved_docs[0]['score'] if retrieved_docs else 0
    }

# Simulate responses for different model configurations
model_configurations = {
    'OpenAI + GPT-3.5': {'retrieval_quality': 0.85, 'generation_quality': 0.9},
    'SentenceBERT + GPT-3.5': {'retrieval_quality': 0.75, 'generation_quality': 0.9},
    'SentenceBERT + Local LLM': {'retrieval_quality': 0.75, 'generation_quality': 0.7}
}

# Maps configuration name -> list of simulated responses, one per query
all_responses = {}

for config_name, config_params in model_configurations.items():
    # Combine retrieval and generation quality into a single score (their
    # plain average); it is the same for every query of a configuration.
    overall_quality = (config_params['retrieval_quality'] + config_params['generation_quality']) / 2

    # Queries are simulated in dataset order, one response each.
    responses = [
        simulate_rag_response(query, overall_quality)
        for query in evaluation_queries
    ]

    all_responses[config_name] = responses
    print(f"✅ Generated {len(responses)} responses for {config_name}")

total_responses = sum(map(len, all_responses.values()))
print(f"\nTotal simulated responses: {total_responses}")

## Performance Analysis

In [None]:
# Analyze performance metrics across configurations
def analyze_configuration_performance(responses):
    """Summarize a configuration's responses into aggregate metrics.

    Args:
        responses: List of response dicts containing the keys
            ``response_time``, ``precision``, ``recall``, ``f1_score``,
            ``answer_quality``, ``citation_quality`` and ``num_retrieved``.

    Returns:
        Dict with the mean of each of those columns, the standard deviation
        of the response time, and ``success_rate`` -- the fraction of
        responses whose F1 score is strictly above 0.5.
    """
    frame = pd.DataFrame(responses)

    # Output key -> source column whose mean is reported under that key
    mean_columns = {
        'avg_response_time': 'response_time',
        'avg_precision': 'precision',
        'avg_recall': 'recall',
        'avg_f1': 'f1_score',
        'avg_answer_quality': 'answer_quality',
        'avg_citation_quality': 'citation_quality',
        'avg_num_retrieved': 'num_retrieved',
    }
    summary = {
        output_key: frame[column].mean()
        for output_key, column in mean_columns.items()
    }

    summary['response_time_std'] = frame['response_time'].std()
    # Mean of a boolean column = share of queries with a decent F1
    summary['success_rate'] = (frame['f1_score'] > 0.5).mean()
    return summary

# Analyze each configuration: configuration name -> aggregate metrics dict
performance_summary = {
    config_name: analyze_configuration_performance(responses)
    for config_name, responses in all_responses.items()
}

# Create performance comparison DataFrame; transposing puts one configuration
# per row and one metric per column.
df_performance = pd.DataFrame(performance_summary).transpose()
print("Configuration Performance Comparison:")
rounded_performance = df_performance.round(3)
print(rounded_performance)

In [None]:
# Visualize performance comparison on a 2x3 grid of panels
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

configs = list(performance_summary.keys())
colors = ['blue', 'green', 'orange']
bar_positions = range(len(configs))
# Tick labels show only the part of the configuration name before the '+'
short_labels = [c.split('+')[0].strip() for c in configs]


def _metric_per_config(metric_key):
    """Return the given summary metric for every configuration, in order."""
    return [performance_summary[c][metric_key] for c in configs]


def _label_config_axis(ax, ticks):
    """Put the shortened configuration names on the x axis of ``ax``."""
    ax.set_xticks(ticks)
    ax.set_xticklabels(short_labels, rotation=45)


# Single-metric bar panels: (panel, summary key, title, y-axis label)
single_metric_panels = (
    (axes[0, 0], 'avg_response_time', 'Average Response Time', 'Seconds'),
    (axes[0, 2], 'avg_answer_quality', 'Average Answer Quality', 'Quality Score'),
    (axes[1, 0], 'success_rate', 'Success Rate (F1 > 0.5)', 'Success Rate'),
    (axes[1, 2], 'avg_citation_quality', 'Average Citation Quality', 'Citation Score'),
)
for panel, metric_key, title, y_label in single_metric_panels:
    panel.bar(bar_positions, _metric_per_config(metric_key), color=colors, alpha=0.7)
    panel.set_title(title)
    panel.set_ylabel(y_label)
    _label_config_axis(panel, bar_positions)

# Precision, Recall, F1 as grouped bars: one group per configuration
x = np.arange(len(configs))
width = 0.25
retrieval_panel = axes[0, 1]
grouped_series = (
    (-width, 'avg_precision', 'Precision'),
    (0, 'avg_recall', 'Recall'),
    (width, 'avg_f1', 'F1-Score'),
)
for offset, metric_key, series_label in grouped_series:
    retrieval_panel.bar(
        x + offset, _metric_per_config(metric_key), width,
        label=series_label, alpha=0.7,
    )
retrieval_panel.set_title('Retrieval Performance Metrics')
retrieval_panel.set_ylabel('Score')
_label_config_axis(retrieval_panel, x)
retrieval_panel.legend()

# Response Time Distribution: overlaid histograms, one per configuration
distribution_panel = axes[1, 1]
for i, config in enumerate(configs):
    times = [r['response_time'] for r in all_responses[config]]
    distribution_panel.hist(
        times, alpha=0.5, label=config.split('+')[0].strip(),
        color=colors[i], bins=10,
    )
distribution_panel.set_title('Response Time Distribution')
distribution_panel.set_xlabel('Response Time (seconds)')
distribution_panel.set_ylabel('Frequency')
distribution_panel.legend()

plt.tight_layout()
plt.show()

## Query Difficulty Analysis

In [None]:
# Analyze performance by query difficulty and category
def analyze_by_difficulty(responses, queries):
    """Break response metrics down by query difficulty and by category.

    Args:
        responses: List of response dicts; each needs ``query_id`` plus the
            metric keys ``f1_score``, ``answer_quality``, ``response_time``,
            ``precision`` and ``recall``.
        queries: List of query dicts providing ``query_id``, ``difficulty``
            and ``category``.

    Returns:
        A tuple ``(difficulty_analysis, category_analysis, df_merged)``: the
        two aggregated tables (rounded to 3 decimals) and the merged
        per-response DataFrame they were computed from.
    """
    # Attach difficulty/category to every response via the shared query_id
    query_metadata = pd.DataFrame(queries)[['query_id', 'difficulty', 'category']]
    df_merged = pd.DataFrame(responses).merge(query_metadata, on='query_id')

    # The same aggregation is applied to both groupings
    aggregations = {
        'f1_score': ['mean', 'std'],
        'answer_quality': ['mean', 'std'],
        'response_time': ['mean', 'std'],
        'precision': 'mean',
        'recall': 'mean'
    }

    def summarize(group_column):
        return df_merged.groupby(group_column).agg(aggregations).round(3)

    return summarize('difficulty'), summarize('category'), df_merged

# Analyze best performing configuration in detail: "best" is the
# configuration with the highest average F1 score.
ranked_by_f1 = df_performance.sort_values('avg_f1', ascending=False)
best_config = ranked_by_f1.index[0]
print(f"Detailed analysis for best configuration: {best_config}")

best_config_responses = all_responses[best_config]
difficulty_analysis, category_analysis, df_detailed = analyze_by_difficulty(
    best_config_responses,
    evaluation_queries,
)

# Print each breakdown table under its heading
for heading, table in (
    ("\nPerformance by Difficulty Level:", difficulty_analysis),
    ("\nPerformance by Query Category:", category_analysis),
):
    print(heading)
    print(table)

In [None]:
# Create interactive heatmap for performance breakdown
# Layout: two heatmaps on the top row, a box plot and a scatter plot below.
panel_titles = [
    'F1 Score by Difficulty and Category',
    'Answer Quality by Difficulty and Category', 
    'Response Time by Difficulty',
    'Performance Distribution'
]
panel_specs = [
    [{'type': 'heatmap'}, {'type': 'heatmap'}],
    [{'type': 'box'}, {'type': 'scatter'}],
]
fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles, specs=panel_specs)


def _difficulty_category_pivot(value_column):
    """Mean of ``value_column`` per (difficulty, category); gaps become 0."""
    pivot = df_detailed.pivot_table(
        values=value_column,
        index='difficulty',
        columns='category',
        aggfunc='mean',
    )
    return pivot.fillna(0)


def _annotated_heatmap(pivot, colorbar_x):
    """Heatmap trace of ``pivot`` with the rounded cell values drawn as text."""
    return go.Heatmap(
        z=pivot.values,
        x=pivot.columns,
        y=pivot.index,
        colorscale='RdYlBu_r',
        showscale=True,
        colorbar=dict(x=colorbar_x),
        text=pivot.values.round(3),
        texttemplate="%{text}",
        textfont={"size": 12},
    )


# Create pivot tables for heatmaps
f1_pivot = _difficulty_category_pivot('f1_score')
quality_pivot = _difficulty_category_pivot('answer_quality')

# F1 Score heatmap (top left) and Answer Quality heatmap (top right); each
# colorbar is positioned to the right of its own panel.
fig.add_trace(_annotated_heatmap(f1_pivot, 0.48), row=1, col=1)
fig.add_trace(_annotated_heatmap(quality_pivot, 1.02), row=1, col=2)

# Response time by difficulty (box plot): one box per difficulty level, in
# order of first appearance in the data.
for difficulty in df_detailed['difficulty'].unique():
    difficulty_data = df_detailed[df_detailed['difficulty'] == difficulty]
    response_time_box = go.Box(
        y=difficulty_data['response_time'],
        name=difficulty.title(),
        showlegend=False,
    )
    fig.add_trace(response_time_box, row=2, col=1)

# Performance scatter plot: precision vs recall, marker size scaled by answer
# quality and marker color given by F1 score.
scatter_markers = dict(
    size=df_detailed['answer_quality'] * 20,
    color=df_detailed['f1_score'],
    colorscale='RdYlBu_r',
    showscale=True,
    colorbar=dict(x=1.02, y=0.2),
)
precision_recall_scatter = go.Scatter(
    x=df_detailed['precision'],
    y=df_detailed['recall'],
    mode='markers',
    marker=scatter_markers,
    text=df_detailed['query_id'],
    hovertemplate='<b>%{text}</b><br>Precision: %{x:.3f}<br>Recall: %{y:.3f}<extra></extra>',
    showlegend=False,
)
fig.add_trace(precision_recall_scatter, row=2, col=2)

fig.update_layout(
    title_text=f"Performance Analysis: {best_config}",
    title_x=0.5,
    height=800,
)

# Update axes labels
fig.update_yaxes(title_text="Response Time (s)", row=2, col=1)
fig.update_xaxes(title_text="Precision", row=2, col=2)
fig.update_yaxes(title_text="Recall", row=2, col=2)

fig.show()

## Error Analysis and Failure Cases

In [None]:
# Identify and analyze failure cases
def identify_failure_cases(responses, queries, threshold=0.3):
    """Return the responses that performed poorly, joined with their queries.

    A response counts as a failure when at least one of its F1 score, answer
    quality or precision is strictly below ``threshold``.

    Args:
        responses: List of response dicts (need ``query_id``, ``f1_score``,
            ``answer_quality`` and ``precision``).
        queries: List of query dicts, joined to the responses on ``query_id``.
        threshold: Cut-off below which a metric marks the response as failed.

    Returns:
        DataFrame holding the failing rows of the merged response/query data.
    """
    merged = pd.DataFrame(responses).merge(pd.DataFrame(queries), on='query_id')

    # True for a row as soon as any one of the three metrics falls short
    checked_metrics = ['f1_score', 'answer_quality', 'precision']
    is_failure = merged[checked_metrics].lt(threshold).any(axis=1)

    return merged[is_failure]

# Analyze failure patterns
# Only configurations with at least one failing query get an entry here.
failure_analysis = {}

# (message label, metric column) pairs checked against the 0.3 cut-off when
# listing the common issues among a configuration's failures
issue_checks = (
    ("Low precision", 'precision'),
    ("Low recall", 'recall'),
    ("Low answer quality", 'answer_quality'),
)

for config_name, responses in all_responses.items():
    failures = identify_failure_cases(responses, evaluation_queries)
    n_failures = len(failures)
    n_responses = len(responses)
    has_failures = n_failures > 0

    if has_failures:
        difficulty_counts = failures['difficulty'].value_counts().to_dict()
        category_counts = failures['category'].value_counts().to_dict()

        # Identify common failure patterns: one message per metric that at
        # least one failing query is low on
        common_issues = []
        for issue_label, metric_column in issue_checks:
            n_low = int((failures[metric_column] < 0.3).sum())
            if n_low > 0:
                common_issues.append(f"{issue_label} ({n_low} queries)")

        failure_analysis[config_name] = {
            'total_failures': n_failures,
            'failure_rate': n_failures / n_responses,
            'by_difficulty': difficulty_counts,
            'by_category': category_counts,
            'avg_f1_failures': failures['f1_score'].mean(),
            'avg_quality_failures': failures['answer_quality'].mean(),
            'common_issues': common_issues
        }

    print(f"\n{config_name} - Failure Analysis:")
    print(f"  Total failures: {n_failures} / {n_responses} ({n_failures/n_responses*100:.1f}%)")

    if has_failures:
        print(f"  Failure by difficulty: {difficulty_counts}")
        print(f"  Failure by category: {category_counts}")
        print(f"  Failed queries: {failures['query_id'].tolist()}")

In [None]:
# Visualize failure analysis
# The figure is drawn only if at least one configuration recorded failures.
if failure_analysis:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Failure rates by configuration (failure_analysis only holds
    # configurations that had at least one failing query)
    configs = list(failure_analysis.keys())
    failure_rates = [failure_analysis[c]['failure_rate'] * 100 for c in configs]

    axes[0, 0].bar(range(len(configs)), failure_rates, alpha=0.7, color='red')
    axes[0, 0].set_title('Failure Rate by Configuration')
    axes[0, 0].set_ylabel('Failure Rate (%)')
    axes[0, 0].set_xticks(range(len(configs)))
    axes[0, 0].set_xticklabels([c.split('+')[0].strip() for c in configs], rotation=45)

    # The remaining panels describe the best configuration, i.e. the one
    # ranked first by average F1 (`best_config`), rather than whichever
    # configuration happens to be listed first.
    best_config_name = best_config

    # Failure distribution by difficulty (best config). The best
    # configuration may have no failures at all, in which case it has no
    # entry in failure_analysis and there is nothing to put in a pie chart.
    best_config_failures = failure_analysis.get(best_config_name)
    if best_config_failures and best_config_failures['by_difficulty']:
        difficulties = list(best_config_failures['by_difficulty'].keys())
        counts = list(best_config_failures['by_difficulty'].values())
        axes[0, 1].pie(counts, labels=difficulties, autopct='%1.1f%%')
    else:
        axes[0, 1].text(0.5, 0.5, 'No failures for best configuration',
                        ha='center', va='center')
        axes[0, 1].set_axis_off()
    axes[0, 1].set_title('Failure Distribution by Difficulty')

    # Performance comparison: success vs failure cases (best config)
    all_responses_df = pd.DataFrame(all_responses[best_config_name])
    failures_df = identify_failure_cases(all_responses[best_config_name], evaluation_queries)
    successes_df = all_responses_df[~all_responses_df['query_id'].isin(failures_df['query_id'])]

    metrics = ['precision', 'recall', 'f1_score', 'answer_quality']
    success_means = [successes_df[m].mean() for m in metrics]
    # With no failures the failure bars are drawn at zero height
    failure_means = [failures_df[m].mean() for m in metrics] if len(failures_df) > 0 else [0] * len(metrics)

    x = np.arange(len(metrics))
    width = 0.35

    axes[1, 0].bar(x - width/2, success_means, width, label='Success Cases', alpha=0.7, color='green')
    axes[1, 0].bar(x + width/2, failure_means, width, label='Failure Cases', alpha=0.7, color='red')
    axes[1, 0].set_title('Success vs Failure Cases')
    axes[1, 0].set_ylabel('Average Score')
    axes[1, 0].set_xticks(x)
    axes[1, 0].set_xticklabels(metrics, rotation=45)
    axes[1, 0].legend()

    # Response time distribution for success vs failure (best config)
    if len(successes_df) > 0:
        axes[1, 1].hist(successes_df['response_time'], alpha=0.5, label='Success', color='green', bins=8)
    if len(failures_df) > 0:
        axes[1, 1].hist(failures_df['response_time'], alpha=0.5, label='Failure', color='red', bins=8)
    axes[1, 1].set_title('Response Time: Success vs Failure')
    axes[1, 1].set_xlabel('Response Time (seconds)')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].legend()

    plt.tight_layout()
    plt.show()
else:
    print("No significant failures detected across configurations")

## Optimization Recommendations

In [None]:
# Generate optimization recommendations based on analysis
def generate_recommendations(performance_summary, failure_analysis):
    """Generate optimization recommendations based on evaluation results.

    Args:
        performance_summary: Dict mapping configuration name to a metrics
            dict with at least ``avg_f1``, ``avg_response_time``,
            ``avg_answer_quality``, ``avg_recall`` and ``avg_precision``.
        failure_analysis: Dict mapping configuration name to a dict with
            ``failure_rate`` and, optionally, ``common_issues`` (a list of
            issue descriptions). May be empty.

    Returns:
        Dict with the keys ``model_selection`` (best configuration overall,
        by speed and by answer quality) and four lists of recommendation
        strings: ``performance_improvements``, ``cost_optimizations``,
        ``quality_enhancements`` and ``technical_recommendations``.
    """
    # One row per configuration, one column per metric
    df_perf = pd.DataFrame(performance_summary).T
    best_config = df_perf.sort_values('avg_f1', ascending=False).index[0]

    recommendations = {
        'model_selection': {
            'best_overall': best_config,
            'best_speed': df_perf.sort_values('avg_response_time', ascending=True).index[0],
            'best_quality': df_perf.sort_values('avg_answer_quality', ascending=False).index[0]
        },
        'performance_improvements': [],
        'cost_optimizations': [],
        'quality_enhancements': [],
        'technical_recommendations': []
    }

    # Performance improvements: triggered by the metric averaged over all
    # configurations, not by any single configuration
    if df_perf['avg_response_time'].mean() > 3.0:
        recommendations['performance_improvements'].append(
            "Consider caching embeddings and implementing batch processing to reduce response times"
        )

    if df_perf['avg_recall'].mean() < 0.6:
        recommendations['performance_improvements'].append(
            "Increase retrieval K value or improve chunking strategy to enhance recall"
        )

    if df_perf['avg_precision'].mean() < 0.7:
        recommendations['performance_improvements'].append(
            "Fine-tune similarity thresholds or implement re-ranking to improve precision"
        )

    # Cost optimizations
    if 'OpenAI' in best_config:
        recommendations['cost_optimizations'].append(
            "Consider hybrid approach: use local embeddings for development and OpenAI for production"
        )

    recommendations['cost_optimizations'].extend([
        "Implement embedding caching to reduce API calls",
        "Use batch processing for multiple queries",
        "Monitor token usage and implement usage-based optimization"
    ])

    # Quality enhancements: only issues reported by configurations whose
    # failure rate exceeds 20% are taken into account
    if failure_analysis:
        common_failure_patterns = [
            issue
            for analysis in failure_analysis.values()
            if analysis['failure_rate'] > 0.2
            for issue in analysis.get('common_issues', [])
        ]

        # Check each issue description individually instead of searching the
        # string form of the whole list.
        if any('Low precision' in issue for issue in common_failure_patterns):
            recommendations['quality_enhancements'].append(
                "Implement semantic re-ranking or filtering to improve precision"
            )

        if any('Low recall' in issue for issue in common_failure_patterns):
            recommendations['quality_enhancements'].append(
                "Expand query processing with synonyms and related terms"
            )

    recommendations['quality_enhancements'].extend([
        "Implement query expansion and reformulation",
        "Add metadata filtering for domain-specific queries",
        "Use ensemble retrieval combining multiple strategies"
    ])

    # Technical recommendations (always included)
    recommendations['technical_recommendations'].extend([
        "Set up A/B testing framework for model comparisons",
        "Implement real-time monitoring for performance metrics",
        "Create evaluation pipeline with human feedback integration",
        "Establish baseline metrics and regression testing",
        "Deploy gradual rollout strategy for model updates"
    ])

    return recommendations

# Generate recommendations
recommendations = generate_recommendations(performance_summary, failure_analysis)

print("🎯 RAG System Optimization Recommendations")
print("=" * 60)

# Model selection: one line per selection criterion
model_selection = recommendations['model_selection']
print("\n🏆 MODEL SELECTION:")
for criterion_label, selection_key in (
    ("Best Overall", 'best_overall'),
    ("Best Speed", 'best_speed'),
    ("Best Quality", 'best_quality'),
):
    print(f"  {criterion_label}: {model_selection[selection_key]}")

# Each remaining section is a heading followed by a numbered list
recommendation_sections = (
    ("\n⚡ PERFORMANCE IMPROVEMENTS:", 'performance_improvements'),
    ("\n💰 COST OPTIMIZATIONS:", 'cost_optimizations'),
    ("\n🔧 QUALITY ENHANCEMENTS:", 'quality_enhancements'),
    ("\n🛠️ TECHNICAL RECOMMENDATIONS:", 'technical_recommendations'),
)
for section_heading, section_key in recommendation_sections:
    print(section_heading)
    for i, rec in enumerate(recommendations[section_key], 1):
        print(f"  {i}. {rec}")

## Implementation Roadmap

In [None]:
# Create implementation roadmap: phase title -> ordered list of tasks
roadmap = {
    "Phase 1: Foundation (Week 1-2)": [
        "Implement best-performing configuration from evaluation",
        "Set up embedding caching system",
        "Create basic monitoring and logging",
        "Deploy initial version with selected model",
    ],
    "Phase 2: Optimization (Week 3-4)": [
        "Implement query processing improvements",
        "Add retrieval re-ranking mechanism",
        "Optimize chunking strategy based on evaluation",
        "Set up A/B testing framework",
    ],
    "Phase 3: Enhancement (Week 5-6)": [
        "Add metadata filtering and faceted search",
        "Implement ensemble retrieval methods",
        "Create user feedback collection system",
        "Develop evaluation automation",
    ],
    "Phase 4: Scale & Monitor (Week 7-8)": [
        "Implement production monitoring dashboard",
        "Set up automated performance regression testing",
        "Create cost monitoring and alerting",
        "Establish continuous evaluation pipeline",
    ],
}

print("🗺️ RAG System Implementation Roadmap")
print("=" * 50)

# Each phase is printed as a heading followed by its numbered tasks
for phase, tasks in roadmap.items():
    numbered_tasks = [f"  {n}. {task}" for n, task in enumerate(tasks, start=1)]
    print("\n".join([f"\n{phase}:", *numbered_tasks]))

# Create success metrics: target group -> {metric name: target value}
success_metrics = {
    "Performance Targets": {
        "Average F1 Score": "> 0.7",
        "Average Response Time": "< 2.0 seconds",
        "Success Rate": "> 85%",
        "User Satisfaction": "> 4.0/5.0",
    },
    "Quality Targets": {
        "Answer Relevance": "> 0.8",
        "Citation Accuracy": "> 0.9",
        "Factual Correctness": "> 0.85",
        "Coherence Score": "> 0.8",
    },
    "Operational Targets": {
        "System Uptime": "> 99.5%",
        "Error Rate": "< 1%",
        "Cost per Query": "< $0.05",
        "Monthly Active Users": "> 1000",
    },
}

print("\n\n📊 Success Metrics & Targets")
print("=" * 35)

# Each target group is printed as a heading followed by bulleted targets
for target_group, targets in success_metrics.items():
    bullet_lines = [f"  • {metric}: {target}" for metric, target in targets.items()]
    print("\n".join([f"\n{target_group}:", *bullet_lines]))

In [None]:
# Save comprehensive evaluation results
# A single timestamp is taken so the JSON file and the text report agree.
generated_at = datetime.now()

final_evaluation_results = {
    'timestamp': generated_at.isoformat(),
    'evaluation_summary': {
        'total_queries': len(evaluation_queries),
        'configurations_tested': len(model_configurations),
        'best_configuration': recommendations['model_selection']['best_overall']
    },
    'performance_metrics': performance_summary,
    'failure_analysis': failure_analysis,
    'recommendations': recommendations,
    'implementation_roadmap': roadmap,
    'success_metrics': success_metrics,
    'detailed_results': {
        'evaluation_queries': evaluation_queries,
        'all_responses': all_responses
    }
}

# Save results
# The output directory may not exist yet (the loading step only warns when
# earlier result files are missing), so create it before writing.
processed_dir.mkdir(parents=True, exist_ok=True)

output_file = processed_dir / 'evaluation_analysis_results.json'
with open(output_file, 'w', encoding='utf-8') as f:
    # default=str stringifies any value the json module cannot encode natively
    json.dump(final_evaluation_results, f, indent=2, default=str)

print(f"✅ Comprehensive evaluation results saved to {output_file}")

# Create summary report
# The key findings are the best value of each metric across all
# configurations (they need not come from the same configuration), and the
# labels say so.
summary_report = f"""
RAG System Evaluation Summary Report
Generated: {generated_at.strftime('%Y-%m-%d %H:%M:%S')}
{'='*50}

EVALUATION OVERVIEW:
• Queries Tested: {len(evaluation_queries)}
• Configurations: {len(model_configurations)}
• Best Overall: {recommendations['model_selection']['best_overall']}

KEY FINDINGS:
• Best Average F1 Score: {df_performance['avg_f1'].max():.3f}
• Fastest Average Response Time: {df_performance['avg_response_time'].min():.2f}s
• Highest Success Rate: {df_performance['success_rate'].max():.1%}

TOP RECOMMENDATIONS:
1. Deploy {recommendations['model_selection']['best_overall']} as primary configuration
2. Implement embedding caching for cost optimization
3. Set up A/B testing framework for continuous improvement
4. Create monitoring dashboard for production deployment

NEXT STEPS:
1. Review implementation roadmap
2. Begin Phase 1 deployment
3. Set up monitoring and evaluation pipeline
4. Collect user feedback for continuous improvement
"""

print(summary_report)

# Save summary report
# The report contains non-ASCII bullet characters, so the encoding is fixed
# to UTF-8 rather than left to the platform default.
with open(processed_dir / 'evaluation_summary_report.txt', 'w', encoding='utf-8') as f:
    f.write(summary_report)

print("\n🎉 Evaluation analysis completed successfully!")
print("📋 Summary report saved for stakeholder review.")
print("🚀 Ready for production deployment planning.")
print("🚀 Ready for production deployment planning.")