# FIT-FLIX RAG System Evaluation

This notebook evaluates the performance of the FIT-FLIX RAG system.

## Contents
1. System Setup and Initialization
2. Test Questions and Ground Truth
3. Retrieval Evaluation
4. Generation Quality Assessment
5. End-to-End Performance
6. Performance Metrics and Analysis

In [None]:
# Import required libraries
import sys
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Any
import time

# Make the project importable from the notebook's directory.
# The imports below use the `src.` package prefix, so the directory that
# CONTAINS `src` (the parent of the working directory) must be on sys.path.
# '../src' is still added, as before — presumably for modules imported by
# their bare name; TODO confirm whether anything relies on it.
# The membership check keeps the cell idempotent when it is re-run.
for _import_root in (str(Path('..').resolve()), '../src'):
    if _import_root not in sys.path:
        sys.path.append(_import_root)

from src.config import Config
from src.retrieval.retriever import DocumentRetriever
from src.generation.llm_manager import LLMManager
from src.utils.document_loader import DocumentLoader
from src.embeddings.embedding_manager import EmbeddingManager
from src.utils.text_splitter import TextSplitter

# Set up plotting
# Use matplotlib's 'default' style sheet and seaborn's "husl" colour palette
# for every figure produced in this notebook.
plt.style.use('default')
sns.set_palette("husl")
# IPython magic: selects the inline matplotlib backend for this notebook.
%matplotlib inline

In [None]:
# Initialize the RAG system
# Every component is built from the same Config instance.
config = Config()
document_loader = DocumentLoader(config)
text_splitter = TextSplitter(config)
retriever = DocumentRetriever(config)
llm_manager = LLMManager(config)

print("🚀 Initializing FIT-FLIX RAG System for evaluation...")

# Load and process documents if needed
# "Needed" means either the existing store could not be read, or it was read
# successfully but reports zero documents.
try:
    retriever.initialize()
    collection_info = retriever.get_retrieval_stats()
    document_count = collection_info.get('document_count', 0)
except Exception as e:
    # Report the reason instead of silently discarding it, then fall back to
    # rebuilding the store from the source documents.
    print(f"⚠️  Could not read existing vector store: {e}")
    document_count = 0

if document_count:
    print(f"✅ Vector store loaded with {document_count} documents")
else:
    print(f"📚 Loading documents into vector store...")
    documents = document_loader.load_all_documents()
    chunked_docs = text_splitter.split_documents(documents)

    # Add documents to retriever
    # initialize() is called again here, as in the original fallback path,
    # so a failed first attempt is retried before documents are added.
    retriever.initialize()
    # Each chunk is read as a dict with 'content' and 'metadata' keys.
    doc_contents = [doc['content'] for doc in chunked_docs]
    doc_metadata = [doc['metadata'] for doc in chunked_docs]
    retriever.add_documents(doc_contents, doc_metadata)
    print(f"✅ Added {len(chunked_docs)} document chunks to vector store")

In [None]:
# Define test questions with expected categories
# Each spec is (question text, expected document category, difficulty label);
# the specs are expanded into the dict records consumed by the evaluation cells.
_question_specs = [
    ("What types of fitness classes do you offer?", "classes", "easy"),
    ("How much does a monthly membership cost?", "membership", "easy"),
    ("What are the gym's operating hours?", "about", "easy"),
    ("Can you recommend a good post-workout nutrition plan?", "nutrition", "medium"),
    ("What qualifications do your personal trainers have?", "trainers", "medium"),
    ("How can I cancel my membership?", "membership", "medium"),
    ("Do you have specialized equipment for powerlifting?", "facilities", "hard"),
    ("What should I eat before a high-intensity workout?", "nutrition", "hard"),
]

test_questions = [
    {"question": text, "expected_category": category, "difficulty": level}
    for text, category, level in _question_specs
]

# Echo the question set so the notebook output documents what was evaluated.
print(f"📝 Created {len(test_questions)} test questions")
for number, entry in enumerate(test_questions, start=1):
    label = entry['difficulty'].upper()
    print(f"{number}. [{label}] {entry['question']} (Expected: {entry['expected_category']})")

In [None]:
# Evaluate retrieval performance
def evaluate_retrieval(questions: List[Dict], retriever: "DocumentRetriever", top_k: int = 5):
    """Run every test question through the retriever and collect per-question metrics.

    Args:
        questions: Dicts carrying 'question', 'expected_category' and 'difficulty'.
        retriever: Object exposing ``retrieve(question, n_results=...)``. Each returned
            document is read as a dict with a 'similarity' value and an optional
            'metadata' dict holding a 'category'.
        top_k: Number of documents requested per question.

    Returns:
        A list with one dict per question containing: 'question', 'expected_category',
        'difficulty', 'retrieved_count', 'category_match' (True when any retrieved
        document carries the expected category), 'avg_similarity', 'top_similarity'
        (similarity of the first returned document), 'retrieval_time' in seconds and
        'retrieved_categories' (categories of the first three documents).
    """
    results = []

    for q_data in questions:
        question = q_data['question']
        expected_category = q_data['expected_category']
        difficulty = q_data['difficulty']

        # perf_counter is monotonic and high-resolution, so sub-millisecond
        # retrievals are not reported as 0 and clock adjustments cannot skew them.
        start_time = time.perf_counter()
        retrieved_docs = retriever.retrieve(question, n_results=top_k) or []
        retrieval_time = time.perf_counter() - start_time

        # Documents with missing or None metadata count as category 'unknown'
        # instead of aborting the whole evaluation.
        retrieved_categories = [
            (doc.get('metadata') or {}).get('category', 'unknown')
            for doc in retrieved_docs
        ]
        category_match = expected_category in retrieved_categories

        # Plain floats throughout so the empty and non-empty cases share one type.
        similarities = [float(doc['similarity']) for doc in retrieved_docs]
        avg_similarity = float(np.mean(similarities)) if similarities else 0.0
        top_similarity = similarities[0] if similarities else 0.0

        results.append({
            'question': question,
            'expected_category': expected_category,
            'difficulty': difficulty,
            'retrieved_count': len(retrieved_docs),
            'category_match': category_match,
            'avg_similarity': avg_similarity,
            'top_similarity': top_similarity,
            'retrieval_time': retrieval_time,
            'retrieved_categories': retrieved_categories[:3]  # Top 3 categories
        })

    return results

# Run the retrieval evaluation over the full question set and tabulate it.
print("🔍 Evaluating retrieval performance...")
retrieval_results = evaluate_retrieval(test_questions, retriever)
retrieval_df = pd.DataFrame(retrieval_results)

print("\n📊 Retrieval Results Summary:")
if retrieval_df.empty:
    # An empty frame has no metric columns; report that instead of raising KeyError.
    print("No retrieval results to summarize")
else:
    print(f"Category Match Rate: {retrieval_df['category_match'].mean():.2%}")
    print(f"Average Similarity Score: {retrieval_df['avg_similarity'].mean():.3f}")
    print(f"Average Retrieval Time: {retrieval_df['retrieval_time'].mean():.3f}s")

    print("\nDetailed Results:")
    for row in retrieval_df.itertuples(index=False):
        match_status = "✅" if row.category_match else "❌"
        # Add the ellipsis only when the question was actually cut at 50 characters.
        shown_question = row.question if len(row.question) <= 50 else f"{row.question[:50]}..."
        print(f"{match_status} {shown_question} | Sim: {row.top_similarity:.3f} | Time: {row.retrieval_time:.3f}s")

In [None]:
# Evaluate generation performance
def evaluate_generation(questions: List[Dict], retriever: "DocumentRetriever", llm_manager: "LLMManager"):
    """Generate an answer for each question and record basic response metrics.

    Args:
        questions: Dicts carrying 'question' and 'difficulty'.
        retriever: Object exposing ``retrieve(question)`` that returns the context documents.
        llm_manager: Object exposing ``generate_response(question, docs)`` that returns
            the answer text.

    Returns:
        A list with one dict per question containing: 'question', 'difficulty',
        'response', 'response_length' (characters), 'word_count', 'generation_time'
        (seconds, generation call only), 'has_context', 'context_docs' and 'success'.
        When retrieval or generation raises, the row has success=False, an
        "Error: ..." response and zeroed length/word/time metrics, while the context
        fields still reflect whatever was retrieved before the failure.
    """
    results = []

    for q_data in questions:
        question = q_data['question']
        difficulty = q_data['difficulty']
        # Bound before the try so a generation failure can still report how
        # much context had been retrieved.
        retrieved_docs = []

        try:
            # Retrieve context
            retrieved_docs = retriever.retrieve(question) or []

            # Generate response; only the LLM call is timed. perf_counter is
            # monotonic, so the elapsed value cannot be skewed by clock changes.
            start_time = time.perf_counter()
            response = llm_manager.generate_response(question, retrieved_docs)
            generation_time = time.perf_counter() - start_time

            # Basic quality metrics
            results.append({
                'question': question,
                'difficulty': difficulty,
                'response': response,
                'response_length': len(response),
                'word_count': len(response.split()),
                'generation_time': generation_time,
                'has_context': len(retrieved_docs) > 0,
                'context_docs': len(retrieved_docs),
                'success': True
            })

        except Exception as e:
            # Best-effort: one failing question must not abort the evaluation.
            results.append({
                'question': question,
                'difficulty': difficulty,
                'response': f"Error: {str(e)}",
                'response_length': 0,
                'word_count': 0,
                'generation_time': 0,
                'has_context': len(retrieved_docs) > 0,
                'context_docs': len(retrieved_docs),
                'success': False
            })

    return results

# Note: This will only work if you have valid API keys configured
print("🤖 Evaluating generation performance...")
print("Note: This requires valid API keys in your .env file")

# Generation is slow and billed per call, so only a small sample is evaluated.
GENERATION_SAMPLE_SIZE = 3

try:
    generation_results = evaluate_generation(test_questions[:GENERATION_SAMPLE_SIZE], retriever, llm_manager)
    generation_df = pd.DataFrame(generation_results)

    print("\n📊 Generation Results Summary:")
    print(f"Success Rate: {generation_df['success'].mean():.2%}")

    # Failed rows carry zero-filled length/time placeholders; averaging over
    # them would understate response length and latency, so use successes only.
    successful_generations = generation_df[generation_df['success'].astype(bool)]
    if successful_generations.empty:
        print("No successful generations; length and timing averages are not available")
    else:
        print(f"Average Response Length: {successful_generations['response_length'].mean():.0f} characters")
        print(f"Average Word Count: {successful_generations['word_count'].mean():.0f} words")
        print(f"Average Generation Time: {successful_generations['generation_time'].mean():.3f}s")

    print("\nSample Responses:")
    for row in generation_df.itertuples(index=False):
        print(f"\nQ: {row.question}")
        if row.success:
            # Show at most the first 200 characters of each answer.
            if len(row.response) > 200:
                print(f"A: {row.response[:200]}...")
            else:
                print(f"A: {row.response}")
        else:
            print(f"❌ Failed: {row.response}")

except Exception as e:
    print(f"⚠️  Generation evaluation skipped: {str(e)}")
    print("Make sure you have valid API keys configured in your .env file")
    generation_df = pd.DataFrame()  # Empty dataframe for plotting

In [None]:
# Visualization of results
# 2x2 grid: retrieval accuracy by difficulty, similarity distribution,
# retrieval time vs similarity, and generation time vs response length.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Colour is tied to the difficulty label, not to bar position: groupby sorts
# its index alphabetically (easy, hard, medium), so a positional colour list
# would paint 'hard' orange and 'medium' red.
difficulty_colors = {'easy': 'green', 'medium': 'orange', 'hard': 'red'}

if not retrieval_df.empty:
    # Retrieval performance by difficulty
    difficulty_performance = retrieval_df.groupby('difficulty')['category_match'].mean()
    # Known difficulties first in easy -> medium -> hard order, then any others.
    ordered_levels = [level for level in difficulty_colors if level in difficulty_performance.index]
    ordered_levels += [level for level in difficulty_performance.index if level not in difficulty_colors]
    difficulty_performance = difficulty_performance.reindex(ordered_levels)
    bar_colors = [difficulty_colors.get(level, 'gray') for level in difficulty_performance.index]
    axes[0, 0].bar(difficulty_performance.index, difficulty_performance.values, color=bar_colors)
    axes[0, 0].set_title('Retrieval Accuracy by Difficulty')
    axes[0, 0].set_ylabel('Category Match Rate')
    axes[0, 0].set_ylim(0, 1)

    # Similarity score distribution
    axes[0, 1].hist(retrieval_df['avg_similarity'], bins=10, alpha=0.7, color='skyblue')
    axes[0, 1].set_title('Distribution of Similarity Scores')
    axes[0, 1].set_xlabel('Average Similarity Score')
    axes[0, 1].set_ylabel('Frequency')

    # Performance timing; green = expected category retrieved, red = missed
    axes[1, 0].scatter(retrieval_df['retrieval_time'], retrieval_df['avg_similarity'],
                      c=['green' if match else 'red' for match in retrieval_df['category_match']], alpha=0.7)
    axes[1, 0].set_title('Retrieval Time vs Similarity')
    axes[1, 0].set_xlabel('Retrieval Time (seconds)')
    axes[1, 0].set_ylabel('Average Similarity')

# Generation performance (if available); green = success, red = failure
if not generation_df.empty:
    axes[1, 1].scatter(generation_df['generation_time'], generation_df['word_count'],
                      c=['green' if success else 'red' for success in generation_df['success']], alpha=0.7)
    axes[1, 1].set_title('Generation Time vs Response Length')
    axes[1, 1].set_xlabel('Generation Time (seconds)')
    axes[1, 1].set_ylabel('Word Count')
else:
    # Placeholder panel so the grid stays complete when generation was skipped.
    axes[1, 1].text(0.5, 0.5, 'Generation evaluation\nnot available\n(API keys required)',
                   ha='center', va='center', transform=axes[1, 1].transAxes, fontsize=12)
    axes[1, 1].set_title('Generation Performance')

plt.tight_layout()
plt.show()

In [None]:
# Performance summary and recommendations
# Thresholds that trigger a recommendation: match rate and similarity are
# lower bounds, retrieval latency is an upper bound in seconds.
MIN_CATEGORY_MATCH_RATE = 0.8
MIN_AVG_SIMILARITY = 0.7
MAX_AVG_RETRIEVAL_SECONDS = 1.0

print("=== FIT-FLIX RAG SYSTEM EVALUATION SUMMARY ===")
print(f"\n📊 RETRIEVAL PERFORMANCE:")
if retrieval_df.empty:
    # An empty frame has no metric columns; report that instead of raising KeyError.
    print("• No retrieval results available")
    match_rate = mean_similarity = mean_retrieval_time = None
else:
    match_rate = retrieval_df['category_match'].mean()
    mean_similarity = retrieval_df['avg_similarity'].mean()
    mean_retrieval_time = retrieval_df['retrieval_time'].mean()
    match_by_difficulty = retrieval_df.groupby('difficulty')['category_match'].mean()
    # List every difficulty tied for best/worst; idxmax/idxmin would silently
    # pick only the alphabetically first one.
    best_levels = ', '.join(match_by_difficulty.index[match_by_difficulty == match_by_difficulty.max()])
    worst_levels = ', '.join(match_by_difficulty.index[match_by_difficulty == match_by_difficulty.min()])

    print(f"• Category Match Rate: {match_rate:.1%}")
    print(f"• Average Similarity Score: {mean_similarity:.3f}")
    print(f"• Average Retrieval Time: {mean_retrieval_time:.3f} seconds")
    print(f"• Best performing difficulty: {best_levels}")
    print(f"• Worst performing difficulty: {worst_levels}")

if not generation_df.empty:
    print(f"\n🤖 GENERATION PERFORMANCE:")
    print(f"• Success Rate: {generation_df['success'].mean():.1%}")
    # Failed rows hold zero placeholders, so averages use successful rows only.
    successful_generations = generation_df[generation_df['success'].astype(bool)]
    if successful_generations.empty:
        print("• No successful generations to average")
    else:
        print(f"• Average Response Length: {successful_generations['response_length'].mean():.0f} characters")
        print(f"• Average Generation Time: {successful_generations['generation_time'].mean():.3f} seconds")
else:
    print(f"\n🤖 GENERATION PERFORMANCE: Not evaluated (requires API keys)")

print(f"\n💡 RECOMMENDATIONS:")
if match_rate is not None and match_rate < MIN_CATEGORY_MATCH_RATE:
    print("• Consider improving document chunking strategy")
    print("• Review embedding model performance")
    print("• Add more diverse training examples")

if mean_similarity is not None and mean_similarity < MIN_AVG_SIMILARITY:
    print("• Consider fine-tuning embedding model on domain data")
    print("• Implement hybrid search (semantic + keyword)")

if mean_retrieval_time is not None and mean_retrieval_time > MAX_AVG_RETRIEVAL_SECONDS:
    print("• Optimize vector database indexing")
    print("• Consider reducing embedding dimensions")

# General recommendations, printed regardless of the measured metrics
print("• Implement user feedback collection for continuous improvement")
print("• Add more comprehensive evaluation metrics")
print("• Consider A/B testing different retrieval strategies")