In [1]:
# Import all necessary packages for production simulation
import asyncio
import os
import json
import re
import logging
import hashlib
from pathlib import Path
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime, timedelta
import textwrap
from collections import defaultdict

import pandas as pd
import numpy as np
import faiss
from dotenv import load_dotenv

# ML and NLP libraries
from sklearn.metrics.pairwise import cosine_similarity
from langchain.text_splitter import RecursiveCharacterTextSplitter

# LangChain components (production versions)
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.runnables import RunnableLambda
from langchain_core.retrievers import BaseRetriever
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from pydantic import BaseModel, Field, SecretStr, ConfigDict

# Our components
from src.data_loading.faiss_loader import load_faiss_index

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment and setup
load_dotenv()
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

print("✅ Production-Grade RAG Evaluation Setup Complete!")




✅ Production-Grade RAG Evaluation Setup Complete!


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Production CustomRetriever simulation

class ProductionCustomRetriever(BaseRetriever, BaseModel):
    """Production-accurate CustomRetriever with caching and async operations."""
    
    alpha: float = 0.7  # Production default
    embeddings: GoogleGenerativeAIEmbeddings = Field(...)
    faiss_index: faiss.Index = Field(...)
    id_mapping: dict = Field(...)
    k: int = 20  # Production default
    
    model_config = ConfigDict(arbitrary_types_allowed=True)
    
    async def _get_cached_embedding(self, query: str) -> np.ndarray:
        """Get embedding with caching - production simulation."""
        return await cache_embedding_query(query, self.embeddings)
    
    async def _aget_relevant_documents_async(self, query: str) -> List[Document]:
        """Production-accurate async document retrieval."""
        logger.debug(f"Starting document retrieval for query: {query[:50]}...")
        
        try:
            # Get cached embedding
            query_embedding = await self._get_cached_embedding(query)
            query_vector = np.array(query_embedding).reshape(1, -1)
            
            # Search FAISS index
            logger.debug("Searching FAISS index...")
            distances, indices = self.faiss_index.search(query_vector, k=self.k)
            
            # Map FAISS indices to chunk_ids
            retrieved_chunk_ids = []
            for idx in indices[0]:
                if idx in self.id_mapping:
                    retrieved_chunk_ids.append(self.id_mapping[idx])
            
            logger.debug(f"Found {len(retrieved_chunk_ids)} chunks from FAISS")
            
            # Fetch documents from "database" (async)
            docs_from_db = await notebook_db.fetch_chunks_by_ids(list(set(retrieved_chunk_ids)))
            
            if not docs_from_db:
                logger.warning("No documents found in DB for retrieved IDs")
                return []
            
            # Process documents with production scoring
            documents = []
            for distance, idx in zip(distances[0], indices[0]):
                if idx in self.id_mapping:
                    chunk_id = self.id_mapping[idx]
                    doc_data = docs_from_db.get(chunk_id)
                    
                    if doc_data:
                        metadata = doc_data.copy()
                        text = metadata.pop("text", "")
                        
                        # Production scoring algorithm
                        relevance_score = 1.0 / (1.0 + distance)
                        
                        # Simulate similarity score (in production this comes from pgvector)
                        # For notebook, we'll compute it directly
                        text_embedding = await self._get_cached_embedding(text[:500])  # Truncate for efficiency
                        text_vector = np.array(text_embedding).reshape(1, -1)
                        similarity_score = cosine_similarity(query_vector, text_vector)[0][0]
                        
                        # Production final score calculation
                        final_score = (self.alpha * similarity_score + 
                                     (1 - self.alpha) * relevance_score)
                        
                        metadata.update({
                            "relevancia": round(relevance_score, 3),
                            "similarity_score": round(similarity_score, 3),
                            "final_score": round(final_score, 4),
                        })
                        
                        documents.append(Document(page_content=text, metadata=metadata))
            
            # Sort by final score (production behavior)
            documents.sort(key=lambda d: d.metadata.get("final_score", 0), reverse=True)
            logger.debug(f"Retrieval completed, returning {len(documents)} documents")
            return documents
            
        except Exception as e:
            logger.error(f"Error during document retrieval: {e}", exc_info=True)
            raise
    
    def _get_relevant_documents(self, query: str, *, run_manager: Optional[Any] = None) -> list:
        """Required by BaseRetriever - do not use directly."""
        raise NotImplementedError("Use async _aget_relevant_documents method only!")
    
    async def _aget_relevant_documents(self, query: str, *, run_manager: Optional[Any] = None) -> list:
        """Async interface for LCEL pipeline compatibility."""
        return await self._aget_relevant_documents_async(query)

print("🔍 Production CustomRetriever ready!")
import asyncio
import os
import json
import re
from pathlib import Path
from typing import List, Dict, Any, Tuple
import textwrap
from dataclasses import dataclass
from collections import defaultdict

import pandas as pd
import numpy as np
from dotenv import load_dotenv

# ML and NLP libraries
from sklearn.metrics.pairwise import cosine_similarity

# Current architecture components
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from pydantic import SecretStr

# Our components
from src.data_loading.faiss_loader import load_faiss_index

# Load environment and setup
load_dotenv()
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

print("✅ RAG Quality Evaluation Setup Complete!")


🔍 Production CustomRetriever ready!
✅ RAG Quality Evaluation Setup Complete!


In [3]:
# Define evaluation data structures
@dataclass
class GroundTruthExample:
    """Single test case with ground truth data."""
    question: str
    expected_answer_key_points: List[str]
    relevant_doc_ids: List[str]  # Documents that should be retrieved
    legal_domain: str  # e.g., "büntetőjog", "polgári jog"
    difficulty: str  # "easy", "medium", "hard"
    
@dataclass
class RetrievalResult:
    """Results from document retrieval."""
    query: str
    retrieved_docs: List[Document]
    distances: List[float]
    
@dataclass
class QAResult:
    """Complete QA pipeline result."""
    question: str
    answer: str
    retrieval_result: RetrievalResult
    ground_truth: GroundTruthExample

# Test cases for Hungarian legal domain (questions remain in Hungarian)
GROUND_TRUTH_CASES = [
    GroundTruthExample(
        question="Mi a bűnszervezet fogalma a Btk. szerint?",
        expected_answer_key_points=[
            "három vagy több személy",
            "hosszabb időre szervezett",
            "összehangoltan működő csoport",
            "ötévi vagy ezt meghaladó szabadságvesztés",
            "Btk. 459. §"
        ],
        relevant_doc_ids=["Bf.*", "P.*"],  # Regex patterns for relevant docs
        legal_domain="büntetőjog",
        difficulty="medium"
    ),
    GroundTruthExample(
        question="Milyen feltételei vannak a bűnszervezetben való részvételnek?",
        expected_answer_key_points=[
            "bűnszervezet",
            "részvétel",
            "tag",
            "közreműködés",
            "bűncselekmény"
        ],
        relevant_doc_ids=["Bf.*", "B.*", "Kb.*"],
        legal_domain="büntetőjog", 
        difficulty="hard"
    ),
    GroundTruthExample(
        question="Mi a különbség az alperes és a felperes között?",
        expected_answer_key_points=[
            "felperes: per indítója",
            "alperes: per ellen akivel indítják", 
            "polgári per",
            "peres felek"
        ],
        relevant_doc_ids=["Pf.*", "P.*"],
        legal_domain="polgári jog",
        difficulty="easy"
    ),
    GroundTruthExample(
        question="Mikor alkalmazható a feltételes szabadság?",
        expected_answer_key_points=[
            "feltételes",
            "szabadság",
            "végrehajtás",
            "időtartam",
            "feltétel"
        ],
        relevant_doc_ids=["Bf.*", "B.*", "Fkf.*"],
        legal_domain="büntetőjog",
        difficulty="medium"
    ),
    GroundTruthExample(
        question="Mit jelent a bizonyítási teher a polgári perben?",
        expected_answer_key_points=[
            "bizonyítási",
            "teher",
            "polgári",
            "per",
            "bizonyíték"
        ],
        relevant_doc_ids=["Pf.*", "P.*"],
        legal_domain="polgári jog",
        difficulty="hard"
    )
]

print(f"📋 Loaded {len(GROUND_TRUTH_CASES)} ground truth test cases")
for case in GROUND_TRUTH_CASES:
    print(f"  🔹 {case.legal_domain} - {case.difficulty}: {case.question[:50]}...")


📋 Loaded 5 ground truth test cases
  🔹 büntetőjog - medium: Mi a bűnszervezet fogalma a Btk. szerint?...
  🔹 büntetőjog - hard: Milyen feltételei vannak a bűnszervezetben való ré...
  🔹 polgári jog - easy: Mi a különbség az alperes és a felperes között?...
  🔹 büntetőjog - medium: Mikor alkalmazható a feltételes szabadság?...
  🔹 polgári jog - hard: Mit jelent a bizonyítási teher a polgári perben?...


In [4]:
# Load data and initialize models
print("🔧 Loading sample data and models...")

# Load sample data
sample_parquet_path = "data/processed/sample_data.parquet"
faiss_index_path = "data/processed/sample_faiss.bin" 
id_mapping_path = "data/processed/sample_mapping.pkl"

df = pd.read_parquet(sample_parquet_path)
faiss_index, id_mapping = load_faiss_index(faiss_index_path, id_mapping_path)

print(f"✅ Data loaded: {len(df)} docs, {faiss_index.ntotal} vectors")

# Initialize models
google_api_key = os.getenv('GOOGLE_API_KEY')
if not google_api_key:
    raise ValueError("GOOGLE_API_KEY environment variable is required!")

embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004", 
    api_key=google_api_key
)

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro",
    temperature=0,
    api_key=SecretStr(google_api_key),
)

# Load prompt
prompt_path = Path("src/prompts/legal_assistant_prompt.txt")
template = prompt_path.read_text(encoding="utf-8")
prompt = PromptTemplate.from_template(template)

print("🤖 Models and prompts initialized!")


🔧 Loading sample data and models...
✅ Data loaded: 8293 docs, 8293 vectors
🤖 Models and prompts initialized!


In [5]:
# Evaluation metrics and helper functions
class RAGEvaluator:
    """Comprehensive RAG evaluation class with hybrid search."""
    
    def __init__(self, embeddings, llm, prompt, df, faiss_index, id_mapping):
        self.embeddings = embeddings
        self.llm = llm  
        self.prompt = prompt
        self.df = df
        self.faiss_index = faiss_index
        self.id_mapping = id_mapping
        
    def keyword_search(self, query: str, k: int = 10) -> List[Document]:
        """Keyword-based search in documents."""
        # Extract key terms from query
        key_terms = []
        if 'bűnszervezet' in query.lower():
            key_terms.extend(['bűnszervezet', 'szervezett', 'csoport'])
        if 'alperes' in query.lower() or 'felperes' in query.lower():
            key_terms.extend(['alperes', 'felperes', 'per', 'polgári'])
        if 'feltételes' in query.lower():
            key_terms.extend(['feltételes', 'szabadság', 'végrehajtás'])
        if 'bizonyítási' in query.lower():
            key_terms.extend(['bizonyítási', 'teher', 'bizonyíték'])
        
        # Search for documents containing these terms
        matching_docs = []
        for term in key_terms:
            matches = self.df[self.df['text_chunk'].str.contains(term, case=False, na=False)]
            for _, row in matches.head(k//len(key_terms) + 1).iterrows():
                doc = Document(
                    page_content=row['text_chunk'],
                    metadata={
                        'chunk_id': row['chunk_id'],
                        'doc_id': row['doc_id'],
                        'search_type': 'keyword',
                        'matched_term': term
                    }
                )
                if doc not in matching_docs:
                    matching_docs.append(doc)
        
        return matching_docs[:k]
        
    def retrieve_documents(self, query: str, k: int = 5) -> RetrievalResult:
        """Hybrid retrieval: semantic + keyword search."""
        # 1. Semantic search (FAISS)
        query_embedding = self.embeddings.embed_query(query)
        query_vector = np.array([query_embedding], dtype='float32')
        distances, indices = self.faiss_index.search(query_vector, k*2)  # Get more for filtering
        
        semantic_docs = []
        for i, idx in enumerate(indices[0]):
            if idx in self.id_mapping:
                chunk_id = self.id_mapping[idx]
                row = self.df[self.df['chunk_id'] == chunk_id]
                if not row.empty:
                    text_content = row.iloc[0]['text_chunk']
                    doc_id = row.iloc[0]['doc_id']
                    
                    semantic_docs.append(Document(
                        page_content=text_content,
                        metadata={
                            'chunk_id': chunk_id, 
                            'doc_id': doc_id,
                            'distance': float(distances[0][i]),
                            'search_type': 'semantic'
                        }
                    ))
        
        # 2. Keyword search
        keyword_docs = self.keyword_search(query, k)
        
        # 3. Merge and deduplicate by doc_id
        all_docs = {}
        for doc in semantic_docs + keyword_docs:
            doc_id = doc.metadata['doc_id']
            if doc_id not in all_docs:
                all_docs[doc_id] = doc
        
        # 4. Rank by relevance (prefer keyword matches, then semantic similarity)
        final_docs = []
        for doc_id, doc in all_docs.items():
            if doc.metadata.get('search_type') == 'keyword':
                final_docs.insert(0, doc)  # Keyword matches first
            else:
                final_docs.append(doc)  # Semantic matches after
        
        return RetrievalResult(
            query=query,
            retrieved_docs=final_docs[:k],
            distances=[doc.metadata.get('distance', 0.5) for doc in final_docs[:k]]
        )
    
    def generate_answer(self, retrieval_result: RetrievalResult) -> str:
        """Generate answer from retrieved documents."""
        # Format context
        context_lines = []
        for doc in retrieval_result.retrieved_docs:
            doc_id = doc.metadata.get("doc_id", "N/A")
            content = doc.page_content[:500] + "..." if len(doc.page_content) > 500 else doc.page_content
            context_lines.append(f"### Document ID: {doc_id}\nContent:\n{content}")
        
        context = "\n\n".join(context_lines)
        
        # Generate answer
        formatted_input = self.prompt.format(context=context, question=retrieval_result.query)
        result = self.llm.invoke(formatted_input)
        return result.content if hasattr(result, 'content') else str(result)
    
    def evaluate_retrieval(self, retrieval_result: RetrievalResult, ground_truth: GroundTruthExample) -> Dict[str, float]:
        """Evaluate retrieval quality."""
        retrieved_doc_ids = [doc.metadata['doc_id'] for doc in retrieval_result.retrieved_docs]
        
        # Check which expected docs were found
        relevant_found = 0
        total_relevant = len(ground_truth.relevant_doc_ids)
        
        for pattern in ground_truth.relevant_doc_ids:
            # Use regex matching for doc IDs
            for doc_id in retrieved_doc_ids:
                if re.match(pattern, doc_id):
                    relevant_found += 1
                    break
        
        # Calculate metrics
        precision_at_k = relevant_found / len(retrieved_doc_ids) if retrieved_doc_ids else 0
        recall_at_k = relevant_found / total_relevant if total_relevant > 0 else 0
        f1_at_k = 2 * (precision_at_k * recall_at_k) / (precision_at_k + recall_at_k) if (precision_at_k + recall_at_k) > 0 else 0
        
        # Mean Reciprocal Rank (MRR)
        mrr = 0
        for i, doc_id in enumerate(retrieved_doc_ids):
            for pattern in ground_truth.relevant_doc_ids:
                if re.match(pattern, doc_id):
                    mrr = 1 / (i + 1)
                    break
            if mrr > 0:
                break
                
        return {
            'precision_at_k': precision_at_k,
            'recall_at_k': recall_at_k,
            'f1_at_k': f1_at_k,
            'mrr': mrr,
            'relevant_found': relevant_found,
            'total_relevant': total_relevant,
            'retrieved_count': len(retrieved_doc_ids)
        }

# Initialize evaluator
evaluator = RAGEvaluator(embeddings, llm, prompt, df, faiss_index, id_mapping)
print("📊 RAG Evaluator initialized!")


📊 RAG Evaluator initialized!


In [6]:
# Answer quality evaluation functions
def evaluate_answer_quality(answer: str, ground_truth: GroundTruthExample) -> Dict[str, Any]:
    """Evaluate the quality of generated answer."""
    
    # 1. Key points coverage
    answer_lower = answer.lower()
    key_points_found = []
    for point in ground_truth.expected_answer_key_points:
        if point.lower() in answer_lower:
            key_points_found.append(point)
    
    key_points_coverage = len(key_points_found) / len(ground_truth.expected_answer_key_points)
    
    # 2. Legal terminology accuracy
    legal_terms = [
        'btk', 'bűnszervezet', 'szabadságvesztés', 'felperes', 'alperes', 
        'bíróság', 'ítélet', 'határozat', 'törvény', 'jog', 'per', 'vádlott'
    ]
    
    legal_terms_used = []
    for term in legal_terms:
        if term in answer_lower:
            legal_terms_used.append(term)
    
    # 3. Structure and formatting check
    has_structured_response = any(marker in answer for marker in [
        "1. Szintetizált Válasz", "2. Részletes Elemzés", "3. Konklúzió", "4. Jogi nyilatkozat"
    ])
    
    # 4. Citation check  
    citation_pattern = r'\(Forrás: [^)]+\)'
    citations = re.findall(citation_pattern, answer)
    has_citations = len(citations) > 0
    
    # 5. Length and completeness
    word_count = len(answer.split())
    is_adequate_length = 50 <= word_count <= 500  # Reasonable answer length
    
    # 6. Hungarian language quality (basic check)
    hungarian_indicators = ['szerint', 'alapján', 'amely', 'amelynek', 'illetve', 'továbbá']
    hungarian_score = sum(1 for indicator in hungarian_indicators if indicator in answer_lower) / len(hungarian_indicators)
    
    return {
        'key_points_coverage': key_points_coverage,
        'key_points_found': key_points_found,
        'legal_terms_count': len(legal_terms_used),
        'legal_terms_used': legal_terms_used,
        'has_structured_response': has_structured_response,
        'has_citations': has_citations,
        'citation_count': len(citations),
        'word_count': word_count,
        'is_adequate_length': is_adequate_length,
        'hungarian_quality_score': hungarian_score
    }

def calculate_semantic_similarity(answer: str, expected_points: List[str], embeddings) -> float:
    """Calculate semantic similarity between answer and expected points."""
    try:
        # Get embedding for the answer
        answer_embedding = embeddings.embed_query(answer)
        
        # Get embeddings for expected points
        expected_text = " ".join(expected_points)
        expected_embedding = embeddings.embed_query(expected_text)
        
        # Calculate cosine similarity
        similarity = cosine_similarity(
            np.array(answer_embedding).reshape(1, -1),
            np.array(expected_embedding).reshape(1, -1)
        )[0][0]
        
        return float(similarity)
    except Exception as e:
        print(f"Error calculating semantic similarity: {e}")
        return 0.0

print("📊 Answer quality evaluation functions ready!")


📊 Answer quality evaluation functions ready!


In [7]:
# Run comprehensive evaluation
print("🧪 Running comprehensive RAG evaluation...")
print("=" * 70)

evaluation_results = []

for i, ground_truth in enumerate(GROUND_TRUTH_CASES, 1):
    print(f"\n📝 Test Case {i}/{len(GROUND_TRUTH_CASES)}")
    print(f"❓ Question: {ground_truth.question}")
    print(f"🏷️ Domain: {ground_truth.legal_domain} | Difficulty: {ground_truth.difficulty}")
    
    try:
        # Step 1: Retrieve documents
        print("  🔍 Retrieving documents...")
        retrieval_result = evaluator.retrieve_documents(ground_truth.question, k=5)
        
        # Step 2: Generate answer
        print("  🤖 Generating answer...")
        answer = evaluator.generate_answer(retrieval_result)
        
        # Step 3: Evaluate retrieval
        print("  📊 Evaluating retrieval...")
        retrieval_metrics = evaluator.evaluate_retrieval(retrieval_result, ground_truth)
        
        # Step 4: Evaluate answer quality
        print("  📋 Evaluating answer quality...")
        answer_metrics = evaluate_answer_quality(answer, ground_truth)
        
        # Step 5: Calculate semantic similarity
        print("  🔗 Calculating semantic similarity...")
        semantic_sim = calculate_semantic_similarity(
            answer, ground_truth.expected_answer_key_points, embeddings
        )
        
        # Compile results
        result = {
            'test_case': i,
            'question': ground_truth.question,
            'domain': ground_truth.legal_domain,
            'difficulty': ground_truth.difficulty,
            'answer': answer,
            'retrieval_metrics': retrieval_metrics,
            'answer_metrics': answer_metrics,
            'semantic_similarity': semantic_sim,
            'retrieved_docs': [doc.metadata['doc_id'] for doc in retrieval_result.retrieved_docs]
        }
        
        evaluation_results.append(result)
        
        # Print quick summary
        print(f"  ✅ Results:")
        print(f"     🎯 Precision@5: {retrieval_metrics['precision_at_k']:.3f}")
        print(f"     📝 Key Points Coverage: {answer_metrics['key_points_coverage']:.3f}")
        print(f"     🔗 Semantic Similarity: {semantic_sim:.3f}")
        
    except Exception as e:
        print(f"  ❌ Error: {e}")
        continue
    
    print("-" * 50)

print(f"\n✅ Evaluation completed! {len(evaluation_results)} test cases processed.")


🧪 Running comprehensive RAG evaluation...

📝 Test Case 1/5
❓ Question: Mi a bűnszervezet fogalma a Btk. szerint?
🏷️ Domain: büntetőjog | Difficulty: medium
  🔍 Retrieving documents...
  🤖 Generating answer...
  📊 Evaluating retrieval...
  📋 Evaluating answer quality...
  🔗 Calculating semantic similarity...
  ✅ Results:
     🎯 Precision@5: 0.200
     📝 Key Points Coverage: 0.000
     🔗 Semantic Similarity: 0.591
--------------------------------------------------

📝 Test Case 2/5
❓ Question: Milyen feltételei vannak a bűnszervezetben való részvételnek?
🏷️ Domain: büntetőjog | Difficulty: hard
  🔍 Retrieving documents...
  🤖 Generating answer...
  📊 Evaluating retrieval...
  📋 Evaluating answer quality...
  🔗 Calculating semantic similarity...
  ✅ Results:
     🎯 Precision@5: 0.400
     📝 Key Points Coverage: 0.000
     🔗 Semantic Similarity: 0.623
--------------------------------------------------

📝 Test Case 3/5
❓ Question: Mi a különbség az alperes és a felperes között?
🏷️ Domain: po

In [8]:
# Analysis and reporting
print("📊 DETAILED EVALUATION ANALYSIS")
print("=" * 70)

if not evaluation_results:
    print("❌ No evaluation results to analyze!")
else:
    # 1. Overall Metrics Summary
    print("\n🎯 OVERALL PERFORMANCE METRICS")
    print("-" * 40)
    
    retrieval_precisions = [r['retrieval_metrics']['precision_at_k'] for r in evaluation_results]
    retrieval_recalls = [r['retrieval_metrics']['recall_at_k'] for r in evaluation_results]
    retrieval_f1s = [r['retrieval_metrics']['f1_at_k'] for r in evaluation_results]
    mrrs = [r['retrieval_metrics']['mrr'] for r in evaluation_results]
    
    key_points_coverages = [r['answer_metrics']['key_points_coverage'] for r in evaluation_results]
    semantic_similarities = [r['semantic_similarity'] for r in evaluation_results]
    
    print(f"📈 Retrieval Metrics:")
    print(f"   Precision@5:     {np.mean(retrieval_precisions):.3f} ± {np.std(retrieval_precisions):.3f}")
    print(f"   Recall@5:        {np.mean(retrieval_recalls):.3f} ± {np.std(retrieval_recalls):.3f}")
    print(f"   F1@5:            {np.mean(retrieval_f1s):.3f} ± {np.std(retrieval_f1s):.3f}")
    print(f"   MRR:             {np.mean(mrrs):.3f} ± {np.std(mrrs):.3f}")
    
    print(f"\n📝 Answer Quality Metrics:")
    print(f"   Key Points Coverage: {np.mean(key_points_coverages):.3f} ± {np.std(key_points_coverages):.3f}")
    print(f"   Semantic Similarity: {np.mean(semantic_similarities):.3f} ± {np.std(semantic_similarities):.3f}")
    
    # 2. Performance by Domain and Difficulty
    print(f"\n🏷️ PERFORMANCE BY DOMAIN")
    print("-" * 40)
    
    domains = {}
    for result in evaluation_results:
        domain = result['domain']
        if domain not in domains:
            domains[domain] = []
        domains[domain].append(result)
    
    for domain, results in domains.items():
        precisions = [r['retrieval_metrics']['precision_at_k'] for r in results]
        coverages = [r['answer_metrics']['key_points_coverage'] for r in results]
        
        print(f"📚 {domain.upper()}:")
        print(f"   Precision@5: {np.mean(precisions):.3f}")
        print(f"   Key Coverage: {np.mean(coverages):.3f}")
        print(f"   Test Cases: {len(results)}")
    
    # 3. Performance by Difficulty
    print(f"\n🎯 PERFORMANCE BY DIFFICULTY")
    print("-" * 40)
    
    difficulties = {}
    for result in evaluation_results:
        diff = result['difficulty']
        if diff not in difficulties:
            difficulties[diff] = []
        difficulties[diff].append(result)
    
    for difficulty, results in difficulties.items():
        precisions = [r['retrieval_metrics']['precision_at_k'] for r in results]
        coverages = [r['answer_metrics']['key_points_coverage'] for r in results]
        
        print(f"⚡ {difficulty.upper()}:")
        print(f"   Precision@5: {np.mean(precisions):.3f}")
        print(f"   Key Coverage: {np.mean(coverages):.3f}")
        print(f"   Test Cases: {len(results)}")


📊 DETAILED EVALUATION ANALYSIS

🎯 OVERALL PERFORMANCE METRICS
----------------------------------------
📈 Retrieval Metrics:
   Precision@5:     0.360 ± 0.080
   Recall@5:        0.767 ± 0.200
   F1@5:            0.486 ± 0.105
   MRR:             0.633 ± 0.306

📝 Answer Quality Metrics:
   Key Points Coverage: 0.000 ± 0.000
   Semantic Similarity: 0.617 ± 0.048

🏷️ PERFORMANCE BY DOMAIN
----------------------------------------
📚 BÜNTETŐJOG:
   Precision@5: 0.333
   Key Coverage: 0.000
   Test Cases: 3
📚 POLGÁRI JOG:
   Precision@5: 0.400
   Key Coverage: 0.000
   Test Cases: 2

🎯 PERFORMANCE BY DIFFICULTY
----------------------------------------
⚡ MEDIUM:
   Precision@5: 0.300
   Key Coverage: 0.000
   Test Cases: 2
⚡ HARD:
   Precision@5: 0.400
   Key Coverage: 0.000
   Test Cases: 2
⚡ EASY:
   Precision@5: 0.400
   Key Coverage: 0.000
   Test Cases: 1


In [9]:
# Generate evaluation report and recommendations
print(f"\n📊 EVALUATION SUMMARY & RECOMMENDATIONS")
print("=" * 70)

if evaluation_results:
    # Calculate overall scores
    avg_precision = np.mean([r['retrieval_metrics']['precision_at_k'] for r in evaluation_results])
    avg_coverage = np.mean([r['answer_metrics']['key_points_coverage'] for r in evaluation_results])
    avg_semantic = np.mean([r['semantic_similarity'] for r in evaluation_results])
    
    # Overall grade calculation
    overall_score = (avg_precision * 0.3 + avg_coverage * 0.4 + avg_semantic * 0.3) * 100
    
    print(f"🎯 OVERALL RAG SYSTEM GRADE: {overall_score:.1f}/100")
    
    if overall_score >= 80:
        grade = "EXCELLENT ⭐⭐⭐⭐⭐"
        print(f"   Status: {grade}")
        print("   🎉 Your RAG system performs excellently!")
    elif overall_score >= 70:
        grade = "GOOD ⭐⭐⭐⭐"
        print(f"   Status: {grade}")
        print("   ✅ Your RAG system performs well with room for improvement.")
    elif overall_score >= 60:
        grade = "FAIR ⭐⭐⭐"
        print(f"   Status: {grade}")
        print("   ⚠️ Your RAG system needs improvement.")
    else:
        grade = "POOR ⭐⭐"
        print(f"   Status: {grade}")
        print("   🚨 Your RAG system needs significant improvement.")
    
    print(f"\n📈 COMPONENT SCORES:")
    print(f"   Retrieval Quality:  {avg_precision*100:.1f}/100")
    print(f"   Answer Quality:     {avg_coverage*100:.1f}/100") 
    print(f"   Semantic Accuracy:  {avg_semantic*100:.1f}/100")
    
    # Specific recommendations
    print(f"\n🔧 IMPROVEMENT RECOMMENDATIONS:")
    
    if avg_precision < 0.7:
        print(f"   🔍 RETRIEVAL IMPROVEMENT NEEDED:")
        print(f"      - Consider improving embedding model or fine-tuning")
        print(f"      - Experiment with different chunk sizes")
        print(f"      - Add domain-specific preprocessing")
        print(f"      - Implement hybrid search (dense + sparse)")
    
    if avg_coverage < 0.7:
        print(f"   📝 ANSWER GENERATION IMPROVEMENT NEEDED:")
        print(f"      - Improve prompt engineering")
        print(f"      - Add more context to prompts")
        print(f"      - Consider few-shot examples in prompts")
        print(f"      - Fine-tune the LLM on legal domain")
    
    if avg_semantic < 0.7:
        print(f"   🎯 SEMANTIC ACCURACY IMPROVEMENT NEEDED:")
        print(f"      - Use domain-specific embeddings")
        print(f"      - Add legal terminology preprocessing")
        print(f"      - Implement reranking with legal-specific scoring")
    
    # Best and worst performing cases
    best_case = max(evaluation_results, key=lambda x: x['answer_metrics']['key_points_coverage'])
    worst_case = min(evaluation_results, key=lambda x: x['answer_metrics']['key_points_coverage'])
    
    print(f"\n🏆 BEST PERFORMING CASE:")
    print(f"   Question: {best_case['question']}")
    print(f"   Key Coverage: {best_case['answer_metrics']['key_points_coverage']:.3f}")
    print(f"   Domain: {best_case['domain']} | Difficulty: {best_case['difficulty']}")
    
    print(f"\n🎯 NEEDS IMPROVEMENT:")
    print(f"   Question: {worst_case['question']}")
    print(f"   Key Coverage: {worst_case['answer_metrics']['key_points_coverage']:.3f}")
    print(f"   Domain: {worst_case['domain']} | Difficulty: {worst_case['difficulty']}")

print(f"\n✅ RAG Quality Evaluation Complete!")
print(f"📁 Results saved in evaluation_results variable for further analysis.")



📊 EVALUATION SUMMARY & RECOMMENDATIONS
🎯 OVERALL RAG SYSTEM GRADE: 29.3/100
   Status: POOR ⭐⭐
   🚨 Your RAG system needs significant improvement.

📈 COMPONENT SCORES:
   Retrieval Quality:  36.0/100
   Answer Quality:     0.0/100
   Semantic Accuracy:  61.7/100

🔧 IMPROVEMENT RECOMMENDATIONS:
   🔍 RETRIEVAL IMPROVEMENT NEEDED:
      - Consider improving embedding model or fine-tuning
      - Experiment with different chunk sizes
      - Add domain-specific preprocessing
      - Implement hybrid search (dense + sparse)
   📝 ANSWER GENERATION IMPROVEMENT NEEDED:
      - Improve prompt engineering
      - Add more context to prompts
      - Consider few-shot examples in prompts
      - Fine-tune the LLM on legal domain
   🎯 SEMANTIC ACCURACY IMPROVEMENT NEEDED:
      - Use domain-specific embeddings
      - Add legal terminology preprocessing
      - Implement reranking with legal-specific scoring

🏆 BEST PERFORMING CASE:
   Question: Mi a bűnszervezet fogalma a Btk. szerint?
   Key Cov

In [10]:
# Detailed case-by-case analysis
print(f"\n📋 DETAILED CASE-BY-CASE ANALYSIS")
print("=" * 70)

for result in evaluation_results:
    print(f"\n🔍 Test Case {result['test_case']}: {result['domain']} ({result['difficulty']})")
    print(f"❓ Question: {result['question']}")
    
    # Retrieval analysis
    ret_metrics = result['retrieval_metrics']
    print(f"\n📊 Retrieval Performance:")
    print(f"   Precision@5: {ret_metrics['precision_at_k']:.3f}")
    print(f"   Recall@5: {ret_metrics['recall_at_k']:.3f}")
    print(f"   MRR: {ret_metrics['mrr']:.3f}")
    print(f"   Relevant Found: {ret_metrics['relevant_found']}/{ret_metrics['total_relevant']}")
    print(f"   Retrieved Docs: {result['retrieved_docs']}")
    
    # Answer analysis
    ans_metrics = result['answer_metrics']
    print(f"\n📝 Answer Quality:")
    print(f"   Key Points Coverage: {ans_metrics['key_points_coverage']:.3f}")
    print(f"   Found Key Points: {ans_metrics['key_points_found']}")
    print(f"   Legal Terms Used: {ans_metrics['legal_terms_used']}")
    print(f"   Has Structure: {ans_metrics['has_structured_response']}")
    print(f"   Has Citations: {ans_metrics['has_citations']} ({ans_metrics['citation_count']} citations)")
    print(f"   Word Count: {ans_metrics['word_count']}")
    print(f"   Hungarian Quality: {ans_metrics['hungarian_quality_score']:.3f}")
    print(f"   Semantic Similarity: {result['semantic_similarity']:.3f}")
    
    # Answer preview
    print(f"\n💬 Generated Answer (first 300 chars):")
    print(f"   {result['answer'][:300]}...")
    
    print("-" * 60)



📋 DETAILED CASE-BY-CASE ANALYSIS

🔍 Test Case 1: büntetőjog (medium)
❓ Question: Mi a bűnszervezet fogalma a Btk. szerint?

📊 Retrieval Performance:
   Precision@5: 0.200
   Recall@5: 0.500
   MRR: 0.333
   Relevant Found: 1/2
   Retrieved Docs: ['B.405/2015/56', 'M.8/2019/37', 'Pf.20069/2017/3', 'Kb.5/2008/42', 'P.20961/2011/3']

📝 Answer Quality:
   Key Points Coverage: 0.000
   Found Key Points: []
   Legal Terms Used: []
   Has Structure: False
   Has Citations: False (0 citations)
   Word Count: 9
   Hungarian Quality: 0.167
   Semantic Similarity: 0.591

💬 Generated Answer (first 300 chars):
   A megadott dokumentumok alapján a kérdés nem válaszolható meg....
------------------------------------------------------------

🔍 Test Case 2: büntetőjog (hard)
❓ Question: Milyen feltételei vannak a bűnszervezetben való részvételnek?

📊 Retrieval Performance:
   Precision@5: 0.400
   Recall@5: 0.667
   MRR: 1.000
   Relevant Found: 2/3
   Retrieved Docs: ['B.405/2015/56', 'M.8/2019/37', 

In [11]:
# Save evaluation results for future reference
import json
from datetime import datetime

# Prepare results for JSON serialization
serializable_results = []
for result in evaluation_results:
    serializable_result = {
        'timestamp': datetime.now().isoformat(),
        'test_case': result['test_case'],
        'question': result['question'],
        'domain': result['domain'],
        'difficulty': result['difficulty'],
        'answer': result['answer'],
        'retrieval_metrics': result['retrieval_metrics'],
        'answer_metrics': {
            k: v for k, v in result['answer_metrics'].items() 
            if k not in ['key_points_found', 'legal_terms_used']  # Remove lists for JSON
        },
        'semantic_similarity': result['semantic_similarity'],
        'retrieved_docs': result['retrieved_docs']
    }
    serializable_results.append(serializable_result)

# Save to file
output_file = "data/evaluation_results.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(serializable_results, f, ensure_ascii=False, indent=2)

print(f"💾 Evaluation results saved to: {output_file}")

# Generate a quick CSV summary for easier analysis
summary_data = []
for result in evaluation_results:
    summary_data.append({
        'question': result['question'][:50] + "...",
        'domain': result['domain'],
        'difficulty': result['difficulty'],
        'precision_at_5': result['retrieval_metrics']['precision_at_k'],
        'key_points_coverage': result['answer_metrics']['key_points_coverage'],
        'semantic_similarity': result['semantic_similarity'],
        'has_citations': result['answer_metrics']['has_citations'],
        'word_count': result['answer_metrics']['word_count']
    })

summary_df = pd.DataFrame(summary_data)
summary_file = "data/evaluation_summary.csv"
summary_df.to_csv(summary_file, index=False, encoding='utf-8')

print(f"📊 Evaluation summary saved to: {summary_file}")
print(f"\n🎯 Use these files to track RAG system improvements over time!")

# Display summary table
print(f"\n📋 EVALUATION SUMMARY TABLE:")
print(summary_df.to_string(index=False))


💾 Evaluation results saved to: data/evaluation_results.json
📊 Evaluation summary saved to: data/evaluation_summary.csv

🎯 Use these files to track RAG system improvements over time!

📋 EVALUATION SUMMARY TABLE:
                                             question      domain difficulty  precision_at_5  key_points_coverage  semantic_similarity  has_citations  word_count
         Mi a bűnszervezet fogalma a Btk. szerint?...  büntetőjog     medium             0.2                  0.0             0.590658          False           9
Milyen feltételei vannak a bűnszervezetben való ré...  büntetőjog       hard             0.4                  0.0             0.622942          False           9
   Mi a különbség az alperes és a felperes között?... polgári jog       easy             0.4                  0.0             0.702160           True         363
        Mikor alkalmazható a feltételes szabadság?...  büntetőjog     medium             0.4                  0.0             0.615080       