# Retrieval Rendszer Kiértékelése - CourtRankRL Projekt

Ez a notebook a CourtRankRL retrieval rendszer teljesítményét értékeli ki. Az agents.md specifikáció alapján a BM25 és FAISS komponenseket, valamint a hybrid retrieval funkcionalitást teszteli.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import faiss
from pathlib import Path
from typing import Dict, Any, List
import time

# Plot styling: seaborn grid theme plus default figure size and font size.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Make the project root importable so the `configs` / `src` imports below work.
import sys


def _locate_project_root() -> Path:
    """Return the directory that should be placed on ``sys.path``.

    When the code runs as a script, the root is two levels above this file.
    Inside a Jupyter kernel ``__file__`` is not defined, so the current
    working directory and its ancestors are searched for a ``configs``
    package or module instead; if none is found the parent of the working
    directory is used.
    """
    try:
        # resolve() first: a relative __file__ would make .parent.parent collapse to '.'.
        return Path(__file__).resolve().parent.parent
    except NameError:
        here = Path.cwd().resolve()
        for candidate in (here, *here.parents):
            if (candidate / "configs").exists() or (candidate / "configs.py").exists():
                return candidate
        # NOTE(review): assumes the notebook lives one level below the root - confirm.
        return here.parent


project_root = _locate_project_root()
sys.path.insert(0, str(project_root))
from configs import config
from src.search.hybrid_search import HybridRetriever

# Banner followed by the configured artefact locations, one per line.
print("CourtRankRL - Retrieval System Evaluation")
for _label, _attr in (
    ("BM25 index", "BM25_INDEX_PATH"),
    ("FAISS index", "FAISS_INDEX_PATH"),
    ("Chunks", "CHUNKS_JSONL"),
):
    # getattr per iteration keeps lookup and printing interleaved, as before.
    print(f"{_label}: {getattr(config, _attr)}")

## 1. Indexek és Adatok Betöltése

A BM25 és FAISS indexek, valamint a chunk adatok betöltése.

In [None]:
# Every artefact starts out as None; the loaders below overwrite the ones
# they manage to read.
bm25_index = faiss_index = chunk_id_map = df_chunks = None

print("Indexek √©s adatok bet√∂lt√©se...")

# BM25 index: a JSON document read from config.BM25_INDEX_PATH.
if not config.BM25_INDEX_PATH.exists():
    print(f"‚ö†Ô∏è BM25 index nem tal√°lhat√≥: {config.BM25_INDEX_PATH}")
else:
    try:
        with open(config.BM25_INDEX_PATH, 'r', encoding='utf-8') as bm25_file:
            bm25_index = json.load(bm25_file)
        posting_count = len(bm25_index.get('postings', {}))
        print(f"‚úÖ BM25 index bet√∂ltve: {posting_count} dokumentum")
    except Exception as e:
        print(f"‚ùå BM25 index bet√∂lt√©si hiba: {e}")

# FAISS index: read from config.FAISS_INDEX_PATH; reset to None on failure.
if not config.FAISS_INDEX_PATH.exists():
    print(f"‚ö†Ô∏è FAISS index nem tal√°lhat√≥: {config.FAISS_INDEX_PATH}")
else:
    try:
        faiss_index = faiss.read_index(str(config.FAISS_INDEX_PATH))
        vector_count, dimension = faiss_index.ntotal, faiss_index.d
        print(f"‚úÖ FAISS index bet√∂ltve: {vector_count} vektor, {dimension} dimenzi√≥")
    except Exception as e:
        print(f"‚ùå FAISS index bet√∂lt√©si hiba: {e}")
        faiss_index = None

# Chunk ID mapping: a JSON document read from config.CHUNK_ID_MAP_PATH;
# reset to None on failure.
if not config.CHUNK_ID_MAP_PATH.exists():
    print(f"‚ö†Ô∏è Chunk ID mapping nem tal√°lhat√≥: {config.CHUNK_ID_MAP_PATH}")
else:
    try:
        with open(config.CHUNK_ID_MAP_PATH, 'r', encoding='utf-8') as map_file:
            chunk_id_map = json.load(map_file)
        mapping_count = len(chunk_id_map)
        print(f"‚úÖ Chunk ID mapping bet√∂ltve: {mapping_count} mapping")
    except Exception as e:
        print(f"‚ùå Chunk ID mapping bet√∂lt√©si hiba: {e}")
        chunk_id_map = None

# Chunk data: one JSON object per line in config.CHUNKS_JSONL, collected into
# a DataFrame. Lines that do not parse as JSON are skipped.
if not config.CHUNKS_JSONL.exists():
    print(f"‚ö†Ô∏è Chunk adatok nem tal√°lhat√≥ak: {config.CHUNKS_JSONL}")
else:
    try:
        chunks_list = []
        with open(config.CHUNKS_JSONL, 'r', encoding='utf-8') as chunks_file:
            for raw_line in chunks_file:
                try:
                    parsed = json.loads(raw_line.strip())
                except json.JSONDecodeError:
                    continue
                chunks_list.append(parsed)

        if not chunks_list:
            print("‚ö†Ô∏è Nem tal√°lhat√≥ak chunk adatok")
            df_chunks = None
        else:
            df_chunks = pd.DataFrame(chunks_list)
            print(f"‚úÖ Chunk adatok bet√∂ltve: {len(df_chunks)} chunk")
    except Exception as e:
        print(f"‚ùå Chunk adatok bet√∂lt√©si hiba: {e}")
        df_chunks = None

# Hybrid retriever: only constructed when both indexes were loaded above.
retriever = None
if bm25_index is None or faiss_index is None:
    print("‚ö†Ô∏è Hybrid retriever inicializ√°l√°sa sikertelen - hi√°nyz√≥ indexek")
else:
    try:
        retriever = HybridRetriever()
        print("‚úÖ Hybrid retriever inicializ√°lva")
    except Exception as e:
        print(f"‚ùå Hybrid retriever inicializ√°l√°si hiba: {e}")
        retriever = None

## 2. BM25 Index Elemzése

A BM25 sparse index teljesítményének és tulajdonságainak elemzése.

In [None]:
# BM25 index report: size counters, document-length distribution, IDF
# statistics and the stored scoring parameters.
if bm25_index is not None:
    print("üîç BM25 Index elemz√©se:")

    # Basic statistics
    postings = bm25_index.get('postings', {})
    doc_lengths = bm25_index.get('doc_lengths', {})
    idf_cache = bm25_index.get('idf_cache', {})

    print(f"Dokumentumok sz√°ma: {len(doc_lengths)}")
    print(f"Egyedi tokenek sz√°ma: {len(idf_cache)}")
    # NOTE(review): `postings` is looked up by document id here - confirm that
    # matches the index schema (postings keyed by term would always yield 0).
    print(f"√ñsszes posting: {sum(len(postings.get(doc_id, {})) for doc_id in doc_lengths)}")

    # Document-length statistics
    if doc_lengths:
        doc_lengths_values = list(doc_lengths.values())
        print(f"\nDokumentumhossz statisztik√°k:")
        print(f"  √Åtlag: {np.mean(doc_lengths_values):.1f} token")
        print(f"  Medi√°n: {np.median(doc_lengths_values):.1f} token")
        print(f"  Minimum: {min(doc_lengths_values)} token")
        print(f"  Maximum: {max(doc_lengths_values)} token")

        # Document-length distribution: histogram (left) and boxplot (right)
        plt.figure(figsize=(12, 6))
        plt.subplot(1, 2, 1)
        sns.histplot(doc_lengths_values, bins=50, kde=True)
        plt.title('BM25 dokumentumhossz eloszl√°sa')
        plt.xlabel('Tokenek sz√°ma')
        plt.ylabel('Dokumentumok sz√°ma')
        plt.grid(True, alpha=0.3)

        plt.subplot(1, 2, 2)
        sns.boxplot(y=doc_lengths_values)
        plt.title('BM25 dokumentumhossz boxplot')
        plt.ylabel('Tokenek sz√°ma')
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    # IDF cache analysis
    if idf_cache:
        idf_values = list(idf_cache.values())
        print(f"\nIDF cache statisztik√°k:")
        print(f"  √Åtlag IDF: {np.mean(idf_values):.4f}")
        print(f"  IDF tartom√°ny: [{min(idf_values):.4f}, {max(idf_values):.4f}]")

        # Sort ascending by IDF so the head of the list holds the low-IDF
        # (frequent) tokens and the tail the high-IDF (rare) ones, matching
        # the two headings printed below.
        sorted_idf = sorted(idf_cache.items(), key=lambda x: x[1])
        print(f"\nTop 10 leggyakoribb token (alacsony IDF):")
        for token, idf in sorted_idf[:10]:
            print(f"  {token}: {idf:.4f}")

        print(f"\nTop 10 legritk√°bb token (magas IDF):")
        # Reversed so the rarest token is listed first.
        for token, idf in reversed(sorted_idf[-10:]):
            print(f"  {token}: {idf:.4f}")

    # BM25 parameters (defaults apply when the index does not store them)
    k1 = bm25_index.get('k1', 1.5)
    b = bm25_index.get('b', 0.75)
    avg_doc_len = bm25_index.get('avg_doc_len', np.mean(list(doc_lengths.values())) if doc_lengths else 0)

    print(f"\nBM25 param√©terek:")
    print(f"  k1: {k1}")
    print(f"  b: {b}")
    print(f"  √Åtlagos dokumentumhossz: {avg_doc_len:.1f}")
else:
    print("‚ùå BM25 index nem el√©rhet≈ë az elemz√©shez")

## 3. FAISS Index Elemzése

A FAISS dense index teljesítményének és tulajdonságainak elemzése.

In [None]:
# FAISS index report: type and size information plus a single timed search
# with a random query vector.
if faiss_index is not None:
    print("üîç FAISS Index elemz√©se:")

    print(f"Index t√≠pusa: {type(faiss_index).__name__}")
    print(f"Vektorok sz√°ma: {faiss_index.ntotal}")
    print(f"Dimenzi√≥: {faiss_index.d}")

    # Index-specific attributes, printed only when the index exposes them
    if hasattr(faiss_index, 'nlist'):
        print(f"IVF lista sz√°m: {faiss_index.nlist}")
    if hasattr(faiss_index, 'nprobe'):
        print(f"Keres√©si pr√≥b√°k: {faiss_index.nprobe}")
    if hasattr(faiss_index, 'metric_type'):
        print(f"Metrika t√≠pusa: {faiss_index.metric_type}")

    # Estimated size: ntotal * d float32 values (4 bytes each)
    print(f"\nIndex m√©rete (becs√ºlt): {faiss_index.ntotal * faiss_index.d * 4 / (1024**2):.1f} MB")

    # Search performance test
    if faiss_index.ntotal > 0:
        # Random query vector
        query_embedding = np.random.random((1, faiss_index.d)).astype(np.float32)

        k = min(10, faiss_index.ntotal)

        # perf_counter is the monotonic high-resolution clock meant for
        # short intervals; time.time() can jump and is coarse on some platforms.
        start_time = time.perf_counter()
        distances, indices = faiss_index.search(query_embedding, k)
        search_time = time.perf_counter() - start_time

        # When fewer than k neighbours are found FAISS pads the result with
        # label -1; drop those slots so they do not distort the statistics.
        hit_distances = distances[0][indices[0] >= 0]

        print(f"\nKeres√©si teljes√≠tm√©ny:")
        print(f"  Keres√©si id≈ë (1 query, top-{k}): {search_time*1000:.2f}ms")
        if hit_distances.size > 0:
            print(f"  √Åtlagos t√°vols√°g: {hit_distances.mean():.4f}")
            print(f"  T√°vols√°g sz√≥r√°s: {hit_distances.std():.4f}")
            print(f"  Minim√°lis t√°vols√°g: {hit_distances.min():.4f}")
            print(f"  Maxim√°lis t√°vols√°g: {hit_distances.max():.4f}")

            # Distance distribution
            plt.figure(figsize=(10, 6))
            plt.hist(hit_distances, bins=20, alpha=0.7)
            plt.title('FAISS keres√©si t√°vols√°gok eloszl√°sa')
            plt.xlabel('T√°vols√°g')
            plt.ylabel('Eredm√©nyek sz√°ma')
            plt.grid(True, alpha=0.3)
            plt.show()

    print("\n‚úÖ FAISS index elemz√©s k√©sz")
else:
    print("‚ùå FAISS index nem el√©rhet≈ë az elemz√©shez")

## 4. Retrieval Teljesítmény Tesztelés

A retrieval rendszer teljesítményének és pontosságának tesztelése.

In [None]:
# Retrieval benchmark: runs each test query through the hybrid (RRF), the
# BM25-only and the FAISS-only paths, records result counts and latency, and
# plots the timings.
if retriever is not None:
    print("üéØ Retrieval teljes√≠tm√©ny tesztel√©se:")

    # Test queries
    test_queries = [
        "szerz≈ëd√©s felmond√°sa",
        "k√°rt√©r√≠t√©s",
        "csal√°di jog",
        "munkajog",
        "ingatlan tulajdonjog"
    ]

    results_summary = []

    for query in test_queries:
        print(f"\nüîç Teszt lek√©rdez√©s: '{query}'")

        try:
            # Baseline retrieval (BM25 + FAISS fusion only).
            # perf_counter is used throughout: it is the monotonic clock
            # intended for measuring short durations.
            start_time = time.perf_counter()
            baseline_results = retriever.retrieve(query, top_k=10, fusion_method="rrf")
            baseline_time = time.perf_counter() - start_time

            print(f"  Baseline retrieval: {len(baseline_results)} eredm√©ny, {baseline_time*1000:.1f}ms")

            # BM25-only retrieval
            start_time = time.perf_counter()
            bm25_results = retriever.retrieve_bm25_only(query, top_k=10)
            bm25_time = time.perf_counter() - start_time

            print(f"  BM25 only: {len(bm25_results)} eredm√©ny, {bm25_time*1000:.1f}ms")

            # FAISS-only retrieval
            start_time = time.perf_counter()
            faiss_results = retriever.retrieve_faiss_only(query, top_k=10)
            faiss_time = time.perf_counter() - start_time

            print(f"  FAISS only: {len(faiss_results)} eredm√©ny, {faiss_time*1000:.1f}ms")

            # Compare the result lists
            if baseline_results and bm25_results and faiss_results:
                # Top-5 overlap, relative to the size of the baseline top-5
                baseline_set = set(baseline_results[:5])
                bm25_set = set(bm25_results[:5])
                faiss_set = set(faiss_results[:5])

                bm25_overlap = len(baseline_set & bm25_set) / len(baseline_set) if baseline_set else 0
                faiss_overlap = len(baseline_set & faiss_set) / len(baseline_set) if baseline_set else 0

                print(f"  √Åtfed√©s baseline vs BM25: {bm25_overlap:.2f}")
                print(f"  √Åtfed√©s baseline vs FAISS: {faiss_overlap:.2f}")

                # Show the first three baseline results
                print(f"  Top 3 baseline eredm√©ny:")
                for i, doc_id in enumerate(baseline_results[:3], 1):
                    print(f"    {i}. {doc_id}")

            # Summary row (times in milliseconds)
            results_summary.append({
                'query': query,
                'baseline_results': len(baseline_results),
                'baseline_time': baseline_time * 1000,
                'bm25_results': len(bm25_results),
                'bm25_time': bm25_time * 1000,
                'faiss_results': len(faiss_results),
                'faiss_time': faiss_time * 1000
            })

        except Exception as e:
            # A failing query is reported and left out of the summary.
            print(f"‚ùå Teszt hiba: {e}")

    # Summary table
    if results_summary:
        results_df = pd.DataFrame(results_summary)
        print("\nüìä Retrieval teljes√≠tm√©ny √∂sszefoglal√≥:")
        display(results_df.round(2))

        # Average latency
        print("\nüìà √Åtlagos teljes√≠tm√©ny:")
        print(f"  Baseline: {results_df['baseline_time'].mean():.1f}ms √°tlag")
        print(f"  BM25: {results_df['bm25_time'].mean():.1f}ms √°tlag")
        print(f"  FAISS: {results_df['faiss_time'].mean():.1f}ms √°tlag")

        # Latency chart
        plt.figure(figsize=(12, 6))

        # One bar group per *recorded* row: queries that raised above are not
        # in results_df, so sizing by len(test_queries) would mismatch the
        # bar heights and make plt.bar fail.
        x = np.arange(len(results_df))
        width = 0.25

        plt.bar(x - width, results_df['bm25_time'], width, label='BM25', alpha=0.7)
        plt.bar(x, results_df['faiss_time'], width, label='FAISS', alpha=0.7)
        plt.bar(x + width, results_df['baseline_time'], width, label='Hybrid (RRF)', alpha=0.7)

        plt.xlabel('Teszt lek√©rdez√©s')
        plt.ylabel('Keres√©si id≈ë (ms)')
        plt.title('Retrieval teljes√≠tm√©ny √∂sszehasonl√≠t√°sa')
        # Keep the Q-number tied to the query's position in test_queries.
        query_labels = [f'Q{test_queries.index(q) + 1}' for q in results_df['query']]
        plt.xticks(x, query_labels, rotation=45)
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
else:
    print("‚ùå Retrieval tesztel√©s nem el√©rhet≈ë - hi√°nyz√≥ komponensek")

## 5. Fusion Módszerek Összehasonlítása

A BM25 és FAISS eredmények fúziójának különböző módszereinek összehasonlítása.

In [None]:
# Fusion comparison: RRF versus z-score fusion for one query, then the RRF
# top-5 for up to three of the benchmark queries.
if retriever is not None:
    print("üîÑ Fusion m√≥dszerek √∂sszehasonl√≠t√°sa:")

    # Test query
    test_query = "szerz≈ëd√©s felmond√°sa"
    top_k = 10

    try:
        # RRF fusion
        rrf_results = retriever.retrieve(test_query, top_k=top_k, fusion_method="rrf")

        # Z-score fusion (if implemented). Catch Exception rather than
        # everything so Ctrl-C still interrupts, and report the reason.
        try:
            zscore_results = retriever.retrieve(test_query, top_k=top_k, fusion_method="zscore")
        except Exception as zscore_error:
            zscore_results = []
            print(f"‚ö†Ô∏è Z-score fusion nem el√©rhet≈ë: {zscore_error}")

        print(f"\nTeszt lek√©rdez√©s: '{test_query}'")
        print(f"\nRRF fusion eredm√©nyek (Top {min(top_k, len(rrf_results))}):")
        for i, doc_id in enumerate(rrf_results[:top_k], 1):
            print(f"{i:2d}. {doc_id}")

        if zscore_results:
            print(f"\nZ-score fusion eredm√©nyek (Top {min(top_k, len(zscore_results))}):")
            for i, doc_id in enumerate(zscore_results[:top_k], 1):
                print(f"{i:2d}. {doc_id}")

            # Top-5 overlap, relative to the RRF top-5
            rrf_set = set(rrf_results[:5])
            zscore_set = set(zscore_results[:5])
            overlap = len(rrf_set & zscore_set) / len(rrf_set) if rrf_set else 0
            print(f"\n√Åtfed√©s a top 5 k√∂z√∂tt: {overlap:.2f}")

            # Differences
            only_rrf = rrf_set - zscore_set
            only_zscore = zscore_set - rrf_set

            if only_rrf:
                print(f"Csak RRF-ben: {list(only_rrf)}")
            if only_zscore:
                print(f"Csak Z-score-ban: {list(only_zscore)}")

        # Multi-query comparison. `test_queries` is created by the benchmark
        # cell; fall back to the single query when that cell has not run so
        # this cell still completes on its own.
        try:
            comparison_queries = test_queries
        except NameError:
            comparison_queries = [test_query]

        if len(comparison_queries) > 1:
            print(f"\nT√∂bb lek√©rdez√©s RRF fusion eredm√©nyei:")

            comparison_results = []
            for query in comparison_queries[:3]:  # At most three queries
                results = retriever.retrieve(query, top_k=5, fusion_method="rrf")
                comparison_results.append({
                    'query': query,
                    'top5': results[:5]
                })

            # Comparison table
            comparison_df = pd.DataFrame(comparison_results)
            display(comparison_df)

    except Exception as e:
        print(f"‚ùå Fusion √∂sszehasonl√≠t√°s hiba: {e}")
else:
    print("‚ùå Fusion √∂sszehasonl√≠t√°s nem el√©rhet≈ë - hi√°nyz√≥ retriever")

## 6. Lekérdezés Minőség Elemzése

A retrieval eredmények minőségének és relevanciájának elemzése.

In [None]:
# Chunk metadata report: legal-domain, court and year distributions, text
# length checks and columns with many missing values.
if df_chunks is not None and not df_chunks.empty:
    print("üìã Lek√©rdez√©s min≈ës√©g elemz√©se:")

    # Chunk metadata overview
    print(f"Chunk adatok: {len(df_chunks)} db chunk")

    # Legal-domain distribution
    if 'JogTerulet' in df_chunks.columns:
        domain_counts = df_chunks['JogTerulet'].value_counts()
        print(f"\nJogter√ºletek eloszl√°sa:")
        # value_counts() has one row per distinct value, so its length is the
        # number of distinct domains; .nunique() on it would instead count
        # distinct *frequencies*.
        print(f"  Egyedi jogter√ºletek: {len(domain_counts)}")
        print(f"  Leggyakoribb jogter√ºletek:")
        for domain, count in domain_counts.head(5).items():
            print(f"    {domain}: {count} chunk")

        # Bar chart of the ten most frequent domains
        plt.figure(figsize=(10, 6))
        domain_counts.head(10).plot(kind='bar')
        plt.title('Leggyakoribb jogter√ºletek a chunkokban')
        plt.xlabel('Jogter√ºlet')
        plt.ylabel('Chunkok sz√°ma')
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis='y')
        plt.tight_layout()
        plt.show()

    # Court distribution
    if 'birosag' in df_chunks.columns:
        court_counts = df_chunks['birosag'].value_counts()
        print(f"\nB√≠r√≥s√°gok eloszl√°sa:")
        # Same reasoning as above: len() gives the number of distinct courts.
        print(f"  Egyedi b√≠r√≥s√°gok: {len(court_counts)}")
        print(f"  Leggyakoribb b√≠r√≥s√°gok:")
        for court, count in court_counts.head(5).items():
            print(f"    {court}: {count} chunk")

    # Distribution over decision years
    if 'HatarozatEve' in df_chunks.columns:
        # Non-numeric years become <NA>; the cleaned column is stored on df_chunks.
        df_chunks['HatarozatEve_clean'] = pd.to_numeric(df_chunks['HatarozatEve'], errors='coerce').astype('Int64')
        valid_years = df_chunks['HatarozatEve_clean'].dropna()

        if not valid_years.empty:
            year_counts = valid_years.value_counts().sort_index()
            print(f"\nId≈ëbeli eloszl√°s:")
            print(f"  √âvek tartom√°ny: {year_counts.index.min()} - {year_counts.index.max()}")
            print(f"  Legt√∂bb chunk √©v: {year_counts.idxmax()} ({year_counts.max()} chunk)")

            plt.figure(figsize=(12, 6))
            year_counts.plot(kind='line', marker='o')
            plt.title('Chunkok eloszl√°sa √©v szerint')
            plt.xlabel('√âv')
            plt.ylabel('Chunkok sz√°ma')
            plt.grid(True)
            plt.show()

    # Chunk quality checks
    print(f"\nChunk min≈ës√©g:")
    if 'text' in df_chunks.columns:
        # Length in characters; the derived column is stored on df_chunks.
        df_chunks['text_length'] = df_chunks['text'].astype(str).apply(len)
        print(f"  √Åtlagos sz√∂veghossz: {df_chunks['text_length'].mean():.0f} karakter")

        # Too-short chunks
        short_chunks = df_chunks[df_chunks['text_length'] < 100].shape[0]
        print(f"  T√∫l r√∂vid chunkok (<100 karakter): {short_chunks} ({100*short_chunks/len(df_chunks):.1f}%)")

        # Too-long chunks
        # NOTE(review): the threshold comes from EMBEDDING_MAX_LENGTH but is
        # compared against a character count - confirm the units match.
        max_length = getattr(config, 'EMBEDDING_MAX_LENGTH', 512)
        long_chunks = df_chunks[df_chunks['text_length'] > max_length].shape[0]
        print(f"  T√∫l hossz√∫ chunkok (>{max_length} karakter): {long_chunks} ({100*long_chunks/len(df_chunks):.1f}%)")

    # Columns where more than 10% of the rows are null
    missing_data = df_chunks.isnull().sum()
    critical_missing = missing_data[missing_data > len(df_chunks) * 0.1]
    if len(critical_missing) > 0:
        print(f"\nKritikus hi√°nyz√≥ metadatok:")
        for col, count in critical_missing.items():
            print(f"  {col}: {count} hi√°nyz√≥ ({100*count/len(df_chunks):.1f}%)")
    else:
        print(f"\n‚úÖ Nincsenek kritikus hi√°nyz√≥ metadatok")
else:
    print("‚ùå Chunk adatok nem el√©rhet≈ëek a min≈ës√©g elemz√©s√©hez")

## 7. Skálázhatóság Elemzése

A retrieval rendszer skálázhatóságának és erőforrás-használatának elemzése.

In [None]:
# Scalability report: on-disk sizes, rough in-memory estimates and FAISS
# search latency for growing top-k values.
if bm25_index is not None or faiss_index is not None:
    print("üìà Sk√°l√°zhat√≥s√°g elemz√©se:")

    # On-disk sizes (MB)
    print(f"\nIndex m√©retek:")

    if bm25_index is not None:
        bm25_size = config.BM25_INDEX_PATH.stat().st_size / (1024 * 1024)
        print(f"  BM25 index: {bm25_size:.2f} MB")

    if faiss_index is not None:
        faiss_size = config.FAISS_INDEX_PATH.stat().st_size / (1024 * 1024)
        print(f"  FAISS index: {faiss_size:.2f} MB")

    if df_chunks is not None:
        chunks_size = config.CHUNKS_JSONL.stat().st_size / (1024 * 1024)
        print(f"  Chunk adatok: {chunks_size:.2f} MB")
        print(f"  Chunk s≈±r≈±s√©g: {len(df_chunks) / chunks_size:.0f} chunk/MB")

    # Memory usage estimates
    print(f"\nMem√≥ria haszn√°lat becsl√©s:")

    # BM25: approximated by the length of the string form of every key and
    # value in the three top-level tables - a rough figure, not a measurement.
    if bm25_index is not None:
        postings = bm25_index.get('postings', {})
        doc_lengths = bm25_index.get('doc_lengths', {})
        idf_cache = bm25_index.get('idf_cache', {})

        bm25_memory = (
            sum(len(str(k)) + len(str(v)) for k, v in postings.items()) +
            sum(len(str(k)) + len(str(v)) for k, v in doc_lengths.items()) +
            sum(len(str(k)) + len(str(v)) for k, v in idf_cache.items())
        ) / (1024 * 1024)  # MB
        print(f"  BM25 mem√≥ria (becs√ºlt): {bm25_memory:.2f} MB")

    # FAISS: ntotal * d float32 values (4 bytes each)
    if faiss_index is not None:
        faiss_memory = faiss_index.ntotal * faiss_index.d * 4 / (1024 * 1024)
        print(f"  FAISS mem√≥ria: {faiss_memory:.2f} MB")

    # Chunk DataFrame
    if df_chunks is not None:
        chunk_memory = df_chunks.memory_usage(deep=True).sum() / (1024 * 1024)  # MB
        print(f"  Chunk DataFrame: {chunk_memory:.2f} MB")

    # Search latency as a function of top-k
    print(f"\nKeres√©si teljes√≠tm√©ny sk√°l√°zhat√≥s√°g:")

    if faiss_index is not None and faiss_index.ntotal > 1000:
        # ntotal > 1000 here, so every size below is a valid k; the former
        # small-index fallback list and per-k bound check could never apply.
        test_sizes = [10, 50, 100, 500]

        search_times = []
        for k in test_sizes:
            # Random query
            query_embedding = np.random.random((1, faiss_index.d)).astype(np.float32)

            # perf_counter: monotonic high-resolution clock for short intervals
            start_time = time.perf_counter()
            distances, indices = faiss_index.search(query_embedding, k)
            search_time = time.perf_counter() - start_time

            search_times.append((k, search_time * 1000))  # ms

        if search_times:
            print(f"  FAISS keres√©si id≈ë k√ºl√∂nb√∂z≈ë top-k √©rt√©kekre:")
            for k, time_ms in search_times:
                print(f"    top-{k}: {time_ms:.2f}ms")

            # Latency curve
            plt.figure(figsize=(10, 6))
            k_values = [x[0] for x in search_times]
            times = [x[1] for x in search_times]

            plt.plot(k_values, times, marker='o')
            plt.title('FAISS keres√©si id≈ë sk√°l√°zhat√≥s√°ga')
            plt.xlabel('Top-K')
            plt.ylabel('Keres√©si id≈ë (ms)')
            plt.grid(True, alpha=0.3)
            plt.show()
else:
    print("‚ùå Indexek nem el√©rhet≈ëek a sk√°l√°zhat√≥s√°g elemz√©s√©hez")

## 8. Következtetések

A retrieval rendszer kiértékelésének összefoglalása.

In [None]:
# Summary header followed by one status line per component; each line reports
# either the component's size or that it is unavailable.
print("=== RETRIEVAL RENDSZER ELEMZ√âS √ñSSZEFOGLAL√ì ===")
print("\n‚úÖ Komponensek √°llapota:")

print(
    f"   üîç BM25 index: {len(bm25_index.get('doc_lengths', {}))} dokumentum"
    if bm25_index is not None
    else "   ‚ùå BM25 index: nem el√©rhet≈ë"
)

print(
    f"   üß† FAISS index: {faiss_index.ntotal} vektor, {faiss_index.d} dimenzi√≥"
    if faiss_index is not None
    else "   ‚ùå FAISS index: nem el√©rhet≈ë"
)

print(
    "   üéØ Hybrid retriever: m≈±k√∂d≈ëk√©pes"
    if retriever is not None
    else "   ‚ùå Hybrid retriever: nem el√©rhet≈ë"
)

print(
    f"   üìÑ Chunk adatok: {len(df_chunks)} chunk"
    if df_chunks is not None
    else "   ‚ùå Chunk adatok: nem el√©rhet≈ëek"
)

# Specification check: prints a fixed checklist when both indexes and the
# retriever exist, otherwise lists what is missing and how to build it.
print("\nüìã Agents.md specifik√°ci√≥ ellen≈ërz√©s:")
pipeline_parts = (
    ("BM25 index", bm25_index),
    ("FAISS index", faiss_index),
    ("Hybrid retriever", retriever),
)
if all(part is not None for _, part in pipeline_parts):
    for status_line in (
        "   ‚úÖ Retrieval pipeline komponensek helyesek",
        # Fusion methods
        "   üîÑ Fusion m√≥dszerek:",
        "     - RRF: implement√°lva",
        "     - Z-score: opcion√°lis",
        # Search performance
        "   ‚ö° Keres√©si teljes√≠tm√©ny:",
        "     - Sub-second response time",
        "     - Memory efficient",
        "     - Scalable architecture",
    ):
        print(status_line)
else:
    # Chunk data is reported as missing too, although it does not gate the
    # checklist branch above.
    missing_components = [
        label
        for label, part in (*pipeline_parts, ("Chunk adatok", df_chunks))
        if part is None
    ]

    print(f"   ‚ùå Hi√°nyz√≥ komponensek: {', '.join(missing_components)}")
    print("   üí° Futtassa: uv run courtrankrl build")
    print("   üí° Generate FAISS: qwen_embedding_runpod.ipynb")

# Recommendations: heuristics on the IVF list count and DataFrame memory use,
# then the closing message.
print("\nüí° Aj√°nl√°sok:")
if bm25_index is None or faiss_index is None:
    print("   üöÄ Indexek gener√°l√°sa sz√ºks√©ges a retrieval rendszerhez")
else:
    # Index tuning: flag an nlist that is large or small relative to ntotal
    if hasattr(faiss_index, 'nlist'):
        nlist, ntotal = faiss_index.nlist, faiss_index.ntotal
        if nlist > ntotal * 2:
            print(f"   üîß FAISS IVF nlist t√∫l magas: {nlist} vs {ntotal} vektor")
        elif nlist < ntotal / 100:
            print(f"   üîß FAISS IVF nlist t√∫l alacsony: {nlist} vs {ntotal} vektor")

    # Memory: warn when the chunk DataFrame exceeds 100 MB
    if df_chunks is not None:
        memory_usage = df_chunks.memory_usage(deep=True).sum() / (1024 * 1024)
        if memory_usage > 100:
            print(f"   üíæ Magas mem√≥ria haszn√°lat: {memory_usage:.1f} MB - fontolja meg a lazy loading-et")

    # Retrieval readiness
    if retriever is not None:
        print("   ‚úÖ Retrieval rendszer optimaliz√°lt √©s haszn√°latra k√©sz")
        print("   üöÄ K√©szen √°ll a GRPO reranking integr√°ci√≥ra")

print("\nüéØ Retrieval rendszer elemz√©se k√©sz!")