# FAISS Embeddingek Kiértékelése - CourtRankRL Projekt

Ez a notebook a google/embeddinggemma-300m modellel generált FAISS indexben tárolt embeddingeket elemzi. Az agents.md specifikáció alapján készített kiértékelési szempontokat vizsgálja a jelenlegi adatokkal.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import json
import faiss
from pathlib import Path
from typing import List, Dict, Any

# Configure the global plot style used by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Load the project configuration
import sys
try:
    # Running as a plain script: the project root is two levels above this file.
    project_root = Path(__file__).resolve().parent.parent
except NameError:
    # Jupyter kernels do not define __file__. Fall back to the parent of the
    # working directory — assumes the notebook is started from its own folder,
    # one level below the project root (TODO confirm against the repo layout).
    project_root = Path.cwd().parent
# Make the project-level `configs` package importable
sys.path.insert(0, str(project_root))
from configs import config

print("CourtRankRL - FAISS Embedding Analysis")
# NOTE(review): the notebook text describes google/embeddinggemma-300m, but the
# model name is read from QWEN3_MODEL_NAME — confirm this is the intended key.
print(f"Embedding model: {config.QWEN3_MODEL_NAME}")
print(f"Embedding dimension: {config.EMBEDDING_DIMENSION}")

## 1. FAISS Index és Embeddingek Betöltése

A jelenlegi projektben található FAISS index és chunk ID mapping betöltése.

In [None]:
# Load the FAISS index and the chunk-ID mapping from the configured paths.
# Results are left in the notebook globals `index`, `embeddings` and
# `chunk_id_map`; each stays None when its source is missing or fails to load.
faiss_path = config.FAISS_INDEX_PATH
chunk_map_path = config.CHUNK_ID_MAP_PATH

print(f"FAISS index bet√∂lt√©se: {faiss_path}")
print(f"Chunk ID mapping bet√∂lt√©se: {chunk_map_path}")

index = None
chunk_id_map = None
embeddings = None

try:
    if faiss_path.exists():
        index = faiss.read_index(str(faiss_path))
        print(f"‚úÖ FAISS index bet√∂ltve: {index.ntotal} vektor, {index.d} dimenzi√≥")

        # Check the index dimension against the configured one
        if hasattr(index, 'd') and index.d != config.EMBEDDING_DIMENSION:
            print(f"‚ö†Ô∏è  Embedding dimenzi√≥ elt√©r√©s: index={index.d}, config={config.EMBEDDING_DIMENSION}")

        # Extract every stored vector with one batched call instead of a
        # Python-level loop over index.reconstruct(i).
        embeddings = index.reconstruct_n(0, index.ntotal)
        print(f"‚úÖ Embeddingek kivonva: {embeddings.shape}")
    else:
        print(f"‚ùå FAISS index nem tal√°lhat√≥: {faiss_path}")
        print("Futtassa a gemma_embedding_runpod.ipynb-t el≈ësz√∂r!")
except Exception as e:
    print(f"‚ùå Hiba a FAISS index bet√∂lt√©se sor√°n: {e}")
    # Reset both results together so later cells never see embeddings
    # without a matching index.
    index = None
    embeddings = None

# Load the chunk-ID mapping (JSON)
try:
    if chunk_map_path.exists():
        with open(chunk_map_path, 'r', encoding='utf-8') as f:
            chunk_id_map = json.load(f)
        print(f"‚úÖ Chunk ID map bet√∂ltve: {len(chunk_id_map)} mapping")
    else:
        print(f"‚ùå Chunk ID map nem tal√°lhat√≥: {chunk_map_path}")
except Exception as e:
    print(f"‚ùå Hiba a chunk ID map bet√∂lt√©se sor√°n: {e}")
    chunk_id_map = None

## 2. Chunk Adatok Betöltése

A chunks.jsonl fájl betöltése, hogy összekapcsoljuk az embeddingeket a szövegekkel és metaadatokkal.

In [None]:
# Load a sample of chunks from the JSONL file and attach their embeddings.
# The result is the notebook global `df` (None when nothing could be loaded).
chunks_file = config.CHUNKS_JSONL
sample_size = min(10000, index.ntotal if index else 0)  # analyse at most 10k chunks

df = None

if chunks_file.exists() and index is not None and chunk_id_map is not None:
    try:
        print(f"üìä Mintav√©tel: {sample_size} chunk bet√∂lt√©se...")

        chunks_list = []
        chunk_ids = list(chunk_id_map.values())[:sample_size]
        # Set lookup keeps the per-line membership test O(1); testing against
        # the list made the file scan O(lines * sample_size).
        wanted_ids = set(chunk_ids)

        with open(chunks_file, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    chunk = json.loads(line.strip())
                    if chunk.get('chunk_id') in wanted_ids:
                        chunks_list.append(chunk)
                except json.JSONDecodeError:
                    # Skip malformed or blank lines
                    continue
                if len(chunks_list) >= sample_size:
                    break

        if chunks_list:
            df = pd.DataFrame(chunks_list)
            print(f"‚úÖ Bet√∂lt√∂tt chunkok sz√°ma: {len(df)}")

            # Attach embeddings to the chunks via the chunk-ID mapping
            if embeddings is not None:
                try:
                    # The mapping is keyed by the stringified FAISS row number
                    embedding_dict = {chunk_id_map[str(i)]: vector for i, vector in enumerate(embeddings)}
                    df = df.assign(embedding=df['chunk_id'].map(embedding_dict))
                    valid_embeddings = df['embedding'].notna().sum()
                    print(f"‚úÖ Embeddingek hozz√°rendelve: {valid_embeddings} chunk")
                except Exception as e:
                    print(f"‚ùå Hiba az embeddingek hozz√°rendel√©se sor√°n: {e}")
            else:
                print("‚ö†Ô∏è Nincsenek embeddingek a hozz√°rendel√©shez")
        else:
            print("‚ö†Ô∏è Nem tal√°lhat√≥ak megfelel≈ë chunkok")

    except Exception as e:
        print(f"‚ùå Hiba a chunkok bet√∂lt√©se sor√°n: {e}")
        df = None
else:
    # Report exactly which prerequisite is missing
    print("‚ö†Ô∏è Hi√°nyz√≥ adatok a chunk bet√∂lt√©shez:")
    if not chunks_file.exists():
        print(f"  - Chunks f√°jl nem tal√°lhat√≥: {chunks_file}")
    if index is None:
        print("  - FAISS index")
    if chunk_id_map is None:
        print("  - Chunk ID map")
    df = None

# Sanity check of the loaded data
if df is not None and not df.empty:
    print(f"\nAdatok bet√∂ltve: {df.shape[0]} chunk, {df.shape[1]} oszlop")
    print(f"Oszlopok: {df.columns.tolist()}")
else:
    print("\n‚ùå Nincs adat az elemz√©shez")

## 3. Embedding Minőség Ellenőrzése

Az agents.md specifikáció szerint L2-normalizált embeddingek szükségesek a FAISS IP metrikához.

In [None]:
# Quality checks on the embeddings attached to `df`: type/dimension,
# L2 normalization, missing values and a sampled pairwise-distance summary.
if df is not None and 'embedding' in df.columns and embeddings is not None:
    print("üîç Embeddingek min≈ës√©gi ellen≈ërz√©se:")

    # Every statistic is computed on rows that actually carry an embedding.
    # Giving missing rows a placeholder norm of 1.0 would count them as
    # correctly normalized and skew the numbers.
    valid_embeddings = df.dropna(subset=['embedding'])
    n_valid = len(valid_embeddings)

    # Inspect the first valid vector, not blindly row 0, which may be missing
    first_embedding = valid_embeddings['embedding'].iloc[0] if n_valid > 0 else None
    if isinstance(first_embedding, np.ndarray):
        print(f"‚úÖ Embedding t√≠pusa: {type(first_embedding)}")
        print(f"‚úÖ Embedding dimenzi√≥ja: {len(first_embedding)}")
        print(f"‚úÖ Elv√°rt dimenzi√≥: {config.EMBEDDING_DIMENSION}")

        # L2 norm of each valid embedding
        norms = valid_embeddings['embedding'].apply(lambda x: np.linalg.norm(x))
        print(f"\nNorm√°k statisztik√°i:")
        print(norms.describe())

        # An embedding counts as normalized when its norm is within 0.01 of 1
        normalized_count = norms.apply(lambda x: abs(x - 1.0) < 0.01).sum()
        print(f"L2-normaliz√°lt embeddingek: {normalized_count}/{n_valid} ({100*normalized_count/n_valid:.1f}%)")

        if normalized_count < n_valid:
            print("‚ö†Ô∏è Nem minden embedding van L2-normaliz√°lva - ez probl√©m√°s lehet FAISS IP metrik√°n√°l")

        # Missing embeddings are reported separately
        missing_embeddings = df['embedding'].isna().sum()
        print(f"Hi√°nyz√≥ embeddingek: {missing_embeddings}")

        # Pairwise Euclidean distances between a random sample of embeddings
        if n_valid > 100:
            X = np.vstack(valid_embeddings['embedding'].values)

            # Sample up to 100 vectors and compare every unordered pair
            indices = np.random.choice(len(X), size=min(len(X), 100), replace=False)
            sample = X[indices]
            first_idx, second_idx = np.triu_indices(len(sample), k=1)
            distances = np.linalg.norm(sample[first_idx] - sample[second_idx], axis=1)

            print(f"\nEmbeddingek k√∂z√∂tti √°tlagos t√°vols√°g: {distances.mean():.4f} ¬± {distances.std():.4f}")
            print(f"T√°vols√°g tartom√°ny: [{distances.min():.4f}, {distances.max():.4f}]")

            # Ratio of data points to embedding dimensions
            print(f"Embedding dimenzi√≥: {X.shape[1]}")
            print(f"Adatpontok sz√°ma: {len(X)}")
            print(f"Adatpontok s≈±r≈±s√©ge: {len(X) / X.shape[1]:.2f}")
        else:
            print(f"‚ö†Ô∏è T√∫l kev√©s embedding a min≈ës√©gi metrik√°khoz: {len(valid_embeddings)}")
    else:
        print(f"‚ùå Embedding t√≠pusa nem megfelel≈ë: {type(first_embedding)}")
else:
    print("‚ùå Nincs embedding adat az elemz√©shez")

## 4. PCA Dimenziócsökkentés és Vizualizáció

Az embeddingek 2D-s leképezése PCA segítségével.

In [None]:
# Helper: scatter the 2-D PCA points coloured by a categorical column.
def _plot_pca_by_category(points, labels, title, cmap):
    """Draw a PCA scatter plot coloured by category, with a matching legend.

    Args:
        points: array with the two PCA components in columns 0 and 1.
        labels: pandas Series with one category value per point.
        title: figure title.
        cmap: matplotlib colormap name.
    """
    # Missing values get their own explicit category, so every point has a
    # non-negative code and a legend label.
    categories = labels.fillna('ismeretlen').astype('category')
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(points[:, 0], points[:, 1],
                          c=categories.cat.codes,
                          s=8, alpha=0.6, cmap=cmap)
    plt.title(title)
    plt.xlabel("F≈ëkomponens 1")
    plt.ylabel("F≈ëkomponens 2")
    # num=None yields one handle per distinct code. The default "auto" picks
    # a subset of levels once there are many categories, which would pair
    # handles with the wrong labels.
    handles, _ = scatter.legend_elements(num=None)
    plt.legend(handles=handles,
               labels=categories.cat.categories.tolist(),
               bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
    plt.show()


# Project the embeddings to 2-D with PCA and plot them, optionally coloured
# by the metadata columns present in the data.
if df is not None and 'embedding' in df.columns and embeddings is not None:
    # Drop rows without an embedding
    valid_df = df.dropna(subset=['embedding']).copy()

    if len(valid_df) > 100:  # require more than 100 embeddings for PCA
        print(f"üìä PCA elemz√©s {len(valid_df)} embeddinggel...")

        # Stack the per-row vectors into one 2-D NumPy array
        X = np.vstack(valid_df['embedding'].values)
        print(f"PCA bemenet: {X.shape}")

        # Run PCA
        pca = PCA(n_components=2)
        X_reduced = pca.fit_transform(X)

        print(f"PCA els≈ë k√©t komponens varianci√°ja: {pca.explained_variance_ratio_}")
        print(f"√ñsszes magyar√°zott variancia: {sum(pca.explained_variance_ratio_):.3f}")

        # Plain PCA scatter plot
        plt.figure(figsize=(10, 7))
        plt.scatter(X_reduced[:, 0], X_reduced[:, 1], s=2, alpha=0.6)
        plt.title("EmbeddingGemma-300m - PCA 2D lek√©pez√©s")
        plt.xlabel("F≈ëkomponens 1")
        plt.ylabel("F≈ëkomponens 2")
        plt.grid(True, alpha=0.3)
        plt.show()

        # Coloured by legal domain
        if 'JogTerulet' in valid_df.columns:
            _plot_pca_by_category(
                X_reduced, valid_df['JogTerulet'],
                "EmbeddingGemma Embeddingek jogter√ºlet szerint sz√≠nezve",
                'tab10')

        # Coloured by court
        if 'birosag' in valid_df.columns:
            _plot_pca_by_category(
                X_reduced, valid_df['birosag'],
                "EmbeddingGemma Embeddingek b√≠r√≥s√°g szerint sz√≠nezve",
                'Set3')

        # Coloured by text length (continuous scale, hence a colour bar)
        if 'karakter_szam' in valid_df.columns:
            plt.figure(figsize=(12, 8))
            scatter = plt.scatter(X_reduced[:, 0], X_reduced[:, 1],
                                c=valid_df['karakter_szam'], s=8, alpha=0.6, cmap='viridis')
            plt.title("EmbeddingGemma Embeddingek sz√∂veghossz szerint sz√≠nezve")
            plt.xlabel("F≈ëkomponens 1")
            plt.ylabel("F≈ëkomponens 2")
            plt.colorbar(scatter, label='Karakterek sz√°ma')
            plt.grid(True, alpha=0.3)
            plt.show()
    else:
        print(f"‚ö†Ô∏è T√∫l kev√©s embedding PCA-hoz: {len(valid_df)}")
else:
    print("‚ùå Nincs embedding adat a PCA-hoz")

## 5. Metaadatok és Embeddingek Kapcsolata

Az embeddingek és a jogi metaadatok közötti kapcsolat elemzése.

In [None]:
# Distribution of the metadata columns among chunks that have an embedding.
if df is not None and 'embedding' in df.columns and not df.empty:
    # Take an explicit copy: a column is added further down, and assigning
    # onto the result of dropna() triggers pandas' SettingWithCopy hazard.
    valid_df = df.dropna(subset=['embedding']).copy()

    if len(valid_df) > 0:
        print("üìä Metadatok √©s embeddingek kapcsolata:")

        # Distribution by court
        if 'birosag' in valid_df.columns:
            print(f"\nB√≠r√≥s√°gok megoszl√°sa:")
            court_counts = valid_df['birosag'].value_counts()
            print(f"Top 10 b√≠r√≥s√°g: {court_counts.head(10).to_dict()}")

            plt.figure(figsize=(10, 6))
            court_counts.head(10).plot(kind='bar')
            plt.title('Top 10 leggyakoribb b√≠r√≥s√°g az embeddingekkel rendelkez≈ë chunkokban')
            plt.xlabel('B√≠r√≥s√°g')
            plt.ylabel('Chunkok sz√°ma')
            plt.xticks(rotation=45, ha='right')
            plt.grid(axis='y')
            plt.show()

        # Distribution by legal domain
        if 'JogTerulet' in valid_df.columns:
            print(f"\nJogter√ºletek megoszl√°sa:")
            domain_counts = valid_df['JogTerulet'].value_counts()
            print(f"Top 10 jogter√ºlet: {domain_counts.head(10).to_dict()}")

            plt.figure(figsize=(10, 6))
            domain_counts.head(10).plot(kind='bar')
            plt.title('Top 10 leggyakoribb jogter√ºlet az embeddingekkel rendelkez≈ë chunkokban')
            plt.xlabel('Jogter√ºlet')
            plt.ylabel('Chunkok sz√°ma')
            plt.xticks(rotation=45, ha='right')
            plt.grid(axis='y')
            plt.show()

        # Distribution by decision year
        if 'HatarozatEve' in valid_df.columns:
            print(f"\nHat√°rozatok √©v szerinti megoszl√°sa:")
            year_counts = valid_df['HatarozatEve'].value_counts().sort_index()
            print(f"√âvek tartom√°ny: {year_counts.index.min()} - {year_counts.index.max()}")

            plt.figure(figsize=(12, 6))
            year_counts.plot(kind='line', marker='o')
            plt.title('Hat√°rozatok eloszl√°sa √©v szerint')
            plt.xlabel('√âv')
            plt.ylabel('Chunkok sz√°ma')
            plt.grid(True)
            plt.show()

        # Distribution of chunk text length (in characters)
        if 'text' in valid_df.columns:
            valid_df['text_length'] = valid_df['text'].astype(str).str.len()
            print(f"\nSz√∂veghossz statisztik√°k:")
            print(valid_df['text_length'].describe())

            plt.figure(figsize=(10, 6))
            plt.hist(valid_df['text_length'], bins=50, alpha=0.7)
            plt.title('Chunk sz√∂veghossz eloszl√°sa')
            plt.xlabel('Karakterek sz√°ma')
            plt.ylabel('Chunkok sz√°ma')
            plt.grid(True)
            plt.show()
    else:
        print("‚ö†Ô∏è Nincs √©rv√©nyes embedding adat a metadatok elemz√©s√©hez")
else:
    print("‚ùå Nincs adat a metadatok elemz√©s√©hez")

## 6. FAISS Index Tulajdonságok

Az index típusának és konfigurációjának elemzése.

In [None]:
# Report the FAISS index type, size and configuration, then time one search.
if index is not None:
    print("üîç FAISS Index tulajdons√°gok:")

    print(f"Index t√≠pusa: {type(index).__name__}")
    print(f"Vektorok sz√°ma: {index.ntotal}")
    print(f"Dimenzi√≥: {index.d}")

    # Attributes that only some index types expose
    if hasattr(index, 'nlist'):
        print(f"IVF lista sz√°m: {index.nlist}")
    if hasattr(index, 'nprobe'):
        print(f"Keres√©si pr√≥b√°k: {index.nprobe}")
    if hasattr(index, 'metric_type'):
        print(f"Metrika t√≠pusa: {index.metric_type}")

    # Rough size estimate: 4 bytes per stored component
    print(f"\nIndex m√©rete (becs√ºlt): {index.ntotal * index.d * 4 / (1024**2):.1f} MB")

    # Single-query search timing (only when vectors are available)
    if embeddings is not None and len(embeddings) > 0:
        query_embedding = embeddings[0].reshape(1, -1)
        # Never ask for more neighbours than the index holds: FAISS pads the
        # result with sentinel entries, which would distort the mean/std below.
        k = min(10, index.ntotal)

        import time
        # perf_counter is the monotonic high-resolution clock intended for
        # timing short intervals.
        start_time = time.perf_counter()
        distances, indices = index.search(query_embedding.astype(np.float32), k)
        search_time = time.perf_counter() - start_time

        print(f"Keres√©si teljes√≠tm√©ny (1 query, top-{k}): {search_time*1000:.2f}ms")
        print(f"√Åtlagos t√°vols√°g: {distances[0].mean():.4f}")
        print(f"T√°vols√°g sz√≥r√°s: {distances[0].std():.4f}")

    print("\n‚úÖ FAISS index elemz√©s k√©sz")
else:
    print("‚ùå Nincs FAISS index az elemz√©shez")

## 7. Következtetések

Az embedding kiértékelés összefoglalása.

In [None]:
# Final summary: what was loaded, whether it meets the agents.md checks,
# and which follow-up actions are recommended.
print("=== FAISS EMBEDDING ELEMZ√âS √ñSSZEFOGLAL√ì ===")
print("\n‚úÖ Sikeresen elemezve:")
if index is not None:
    print(f"   üìä FAISS index: {index.ntotal} vektor, {index.d} dimenzi√≥")
if chunk_id_map is not None:
    print(f"   üó∫Ô∏è Chunk ID mapping: {len(chunk_id_map)} bejegyz√©s")
if df is not None:
    print(f"   üìÑ Chunk adatok: {len(df)} chunk bet√∂ltve")
    if 'embedding' in df.columns:
        valid_count = df['embedding'].notna().sum()
        print(f"   üß† Embeddingek: {valid_count}/{len(df)} √©rv√©nyes")

print("\nüìã Agents.md specifik√°ci√≥ ellen≈ërz√©s:")
dimension_ok = index is not None and index.d == config.EMBEDDING_DIMENSION
if dimension_ok:
    print("   ‚úÖ Embedding dimenzi√≥ helyes")
else:
    print("   ‚ùå Embedding dimenzi√≥ elt√©r√©s")

# L2 normalization check on the rows that have an embedding
has_embeddings = False
if df is not None and 'embedding' in df.columns:
    valid_embeddings = df.dropna(subset=['embedding'])
    if len(valid_embeddings) > 0:
        has_embeddings = True
        norms = valid_embeddings['embedding'].apply(lambda x: np.linalg.norm(x))
        normalized_count = norms.apply(lambda x: abs(x - 1.0) < 0.01).sum()
        if normalized_count >= len(valid_embeddings) * 0.95:  # 95% threshold
            print("   ‚úÖ L2 normaliz√°l√°s megfelel≈ë")
        else:
            print(f"   ‚ö†Ô∏è L2 normaliz√°l√°s hi√°nyos: {normalized_count}/{len(valid_embeddings)} ({100*normalized_count/len(valid_embeddings):.1f}%)")

print("\nüí° Aj√°nl√°sok:")
if df is not None and 'embedding' in df.columns:
    missing = df['embedding'].isna().sum()
    if missing > 0:
        print(f"   üîÑ Hi√°nyz√≥ embeddingek √∫jragener√°l√°sa: {missing} chunk")
if index is None:
    print("   üöÄ FAISS index gener√°l√°sa sz√ºks√©ges: gemma_embedding_runpod.ipynb")

# Declare the system ready only when the index (with the expected dimension),
# the ID mapping and at least one embedding were actually loaded. Printing
# the success line unconditionally would hide a failed load.
if dimension_ok and chunk_id_map is not None and has_embeddings:
    print("\nüéØ Elemz√©s k√©sz - a retrieval rendszer haszn√°latra k√©sz!")
else:
    print("\nNincs meg minden adat - a retrieval rendszer nem teljes!")