# Qwen3 Embeddingek EDA - CourtRankRL projekt

Ez a notebook a qwen_embedding_runpod.ipynb által generált FAISS indexben tárolt embeddingeket elemzi.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import json
import faiss
from pathlib import Path

# Configure the plot style: seaborn grid theme first, then the matplotlib
# defaults for figure size and font size.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Load the project configuration
import sys


def resolve_project_root(namespace):
    """Return the project root directory for the running code.

    Args:
        namespace: A globals()-style mapping. When it contains ``__file__``
            the root is two levels above that file, as in the original
            script layout.

    Returns:
        A ``Path`` pointing at the assumed project root.
    """
    source_file = namespace.get("__file__")
    if source_file is not None:
        return Path(source_file).resolve().parent.parent
    # Jupyter kernels do not define __file__. Assumes the kernel was started in
    # the notebook's own directory, one level below the project root -- confirm
    # against the repository layout.
    return Path.cwd().parent


project_root = resolve_project_root(globals())
sys.path.insert(0, str(project_root))
try:
    from configs import config
except ImportError:
    print("Figyelem: configs modul nem tal√°lhat√≥. Haszn√°lja a default √©rt√©keket.")

    class Config:
        # Fallback locations used only when the configs module is unavailable.
        FAISS_INDEX_PATH = project_root / "data" / "index" / "faiss_index.bin"
        CHUNK_ID_MAP_PATH = project_root / "data" / "index" / "chunk_id_map.json"

    config = Config()

print("FAISS index √©s chunk adatok bet√∂lt√©se...")

## 1. FAISS Index betöltése és embeddingek kivonása

In [None]:
# Load the FAISS index and pull every stored vector back out of it
faiss_path = config.FAISS_INDEX_PATH
chunk_map_path = config.CHUNK_ID_MAP_PATH

# Bind both results up front: later cells test them against None, which would
# raise NameError if the index file were missing and nothing had been assigned.
index = None
embeddings = None

if faiss_path and Path(faiss_path).exists():
    try:
        index = faiss.read_index(str(faiss_path))
        print(f"‚úÖ FAISS index bet√∂ltve: {index.ntotal} vektor, {index.d} dimenzi√≥")

        # Reconstruct all vectors, one per stored position
        embeddings = np.array([index.reconstruct(i) for i in range(index.ntotal)])
        print(f"‚úÖ Embeddingek kivonva: {embeddings.shape}")

    except Exception as e:
        # Best-effort notebook cell: report the failure and leave both names
        # as None so downstream cells skip their analysis.
        print(f"‚ùå Hiba a FAISS index bet√∂lt√©se sor√°n: {e}")
        index = None
        embeddings = None
else:
    print(f"‚ö†Ô∏è FAISS index nem tal√°lhat√≥: {faiss_path}")
    print("Futtassa a qwen_embedding_runpod.ipynb-t el≈ësz√∂r!")

# Load the chunk ID mapping
# Bind the name first so later cells can compare it against None even when the
# mapping file does not exist.
chunk_id_map = None

if chunk_map_path and Path(chunk_map_path).exists():
    try:
        with open(chunk_map_path, 'r', encoding='utf-8') as f:
            chunk_id_map = json.load(f)
        print(f"‚úÖ Chunk ID map bet√∂ltve: {len(chunk_id_map)} mapping")
    except (OSError, ValueError) as e:
        # OSError covers unreadable files; ValueError covers both malformed
        # JSON (JSONDecodeError) and undecodable bytes (UnicodeDecodeError).
        print(f"‚ùå Hiba a chunk ID map bet√∂lt√©se sor√°n: {e}")
        chunk_id_map = None
else:
    print(f"‚ö†Ô∏è Chunk ID map nem tal√°lhat√≥: {chunk_map_path}")

## 2. Chunk adatok betöltése

In [None]:
# Load a sample of chunks and attach their embeddings
MAX_SAMPLE_CHUNKS = 5000  # upper bound on the number of chunks analysed


def read_sampled_chunks(path, wanted_ids, limit):
    """Collect up to *limit* chunk records from a JSONL file.

    Args:
        path: Location of the JSONL file, one JSON object per line.
        wanted_ids: Container of chunk ids to keep; a set keeps the
            per-line membership test constant-time.
        limit: Maximum number of records to return.

    Returns:
        A list of the decoded records whose ``chunk_id`` is in *wanted_ids*,
        in file order. Lines that are not valid JSON objects are skipped.
    """
    selected = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if len(selected) >= limit:
                break
            try:
                chunk = json.loads(line.strip())
            except json.JSONDecodeError:
                continue
            if isinstance(chunk, dict) and chunk.get('chunk_id') in wanted_ids:
                selected.append(chunk)
    return selected


def build_embedding_lookup(id_map, vectors, wanted_ids):
    """Map chunk ids to their vectors for the ids in *wanted_ids*.

    Args:
        id_map: Mapping from the stringified vector position to a chunk id.
        vectors: Sequence of vectors, indexed by position.
        wanted_ids: Container of chunk ids worth keeping.

    Returns:
        A dict ``{chunk_id: vector}``. Positions without an entry in *id_map*
        are skipped instead of aborting the whole assignment.
    """
    lookup = {}
    for position, vector in enumerate(vectors):
        chunk_id = id_map.get(str(position))
        if chunk_id in wanted_ids:
            lookup[chunk_id] = vector
    return lookup


df = None
# Read the upstream names defensively: cells may be run out of order, and a
# missing name should produce the diagnostic below rather than a NameError.
index = globals().get('index')
chunk_id_map = globals().get('chunk_id_map')
embeddings = globals().get('embeddings')
chunks_file = getattr(globals().get('config'), 'CHUNKS_JSONL', None)

if chunks_file and Path(chunks_file).exists() and index is not None and chunk_id_map is not None:
    try:
        sample_size = min(MAX_SAMPLE_CHUNKS, index.ntotal)
        print(f"üìä Mintav√©tel: {sample_size} chunk bet√∂lt√©se...")

        # A set makes the per-line lookup O(1) instead of scanning a list.
        chunk_ids = set(list(chunk_id_map.values())[:sample_size])
        chunks_list = read_sampled_chunks(chunks_file, chunk_ids, sample_size)

        if chunks_list:
            df = pd.DataFrame(chunks_list)
            print(f"‚úÖ Bet√∂lt√∂tt chunkok sz√°ma: {len(df)}")

            # Attach the embeddings to the loaded chunks
            if embeddings is not None:
                try:
                    embedding_dict = build_embedding_lookup(
                        chunk_id_map, embeddings, set(df['chunk_id'])
                    )
                    df = df.assign(embedding=df['chunk_id'].map(embedding_dict))
                    valid_embeddings = df['embedding'].notna().sum()
                    print(f"‚úÖ Embeddingek hozz√°rendelve: {valid_embeddings} chunk")
                except Exception as e:
                    print(f"‚ùå Hiba az embeddingek hozz√°rendel√©se sor√°n: {e}")
            else:
                print("‚ö†Ô∏è Nincsenek embeddingek a hozz√°rendel√©shez")
        else:
            print("‚ö†Ô∏è Nem tal√°lhat√≥ak megfelel≈ë chunkok")

    except Exception as e:
        print(f"‚ùå Hiba a chunkok bet√∂lt√©se sor√°n: {e}")
        df = None
else:
    print("‚ö†Ô∏è Hi√°nyz√≥ adatok a chunk bet√∂lt√©shez:")
    if not chunks_file:
        print("  - CHUNKS_JSONL konfigur√°ci√≥")
    elif not Path(chunks_file).exists():
        print(f"  - F√°jl nem tal√°lhat√≥: {chunks_file}")
    if index is None:
        print("  - FAISS index")
    if chunk_id_map is None:
        print("  - Chunk ID map")

## 3. Embeddingek alapellenőrzése

In [None]:
def embedding_norm_report(frame, tolerance=0.01):
    """Summarise the L2 norms of the ``embedding`` column of *frame*.

    Args:
        frame: DataFrame with an ``embedding`` column holding one vector per
            row (missing values allowed), or None.
        tolerance: Maximum absolute deviation from 1.0 for a norm to count
            as L2-normalised.

    Returns:
        None when there is nothing to analyse (no frame, no column, or no
        non-missing embedding). Otherwise a dict with:
        ``first`` (first non-missing embedding), ``norms`` (Series of norms of
        the non-missing embeddings), ``normalized`` (count within tolerance),
        ``missing`` (count of missing embeddings) and ``total`` (row count).
    """
    if frame is None or frame.empty or 'embedding' not in frame.columns:
        return None
    column = frame['embedding']
    # Work on the non-missing rows only: len() of a NaN placeholder raises.
    present = column.dropna()
    if present.empty:
        return None
    norms = present.apply(np.linalg.norm)
    return {
        'first': present.iloc[0],
        'norms': norms,
        'normalized': int(((norms - 1.0).abs() < tolerance).sum()),
        'missing': int(column.isna().sum()),
        'total': len(frame),
    }


# globals().get tolerates the loading cell not having produced a DataFrame.
norm_report = embedding_norm_report(globals().get('df'))

if norm_report is not None:
    print("Embeddingek alapvet≈ë statisztik√°i:")

    # Type and dimension of the first available embedding
    first_embedding = norm_report['first']
    print(f"Els≈ë embedding t√≠pusa: {type(first_embedding)}")
    print(f"Els≈ë embedding dimenzi√≥ja: {len(first_embedding)}")

    # Norms of the embeddings (checks L2 normalisation)
    norms = norm_report['norms']
    print("Embedding norm√°k statisztik√°i:")
    print(norms.describe())

    # Share of rows whose norm is within tolerance of 1.0
    normalized_count = norm_report['normalized']
    total_rows = norm_report['total']
    print(f"L2-normaliz√°lt embeddingek: {normalized_count}/{total_rows} ({100*normalized_count/total_rows:.1f}%)")

    # Missing embeddings
    missing_embeddings = norm_report['missing']
    print(f"Hi√°nyz√≥ embeddingek: {missing_embeddings}")

else:
    print("Nincs embedding adat az elemz√©shez.")

## 4. PCA dimenziócsökkentés és vizualizáció

In [None]:
def encode_categories(values, missing_label='ismeretlen'):
    """Turn a label Series into integer codes plus the matching label names.

    Args:
        values: Series of category labels; missing entries are allowed.
        missing_label: Name given to missing entries so they form their own
            category instead of receiving the code -1.

    Returns:
        ``(codes, names)`` where ``codes`` is an integer array aligned with
        *values* and ``names[code]`` is the label for each code.
    """
    filled = values.astype(object).where(values.notna(), missing_label).astype(str)
    categorical = filled.astype('category')
    return categorical.cat.codes.to_numpy(), categorical.cat.categories.tolist()


def scatter_by_category(points, values, title, cmap):
    """Draw the 2-D *points* coloured by the categories in *values*."""
    codes, names = encode_categories(values)
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(points[:, 0], points[:, 1], c=codes, s=8, alpha=0.6, cmap=cmap)
    plt.title(title)
    plt.xlabel("F≈ëkomponens 1")
    plt.ylabel("F≈ëkomponens 2")
    # num=None yields one handle per distinct code; the default "auto" may
    # subsample when there are many categories and misalign the labels.
    handles = scatter.legend_elements(num=None)[0]
    plt.legend(handles=handles, labels=names, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
    plt.show()


# globals().get tolerates the loading cell not having produced a DataFrame.
pca_source = globals().get('df')

if pca_source is not None and not pca_source.empty and 'embedding' in pca_source.columns:
    # Drop rows without an embedding
    valid_df = pca_source.dropna(subset=['embedding']).copy()

    if len(valid_df) > 100:  # require more than 100 embeddings for PCA
        # Stack the per-row vectors into one 2-D array
        X = np.vstack(valid_df['embedding'].values)
        print(f"PCA bemenet: {X.shape}")

        # Run PCA down to two components
        pca = PCA(n_components=2)
        X_reduced = pca.fit_transform(X)

        print(f"PCA els≈ë k√©t komponens varianci√°ja: {pca.explained_variance_ratio_}")
        print(f"√ñsszes magyar√°zott variancia: {sum(pca.explained_variance_ratio_):.3f}")

        # Plain scatter of the projection
        plt.figure(figsize=(10, 7))
        plt.scatter(X_reduced[:, 0], X_reduced[:, 1], s=2, alpha=0.6)
        plt.title("Qwen3 Embeddingek PCA 2D lek√©pez√©se")
        plt.xlabel("F≈ëkomponens 1")
        plt.ylabel("F≈ëkomponens 2")
        plt.grid(True, alpha=0.3)
        plt.show()

        # Coloured by the 'domain' column
        if 'domain' in valid_df.columns:
            scatter_by_category(
                X_reduced, valid_df['domain'],
                "Qwen3 Embeddingek jogter√ºlet szerint sz√≠nezve", 'tab10',
            )

        # Coloured by the 'court' column
        if 'court' in valid_df.columns:
            scatter_by_category(
                X_reduced, valid_df['court'],
                "Qwen3 Embeddingek b√≠r√≥s√°g szerint sz√≠nezve", 'Set3',
            )
    else:
        print(f"T√∫l kev√©s embedding PCA-hoz: {len(valid_df)}")
else:
    print("Nincs embedding adat a PCA-hoz.")

## 5. Embedding minőségi metrikák

In [None]:
def sample_pairwise_distances(points, max_points=100, rng=None):
    """Euclidean distances between every pair in a random subset of *points*.

    Args:
        points: 2-D array with one vector per row.
        max_points: Upper bound on the number of rows sampled (without
            replacement) before the pairwise distances are computed.
        rng: Optional object exposing ``choice`` (for example a
            ``numpy.random.Generator``); defaults to the global
            ``numpy.random`` state.

    Returns:
        1-D array with one distance per unordered pair of sampled rows;
        empty when fewer than two rows are available.
    """
    subset_size = min(len(points), max_points)
    if subset_size < 2:
        return np.empty(0)
    chooser = np.random if rng is None else rng
    picked = points[chooser.choice(len(points), size=subset_size, replace=False)]
    # One vectorised norm per row instead of a Python loop over every pair.
    per_row = [
        np.linalg.norm(picked[i + 1:] - picked[i], axis=1)
        for i in range(subset_size - 1)
    ]
    return np.concatenate(per_row)


# globals().get tolerates the loading cell not having produced a DataFrame.
metrics_source = globals().get('df')

if metrics_source is not None and not metrics_source.empty and 'embedding' in metrics_source.columns:
    valid_df = metrics_source.dropna(subset=['embedding']).copy()

    if len(valid_df) > 10:
        X = np.vstack(valid_df['embedding'].values)

        print("Embedding min≈ës√©gi metrik√°k:")

        # Mean and spread of the vector norms
        norms = np.linalg.norm(X, axis=1)
        print(f"Norm√°k √°tlaga: {norms.mean():.4f}")
        print(f"Norm√°k sz√≥r√°sa: {norms.std():.4f}")

        # Distances between all pairs of up to 100 randomly chosen embeddings
        distances = sample_pairwise_distances(X, max_points=100)
        print(f"Embeddingek k√∂z√∂tti √°tlagos t√°vols√°g: {distances.mean():.4f}")
        print(f"Embeddingek k√∂z√∂tti t√°vols√°g sz√≥r√°sa: {distances.std():.4f}")

        # Closest and farthest sampled pair
        print(f"Legkisebb t√°vols√°g: {distances.min():.4f}")
        print(f"Legnagyobb t√°vols√°g: {distances.max():.4f}")

        # Ratio of data points to embedding dimensions
        print(f"\nEmbedding dimenzi√≥: {X.shape[1]}")
        print(f"Adatpontok sz√°ma: {len(X)}")
        print(f"Adatpontok s≈±r≈±s√©ge: {len(X) / X.shape[1]:.2f}")

    else:
        print(f"T√∫l kev√©s embedding a min≈ës√©gi metrik√°khoz: {len(valid_df)}")
else:
    print("Nincs embedding adat a min≈ës√©gi metrik√°khoz.")

## 6. Következtetések