## QA Chain Test with Current Architecture

Ez a notebook a jelenlegi RAG pipeline-t teszteli lokális sample adatokkal. Az aktuális architektúrával kompatibilis:
- **Google Gemini** embeddings és LLM
- **Custom és Reranking Retriever** implementáció  
- **Lokális FAISS index** és DataFrame-ek

**Setup:** 
1. `conda activate legalqa`
2. `python scripts/create_sample.py`
3. `python scripts/build_local_index.py`
4. Állítsd be a `.env` fájlt a `GOOGLE_API_KEY`-jel

In [1]:
# Import necessary packages
import asyncio
import os
import pickle
from pathlib import Path
from typing import List, Any
import textwrap

import pandas as pd
import faiss
import numpy as np
from dotenv import load_dotenv

# Import current architecture components
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from pydantic import SecretStr

# Import our current components
from src.chain.qa_chain import build_qa_chain
from src.data_loading.faiss_loader import load_faiss_index

# Read settings from a .env file into the process environment
load_dotenv()

# When the kernel was started inside the `notebooks/` folder, step up one level
# so the relative paths used later in this notebook are resolved from its parent.
# NOTE(review): the `src.*` imports above have already run by this point, so this
# chdir cannot help them resolve — confirm how the kernel is expected to be launched.
current_dir_name = Path.cwd().name
if current_dir_name == 'notebooks':
    os.chdir('..')

print("✅ Imports completed!")
print(f"📁 Working directory: {os.getcwd()}")

  from .autonotebook import tqdm as notebook_tqdm


✅ Imports completed!
📁 Working directory: /Users/zelenyianszkimate/Documents/LegalQA_v2


In [2]:
# Load the sample artefacts (parquet chunks, FAISS index, id mapping) and set up embeddings
print("🔧 Loading sample data with current architecture...")

# Artefact locations; each one can be overridden through an environment variable
sample_parquet_path = os.getenv("NOTEBOOK_PARQUET_PATH", "data/processed/sample_data.parquet")
faiss_index_path = os.getenv("NOTEBOOK_FAISS_PATH", "data/processed/sample_faiss.bin")
id_mapping_path = os.getenv("NOTEBOOK_ID_MAPPING_PATH", "data/processed/sample_mapping.pkl")

# Fail fast, with instructions, when any artefact is absent
required_files = [sample_parquet_path, faiss_index_path, id_mapping_path]
missing_files = []
for candidate in required_files:
    if not os.path.exists(candidate):
        missing_files.append(candidate)

if len(missing_files) > 0:
    print("❌ Missing required files:")
    print("\n".join(f"  - {path}" for path in missing_files))
    print("\n🔨 Please run these commands first:")
    print("  python scripts/create_sample.py")
    print("  python scripts/build_local_index.py")
    raise FileNotFoundError("Required sample files not found")

# Chunk table
print(f"📊 Loading sample data from: {sample_parquet_path}")
df = pd.read_parquet(sample_parquet_path)

# Vector index plus the mapping used to translate index hits to chunk ids
print(f"🔍 Loading FAISS index from: {faiss_index_path}")
faiss_index, id_mapping = load_faiss_index(faiss_index_path, id_mapping_path)

print("✅ Data loaded successfully!")
print(f"  📄 Documents: {len(df)}")
print(f"  🔍 FAISS vectors: {faiss_index.ntotal}")
print(f"  🗂️ ID mappings: {len(id_mapping)}")

# The embedding client needs an API key; stop early when it is unset or empty
google_api_key = os.getenv('GOOGLE_API_KEY')
if google_api_key is None or google_api_key == "":
    raise ValueError("GOOGLE_API_KEY environment variable is required!")

embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    api_key=google_api_key,
)

print("🤖 Google Gemini embeddings initialized!")

🔧 Loading sample data with current architecture...
📊 Loading sample data from: data/processed/sample_data.parquet
🔍 Loading FAISS index from: data/processed/sample_faiss.bin
✅ Data loaded successfully!
  📄 Documents: 8293
  🔍 FAISS vectors: 8293
  🗂️ ID mappings: 8293
🤖 Google Gemini embeddings initialized!


In [3]:
# Create a simplified local retriever that works with DataFrames (for notebook testing)
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from sklearn.metrics.pairwise import cosine_similarity

class NotebookLocalRetriever(BaseRetriever):
    """Simplified local retriever for notebook testing with current architecture.

    For a query it embeds the text, asks the FAISS index for the ``k`` nearest
    vectors, translates every hit to a ``chunk_id`` through ``id_mapping`` and
    reads the chunk from ``documents_df`` (columns ``chunk_id``, ``text_chunk``
    and ``doc_id``). Hits without a mapping entry or without a matching row are
    skipped. Every step is echoed to stdout with a ``[DEBUG]`` prefix.
    """

    embeddings: Any       # must provide embed_query(text)
    faiss_index: Any      # must provide search(vectors, k) -> (distances, indices)
    id_mapping: dict      # index position returned by the search -> chunk_id
    documents_df: Any     # table holding 'chunk_id', 'text_chunk' and 'doc_id'
    k: int = 5            # number of neighbours requested per query

    class Config:
        arbitrary_types_allowed = True

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun = None
    ) -> List[Document]:
        """Return the documents behind the ``k`` nearest index entries.

        Args:
            query: Free-text question to embed and search for.
            run_manager: Callback manager supplied by the retriever framework;
                not used by this implementation.

        Returns:
            One ``Document`` per resolvable hit, in the order the index returned
            them. Metadata carries ``chunk_id``, ``doc_id``, ``distance``,
            ``relevancia`` (``1 / (1 + distance)`` rounded to 3 places) and
            ``final_score`` (same value rounded to 4 places), all numeric values
            being plain Python floats.
        """
        print(f"\n🔍 [DEBUG] Retrieving for query: '{query}'")

        # Get query embedding
        print("🔍 [DEBUG] Getting query embedding...")
        query_embedding = self.embeddings.embed_query(query)
        query_vector = np.array([query_embedding], dtype='float32')
        print(f"🔍 [DEBUG] Query embedding shape: {query_vector.shape}")

        # Search FAISS index
        print(f"🔍 [DEBUG] Searching FAISS index for top {self.k} matches...")
        distances, indices = self.faiss_index.search(query_vector, self.k)
        print(f"🔍 [DEBUG] Found indices: {indices[0]}")
        print(f"🔍 [DEBUG] Distances: {distances[0]}")

        # Convert to documents
        documents = []
        print("🔍 [DEBUG] Converting to documents...")

        # Terms highlighted in the debug output; constant for every hit, so built once
        search_terms = ['bűnszervezet', 'btk', 'három személy']

        for i, (idx, raw_distance) in enumerate(zip(indices[0], distances[0])):
            if idx not in self.id_mapping:
                continue
            chunk_id = self.id_mapping[idx]

            # Find the row in DataFrame
            row = self.documents_df[self.documents_df['chunk_id'] == chunk_id]
            if row.empty:
                continue

            first_match = row.iloc[0]
            text_content = first_match['text_chunk']
            doc_id = first_match['doc_id']

            # Convert the index's scalar to a built-in float before any
            # arithmetic: scores derived from the raw float32 value would be
            # stored as NumPy scalars, which json cannot serialise and which
            # carry float32 rounding noise.
            distance = float(raw_distance)

            print(f"🔍 [DEBUG] Doc {i+1}: {doc_id} (distance: {distance:.4f})")
            print(f"🔍 [DEBUG] Text preview: {text_content[:100]}...")

            # Calculate scores (simplified version)
            relevance_score = 1.0 / (1.0 + distance)

            metadata = {
                'chunk_id': chunk_id,
                'doc_id': doc_id,
                'distance': distance,
                'relevancia': round(relevance_score, 3),
                'final_score': round(relevance_score, 4)
            }

            documents.append(Document(
                page_content=text_content,
                metadata=metadata
            ))

            # Check if it contains the search term
            text_lower = text_content.lower()
            found_terms = [term for term in search_terms if term.lower() in text_lower]
            if found_terms:
                print(f"🎯 [DEBUG] FOUND relevant terms: {found_terms}")

        print(f"🔍 [DEBUG] Final result: {len(documents)} documents retrieved")
        return documents

# Build the retriever from the embeddings client, index, mapping and chunk table
# created earlier in the notebook; five neighbours are requested per query.
retriever_settings = {
    "embeddings": embeddings,
    "faiss_index": faiss_index,
    "id_mapping": id_mapping,
    "documents_df": df,
    "k": 5,
}
local_retriever = NotebookLocalRetriever(**retriever_settings)

print("✅ Notebook retriever initialized!")

✅ Notebook retriever initialized!


In [4]:
# Test the retriever directly with a legal question
test_query = "Mi a bűnszervezet fogalma a Btk. szerint?"

print(f"🧪 Testing retriever with query: '{test_query}'")
print("=" * 70)

# Test retrieval through the retriever's public entry point rather than the
# private `_get_relevant_documents` hook, so the framework supplies the callback
# manager and the call is exercised the same way a chain would exercise it.
retrieved_docs = local_retriever.invoke(test_query)

print("\n📋 RETRIEVAL RESULTS:")
print(f"Found {len(retrieved_docs)} documents")

for i, doc in enumerate(retrieved_docs, 1):
    print(f"\n📄 Document {i}:")
    print(f"  🆔 Doc ID: {doc.metadata['doc_id']}")
    print(f"  📊 Distance: {doc.metadata['distance']:.4f}")
    print(f"  📊 Relevance: {doc.metadata['relevancia']:.3f}")
    print(f"  📝 Text preview: {doc.page_content[:200]}...")

    # Check for key terms (case-insensitive substring match on the chunk text)
    content_lower = doc.page_content.lower()
    if 'bűnszervezet' in content_lower:
        print("  ✅ Contains 'bűnszervezet'!")
    if 'btk' in content_lower:
        print("  ✅ Contains 'Btk'!")
    if 'három' in content_lower and 'személy' in content_lower:
        print("  ✅ Contains 'három személy'!")

print("\n" + "=" * 70)


🧪 Testing retriever with query: 'Mi a bűnszervezet fogalma a Btk. szerint?'

🔍 [DEBUG] Retrieving for query: 'Mi a bűnszervezet fogalma a Btk. szerint?'
🔍 [DEBUG] Getting query embedding...
🔍 [DEBUG] Query embedding shape: (1, 768)
🔍 [DEBUG] Searching FAISS index for top 5 matches...
🔍 [DEBUG] Found indices: [1694 2312 7147 8249 4845]
🔍 [DEBUG] Distances: [0.68659014 0.6933098  0.693562   0.695029   0.6973957 ]
🔍 [DEBUG] Converting to documents...
🔍 [DEBUG] Doc 1: P.20961/2011/3 (distance: 0.6866)
🔍 [DEBUG] Text preview: a 2 cég neve Bt.-t képviselte. A cégnek 7 millió 680 ezer forintot fizettek ki, II. r. felperes a pr...
🔍 [DEBUG] Doc 2: Bf.11086/2012/6 (distance: 0.6933)
🔍 [DEBUG] Text preview: a 2009. június hó 30. napján kelt határozatával rendelte el a vádlott vezetési jogosultságának szüne...
🔍 [DEBUG] Doc 3: B.1642/2008/306 (distance: 0.6936)
🔍 [DEBUG] Text preview: a ... Bt-nek a tartozást a tulajdonát képező készpénzből, azt alátámasztotta VII.rendű vádlott neve ...
🔍 [DEBUG]

In [5]:
# Prepare the prompt template and the chat model used by the QA steps below
print("🔗 Building QA chain with current architecture...")

# The prompt text lives in a file; stop with a clear error when it is absent
prompt_path = Path("src/prompts") / "legal_assistant_prompt.txt"
if not prompt_path.exists():
    raise FileNotFoundError(f"Prompt file not found: {prompt_path}")

template = prompt_path.read_text(encoding="utf-8")
prompt = PromptTemplate.from_template(template)

# Chat model with temperature 0; the key is wrapped in SecretStr before being handed over
llm_api_key = SecretStr(google_api_key)
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro",
    temperature=0,
    api_key=llm_api_key,
)

print("✅ LLM and prompt loaded!")

# Helper function to format docs (from current qa_chain.py)
def format_docs(docs: List[Document], max_chars: int = 500) -> str:
    """Render retrieved documents as a single context string for the prompt.

    Every document becomes a ``### Document ID: <doc_id>`` header followed by a
    ``Content:`` line and the document text; entries are separated by one blank
    line.

    Args:
        docs: Documents to render. ``metadata["doc_id"]`` supplies the header
            value and falls back to ``"N/A"`` when the key is absent.
        max_chars: Upper bound on the number of content characters kept per
            document. Longer content is cut at this length and suffixed with
            ``"..."``. Defaults to 500.

    Returns:
        The formatted context; an empty string when ``docs`` is empty.
    """
    sections = []
    for doc in docs:
        doc_id = doc.metadata.get("doc_id", "N/A")

        content = doc.page_content
        if len(content) > max_chars:
            # Keep the prompt bounded; the ellipsis signals the cut to the model
            content = content[:max_chars] + "..."

        sections.append(f"### Document ID: {doc_id}\nContent:\n{content}")

    return "\n\n".join(sections)

# Status banner: printed once the statements above in this cell have run without raising
print("✅ Helper functions ready!")


🔗 Building QA chain with current architecture...
✅ LLM and prompt loaded!
✅ Helper functions ready!


In [6]:
# Test the complete QA pipeline: retrieve -> format context -> ask the LLM
test_query = "Milyen büntetés várható kifosztás esetén?"

print("🚀 Testing complete QA pipeline")
print(f"❓ Query: {test_query}")
print("=" * 70)

# Step 1: Retrieve documents (public entry point, so the framework supplies
# the callback manager instead of the private hook being called by hand)
print("\n📖 Step 1: Document Retrieval")
docs = local_retriever.invoke(test_query)
print(f"Retrieved {len(docs)} documents")

# Step 2: Format context
print("\n📝 Step 2: Context Formatting")
context = format_docs(docs)
print(f"Context length: {len(context)} characters")
print(f"Context preview:\n{context[:300]}...")

# Step 3: Generate answer
print("\n🤖 Step 3: LLM Answer Generation")
formatted_input = prompt.format(context=context, question=test_query)

# Only the model call is guarded: a failure while displaying the answer must
# not be reported as an LLM invocation error.
try:
    result = llm.invoke(formatted_input)
except Exception as e:
    # Best-effort: keep the notebook running and show what was sent
    print(f"❌ Error during LLM invocation: {e}")
    print("Context being sent to LLM:")
    print(formatted_input[:500] + "...")
else:
    answer = result.content if hasattr(result, 'content') else str(result)
    if not isinstance(answer, str):
        # Message content is not guaranteed to be a plain string;
        # textwrap.fill below requires one.
        answer = str(answer)

    print("\n✅ FINAL ANSWER:")
    print("=" * 50)
    print(textwrap.fill(answer, width=80))
    print("=" * 50)


🚀 Testing complete QA pipeline
❓ Query: Milyen büntetés várható kifosztás esetén?

📖 Step 1: Document Retrieval

🔍 [DEBUG] Retrieving for query: 'Milyen büntetés várható kifosztás esetén?'
🔍 [DEBUG] Getting query embedding...
🔍 [DEBUG] Query embedding shape: (1, 768)
🔍 [DEBUG] Searching FAISS index for top 5 matches...
🔍 [DEBUG] Found indices: [2274 8244  741 3854 1901]
🔍 [DEBUG] Distances: [0.65831125 0.66331756 0.6655626  0.67108667 0.6716225 ]
🔍 [DEBUG] Converting to documents...
🔍 [DEBUG] Doc 1: B.1492/2010/306 (distance: 0.6583)
🔍 [DEBUG] Text preview: kolléga 50 milliós letéti pénz jogellenes kifizetési miatt újabb kamarai eljárást is kezdeményezhet ...
🔍 [DEBUG] Doc 2: K.21403/2010/19 (distance: 0.6633)
🔍 [DEBUG] Text preview: alapján. Ekként a tűzifa értékesítés és a faanyagok beszerzési értékének arányát 1,15-ben számolta e...
🔍 [DEBUG] Doc 3: K.700486/2020/23 (distance: 0.6656)
🔍 [DEBUG] Text preview: bizonyította, hogy a bányatelken történő kitermelésre a bányakapitányság ál

In [10]:
# Test with multiple queries to verify functionality
test_queries = [
    "Mi a bűnszervezet fogalma a Btk. szerint?",
    "Milyen feltételei vannak a bűnszervezetben való részvételnek?",
    "Mi a különbség az alperes és a felperes között?",
    "Mikor alkalmazható a feltételes szabadság?",
    "Mit jelent a bizonyítási teher?"
]

print("🧪 Testing multiple queries...")
print("=" * 70)

# Derive the total from the list so the counter stays right when queries are added or removed
total_queries = len(test_queries)

for i, query in enumerate(test_queries, 1):
    print(f"\n📝 Test {i}/{total_queries}: {query}")

    # Quick retrieval test via the retriever's public entry point
    docs = local_retriever.invoke(query)

    if docs:
        best_doc = docs[0]
        doc_id = best_doc.metadata.get('doc_id', 'N/A')

        # A missing distance must not be pushed through the ':.4f' format spec:
        # formatting the 'N/A' placeholder string that way raises ValueError.
        distance = best_doc.metadata.get('distance')
        distance_text = f"{distance:.4f}" if distance is not None else "N/A"

        print(f"  ✅ Best match: {doc_id} (distance: {distance_text})")
        print(f"  📄 Preview: {best_doc.page_content[:150]}...")

        # Check relevance: whitespace-separated tokens shared by query and best hit
        query_words = query.lower().split()
        content_words = best_doc.page_content.lower().split()
        common_words = set(query_words) & set(content_words)
        if common_words:
            print(f"  🎯 Common terms: {list(common_words)[:3]}")
    else:
        print("  ❌ No documents found")

    print("-" * 50)

print("\n✅ Multi-query testing completed!")


🧪 Testing multiple queries...

📝 Test 1/5: Mi a bűnszervezet fogalma a Btk. szerint?

🔍 [DEBUG] Retrieving for query: 'Mi a bűnszervezet fogalma a Btk. szerint?'
🔍 [DEBUG] Getting query embedding...
🔍 [DEBUG] Query embedding shape: (1, 768)
🔍 [DEBUG] Searching FAISS index for top 5 matches...
🔍 [DEBUG] Found indices: [1694 2312 7147 8249 4845]
🔍 [DEBUG] Distances: [0.68659014 0.6933098  0.693562   0.695029   0.6973957 ]
🔍 [DEBUG] Converting to documents...
🔍 [DEBUG] Doc 1: P.20961/2011/3 (distance: 0.6866)
🔍 [DEBUG] Text preview: a 2 cég neve Bt.-t képviselte. A cégnek 7 millió 680 ezer forintot fizettek ki, II. r. felperes a pr...
🔍 [DEBUG] Doc 2: Bf.11086/2012/6 (distance: 0.6933)
🔍 [DEBUG] Text preview: a 2009. június hó 30. napján kelt határozatával rendelte el a vádlott vezetési jogosultságának szüne...
🔍 [DEBUG] Doc 3: B.1642/2008/306 (distance: 0.6936)
🔍 [DEBUG] Text preview: a ... Bt-nek a tartozást a tulajdonát képező készpénzből, azt alátámasztotta VII.rendű vádlott neve ...


In [8]:
# Summary of the loaded artefacts and the model settings used in this notebook
print("📊 PERFORMANCE SUMMARY")
print("=" * 50)

# Row count of the chunk table, vector count of the index, size of the mapping
row_count = len(df)
print(f"📄 Sample dataset size: {row_count} documents")
print(f"🔍 FAISS index size: {faiss_index.ntotal} vectors")
print(f"🗂️ ID mapping entries: {len(id_mapping)}")

# Show the first three mapping entries as a sanity check
print("\n🔍 Sample ID mappings:")
first_mappings = list(id_mapping.items())[:3]
for faiss_idx, chunk_id in first_mappings:
    print(f"  FAISS[{faiss_idx}] -> {chunk_id}")

# Per-document and per-chunk statistics, each printed only when its column is present
print("\n📊 Document statistics:")
available_columns = df.columns
if 'doc_id' in available_columns:
    unique_docs = df['doc_id'].nunique()
    print(f"  Unique documents: {unique_docs}")
    print(f"  Avg chunks per doc: {len(df) / unique_docs:.1f}")

if 'text_chunk' in available_columns:
    text_lengths = df['text_chunk'].str.len()
    print(f"  Avg text length: {text_lengths.mean():.0f} chars")
    print(f"  Min text length: {text_lengths.min()} chars")
    print(f"  Max text length: {text_lengths.max()} chars")

# Echo the model identifiers and the sampling temperature
print("\n🤖 Model configuration:")
print(f"  Embeddings: {embeddings.model}")
print(f"  LLM: {llm.model}")
print(f"  Temperature: {llm.temperature}")

print("\n✅ Notebook testing setup complete!")
print("🎯 Ready for production-like RAG testing!")


📊 PERFORMANCE SUMMARY
📄 Sample dataset size: 8293 documents
🔍 FAISS index size: 8293 vectors
🗂️ ID mapping entries: 8293

🔍 Sample ID mappings:
  FAISS[0] -> P.20693/2011/64-2
  FAISS[1] -> Pf.20055/2020/5-2
  FAISS[2] -> B.101/2021/25-1

📊 Document statistics:
  Unique documents: 8030
  Avg chunks per doc: 1.0
  Avg text length: 6961 chars
  Min text length: 213 chars
  Max text length: 8000 chars

🤖 Model configuration:
  Embeddings: models/text-embedding-004
  LLM: models/gemini-2.5-pro
  Temperature: 0.0

✅ Notebook testing setup complete!
🎯 Ready for production-like RAG testing!


In [9]:
# Optional: Test with simplified reranking
# (banner only; the reranking helper and its demo run follow in this cell)
print("🔄 Optional: Testing with simplified reranking...")

def simple_rerank_by_keyword_boost(
    docs: List[Document],
    query: str,
    top_k: int = 3,
    legal_boosts: Any = None,
) -> List[Document]:
    """Simplified reranking based on keyword matching and the retrieval score.

    Each document's score is its ``metadata['relevancia']`` (0 when absent)
    plus 0.1 for every distinct query term found in the lowercased content
    (substring match) plus the weight of every legal term found there.

    Args:
        docs: Documents to rerank. Each document's ``metadata`` is updated in
            place with a ``'reranker_score'`` entry (rounded to 4 places).
        query: The user question. It is lowercased and split on whitespace;
            surrounding punctuation is stripped from every token so that
            ``"szerint?"`` is matched as ``"szerint"``.
        top_k: Number of best-scoring documents to return.
        legal_boosts: Optional mapping of lowercase term -> additive weight.
            ``None`` selects the built-in weights
            (``btk`` 0.2, ``bűnszervezet`` 0.3, ``törvény`` 0.1).

    Returns:
        The ``top_k`` documents with the highest score, best first. Documents
        with equal scores keep their input order.
    """
    if legal_boosts is None:
        legal_boosts = {'btk': 0.2, 'bűnszervezet': 0.3, 'törvény': 0.1}

    # Extract key terms from query. Tokens keep trailing '?', '.', ',' ... after
    # a plain split, which makes them unmatchable in running text, so strip it.
    edge_punctuation = ".,;:!?\"'()[]{}„”"
    query_terms = {token.strip(edge_punctuation) for token in query.lower().split()}
    # A token made only of punctuation becomes '' and would match every text
    query_terms.discard('')

    for doc in docs:
        # Plain float first: the retrieval score may be a NumPy scalar
        base_score = float(doc.metadata.get('relevancia', 0))

        content_lower = doc.page_content.lower()

        # Keyword boost
        term_matches = sum(1 for term in query_terms if term in content_lower)
        keyword_boost = term_matches * 0.1

        # Special legal term boost
        legal_boost = sum(
            weight for term, weight in legal_boosts.items() if term in content_lower
        )

        final_score = base_score + keyword_boost + legal_boost
        doc.metadata['reranker_score'] = round(final_score, 4)

    # Sort by final score (stable, so ties keep their retrieval order)
    ranked = sorted(docs, key=lambda d: d.metadata['reranker_score'], reverse=True)
    return ranked[:top_k]

# Test reranking: retrieve candidates, rerank them, then show the top results
query = "Mi a bűnszervezet fogalma a Btk. szerint?"
# Use the retriever's public entry point rather than the private
# `_get_relevant_documents` hook, so the framework supplies the callback manager.
docs = local_retriever.invoke(query)

print(f"📋 Before reranking: {len(docs)} docs")
reranked_docs = simple_rerank_by_keyword_boost(docs, query, top_k=3)

print(f"📋 After reranking: {len(reranked_docs)} docs")
for i, doc in enumerate(reranked_docs, 1):
    print(f"\n🏆 Rank {i}:")
    print(f"  Doc ID: {doc.metadata['doc_id']}")
    # 'relevancia' is the retrieval score; 'reranker_score' is written by the reranker
    print(f"  Original score: {doc.metadata['relevancia']}")
    print(f"  Reranker score: {doc.metadata['reranker_score']}")
    print(f"  Text: {doc.page_content[:100]}...")

print("\n✅ Simplified reranking test completed!")


🔄 Optional: Testing with simplified reranking...

🔍 [DEBUG] Retrieving for query: 'Mi a bűnszervezet fogalma a Btk. szerint?'
🔍 [DEBUG] Getting query embedding...
🔍 [DEBUG] Query embedding shape: (1, 768)
🔍 [DEBUG] Searching FAISS index for top 5 matches...
🔍 [DEBUG] Found indices: [1694 2312 7147 8249 4845]
🔍 [DEBUG] Distances: [0.68659014 0.6933098  0.693562   0.695029   0.6973957 ]
🔍 [DEBUG] Converting to documents...
🔍 [DEBUG] Doc 1: P.20961/2011/3 (distance: 0.6866)
🔍 [DEBUG] Text preview: a 2 cég neve Bt.-t képviselte. A cégnek 7 millió 680 ezer forintot fizettek ki, II. r. felperes a pr...
🔍 [DEBUG] Doc 2: Bf.11086/2012/6 (distance: 0.6933)
🔍 [DEBUG] Text preview: a 2009. június hó 30. napján kelt határozatával rendelte el a vádlott vezetési jogosultságának szüne...
🔍 [DEBUG] Doc 3: B.1642/2008/306 (distance: 0.6936)
🔍 [DEBUG] Text preview: a ... Bt-nek a tartozást a tulajdonát képező készpénzből, azt alátámasztotta VII.rendű vádlott neve ...
🔍 [DEBUG] Doc 4: B.120/2016/21 (dist