## QA Chain Test with Local Sample Data

This notebook tests the full Question-Answering chain using a small, local sample of the data. This avoids high memory usage and allows for quick experimentation.

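**Setup required:** Before running, ensure your `.env` file is configured (the notebook reads `OPENAI_API_KEY`, and optionally `NOTEBOOK_PARQUET_PATH`, `NOTEBOOK_FAISS_PATH` and `NOTEBOOK_ID_MAPPING_PATH` to override the default sample file locations), and create the necessary sample files by running these commands from the project root: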
1. `make setup-dev`
2. `make setup-notebooks`

In [1]:
# Import necessary packages
from dotenv import load_dotenv
import os
import pandas as pd
import pickle
import faiss
import textwrap

# Import LangChain and OpenAI components
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from typing import List, Any
import numpy as np

# Import our custom components
from src.chain.qa_chain import build_qa_chain

# Read the project's .env file so the API key and optional path overrides
# are available through the process environment
load_dotenv()

# The relative data paths below are resolved against the project root, so
# step up one level when the kernel was started inside the notebooks/ folder
current_dir_name = os.path.basename(os.getcwd())
if current_dir_name == 'notebooks':
    os.chdir('..')

# Locations of the sample artifacts; each one can be overridden through an
# environment variable and otherwise falls back to the default sample path
parquet_path = os.environ.get('NOTEBOOK_PARQUET_PATH', 'data/processed/sample_data.parquet')
faiss_path = os.environ.get('NOTEBOOK_FAISS_PATH', 'data/processed/sample_faiss.bin')
id_mapping_path = os.environ.get('NOTEBOOK_ID_MAPPING_PATH', 'data/processed/sample_mapping.pkl')

# Document table (one row per text chunk)
print(f"Loading data from: {parquet_path}")
df = pd.read_parquet(parquet_path)

# Vector index over the chunk embeddings
print(f"Loading FAISS index from: {faiss_path}")
faiss_index = faiss.read_index(faiss_path)

# Lookup table used to translate FAISS positions into chunk ids.
# NOTE: unpickling can execute arbitrary code; only load a mapping file
# that was generated locally by the project's own tooling.
print(f"Loading ID mapping from: {id_mapping_path}")
with open(id_mapping_path, 'rb') as mapping_file:
    id_mapping = pickle.load(mapping_file)

# Quick size report so a mismatch between the three artifacts is visible
print(f"📊 Loaded {len(df)} documents")
print(f"📊 FAISS index has {faiss_index.ntotal} vectors")
print(f"📊 ID mapping has {len(id_mapping)} entries")

# Embedding client used to vectorise queries; the key comes from the environment
embeddings = OpenAIEmbeddings(openai_api_key=os.environ.get('OPENAI_API_KEY'))

# Create a simple local retriever with DEBUG info
class LocalRetrieverDebug(BaseRetriever):
    """FAISS-backed retriever that prints a debug trace of every retrieval step.

    A query is embedded via ``embeddings.embed_query``, the ``k`` nearest
    vectors are fetched with ``faiss_index.search``, each returned position is
    translated to a ``chunk_id`` through ``id_mapping`` and the matching row of
    ``documents_df`` is wrapped in a ``Document``.
    """

    # Embedding client; only ``embed_query(text)`` is called on it.
    embeddings: Any
    # Vector index; only ``search(query_vector, k)`` is called on it.
    faiss_index: Any
    # Maps a position returned by the index to a ``chunk_id``.
    id_mapping: dict
    # Table providing the ``chunk_id``, ``text`` and ``doc_id`` columns.
    documents_df: Any
    # Number of nearest neighbours requested per query.
    k: int = 5

    class Config:
        arbitrary_types_allowed = True

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Return the documents whose vectors are closest to ``query``.

        Args:
            query: Free-text question to embed and search for.
            run_manager: Callback manager supplied by the caller; not used here.

        Returns:
            One ``Document`` per hit that could be resolved to a DataFrame row,
            in the order returned by the index. Its metadata holds
            ``chunk_id``, ``doc_id`` and the ``distance`` reported by the
            index. Hits that cannot be resolved are skipped with a printed
            warning.
        """
        print(f"\n🔍 [DEBUG] Starting retrieval for query: '{query}'")

        # Get query embedding
        print("🔍 [DEBUG] Getting query embedding...")
        query_embedding = self.embeddings.embed_query(query)
        # Wrap in a list so the array is 2-D (one row per query) and cast to
        # float32 before handing it to the index
        query_vector = np.array([query_embedding], dtype='float32')
        print(f"🔍 [DEBUG] Query embedding shape: {query_vector.shape}")

        # Search FAISS index
        print(f"🔍 [DEBUG] Searching FAISS index for top {self.k} matches...")
        distances, indices = self.faiss_index.search(query_vector, self.k)
        hit_indices = indices[0]
        hit_distances = distances[0]
        print(f"🔍 [DEBUG] Found indices: {hit_indices}")
        print(f"🔍 [DEBUG] Distances: {hit_distances}")

        # Convert to documents
        documents = []
        print(f"🔍 [DEBUG] Converting {len(hit_indices)} indices to documents...")

        for i, raw_idx in enumerate(hit_indices):
            # Normalise the numpy integer so lookups and messages use a plain int
            idx = int(raw_idx)
            print(f"🔍 [DEBUG] Processing index {i+1}/{len(hit_indices)}: {idx}")

            # FAISS pads the result with -1 when it has fewer than k neighbours
            # to return; report that explicitly instead of as a mapping miss
            if idx < 0:
                print(f"🔍 [DEBUG] WARNING: no neighbour returned for slot {i+1} (index {idx}), skipping")
                continue

            if idx not in self.id_mapping:
                print(f"🔍 [DEBUG] WARNING: index {idx} not found in id_mapping")
                continue

            chunk_id = self.id_mapping[idx]
            print(f"🔍 [DEBUG] Mapped to chunk_id: {chunk_id}")

            # Find the row in DataFrame
            row = self.documents_df[self.documents_df['chunk_id'] == chunk_id]
            if row.empty:
                print(f"🔍 [DEBUG] WARNING: chunk_id {chunk_id} not found in DataFrame")
                continue

            # If several rows share the chunk_id, the first one is used
            first_match = row.iloc[0]
            text_content = first_match['text']
            doc_id = first_match['doc_id']

            print(f"🔍 [DEBUG] Found document - doc_id: {doc_id}")
            print(f"🔍 [DEBUG] Text preview: {text_content[:100]}...")

            documents.append(Document(
                page_content=text_content,
                metadata={'chunk_id': chunk_id, 'doc_id': doc_id, 'distance': float(hit_distances[i])}
            ))

        print(f"🔍 [DEBUG] Final result: {len(documents)} documents retrieved")
        return documents

print("Setup complete!")

ModuleNotFoundError: No module named 'dotenv'

In [None]:
# Wire the artifacts loaded during setup into the debug retriever.
# Everything is served from local files, so no database is needed.
retriever_settings = dict(
    embeddings=embeddings,
    faiss_index=faiss_index,
    id_mapping=id_mapping,
    documents_df=df,
    k=5,  # return the five closest chunks
)
local_retriever = LocalRetrieverDebug(**retriever_settings)

print("Local retriever initialized successfully!")

Local retriever initialized successfully!


In [None]:
import traceback

# Assemble the question-answering chain around the local debug retriever
print("🔧 [DEBUG] Building QA chain...")
qa_chain = build_qa_chain(retriever=local_retriever)
print("🔧 [DEBUG] QA chain built successfully")

# The question that is sent first through the bare retriever and then
# through the complete chain
query = "Mi a bűnszervezet fogalma a Btk. szerint?"
print(f"\n🚀 [DEBUG] Starting QA process with query: '{query}'")

try:
    # Step 1: call the retriever on its own to see what context the chain would get
    print("\n📖 [DEBUG] Testing retriever directly...")
    retrieved_docs = local_retriever.invoke(query)  # corrected version of the call

    print(f"📖 [DEBUG] Retriever returned {len(retrieved_docs)} documents")
    for i, doc in enumerate(retrieved_docs):
        # One summary per document, followed by a blank separator line
        summary_lines = [
            f"📖 [DEBUG] Document {i+1}:",
            f"  - Content length: {len(doc.page_content)} characters",
            f"  - Metadata: {doc.metadata}",
            f"  - Content preview: {doc.page_content[:150]}...",
            "",
        ]
        print("\n".join(summary_lines))

    # Step 2: run the full chain (retrieval + answer generation)
    print("🔗 [DEBUG] Invoking full QA chain...")
    result = qa_chain.invoke(query)

    # Show the answer wrapped to a readable width
    print(f"\n❓ Query: {query}\n")
    print("✅ Final Answer:\n")
    print(textwrap.fill(result, width=100))

except Exception as e:
    # Keep the notebook running but surface the failure and its traceback
    print(f"❌ [DEBUG] Error occurred: {type(e).__name__}: {e}")
    traceback.print_exc()

🔧 [DEBUG] Building QA chain...
🔧 [DEBUG] QA chain built successfully

🚀 [DEBUG] Starting QA process with query: 'Mi a bűnszervezet fogalma a Btk. szerint?'

📖 [DEBUG] Testing retriever directly...

🔍 [DEBUG] Starting retrieval for query: 'Mi a bűnszervezet fogalma a Btk. szerint?'
🔍 [DEBUG] Getting query embedding...
🔍 [DEBUG] Query embedding shape: (1, 1536)
🔍 [DEBUG] Searching FAISS index for top 5 matches...
🔍 [DEBUG] Found indices: [18778 60987 19714 14937 46180]
🔍 [DEBUG] Distances: [1.8350865 1.8414775 1.8466614 1.8472267 1.8483781]
🔍 [DEBUG] Converting 5 indices to documents...
🔍 [DEBUG] Processing index 1/5: 18778
🔍 [DEBUG] Mapped to chunk_id: 26176412-37c8-432d-b26e-1a1f46014b5c
🔍 [DEBUG] Found document - doc_id: Pfv.20150/2008/5
🔍 [DEBUG] Text preview: módon - utal, vagy akinek a személye a sajtóközlemény tartalmából felismerhető. A kifogásolt közlés ...
🔍 [DEBUG] Processing index 2/5: 60987
🔍 [DEBUG] Mapped to chunk_id: cc66a91d-0ae6-4320-9c64-0cee56871123
🔍 [DEBUG] Found do

In [None]:
# Probe the retriever with several phrasings of the same question to see
# whether the wording changes which chunk is ranked first.
test_queries = [
    "Mi a bűnszervezet fogalma a Btk. szerint?",
    "bűnszervezet három vagy több személy",
    "Btk 459 bűnszervezet fogalma",
    "büntetőjogi bűnszervezet definíció",
    "criminal organization definition three persons"  # English test
]

print("🧪 [DEBUG] Testing multiple query variations...\n")

# Derive the total from the list so the progress message stays correct when
# queries are added or removed
total_tests = len(test_queries)

# The loop variable is deliberately not called `query`, so the `query`
# defined by the QA-chain cell is not overwritten by the last test phrase
for i, query_variant in enumerate(test_queries, 1):
    print(f"📝 Test {i}/{total_tests}: '{query_variant}'")

    # Test retrieval
    retrieved_docs = local_retriever.invoke(query_variant)

    if retrieved_docs:
        best_doc = retrieved_docs[0]
        distance = best_doc.metadata.get('distance', 'N/A')
        doc_id = best_doc.metadata.get('doc_id', 'N/A')

        # A missing distance falls back to the string 'N/A', which cannot be
        # rendered with a float format spec; show it verbatim in that case
        try:
            distance_text = f"{distance:.4f}"
        except (TypeError, ValueError):
            distance_text = str(distance)

        print(f"  ✅ Best match: {doc_id} (distance: {distance_text})")
        print(f"  📄 Preview: {best_doc.page_content[:100]}...")

        # Check whether the top hit mentions the term being asked about
        if 'bűnszervezet' in best_doc.page_content.lower():
            print("  🎯 CONTAINS bűnszervezet!")
        else:
            print("  ❌ No bűnszervezet mention")
    else:
        print("  ❌ No documents found")

    print("\n" + "-"*60 + "\n")


🧪 [DEBUG] Testing multiple query variations...

📝 Test 1/5: 'Mi a bűnszervezet fogalma a Btk. szerint?'

🔍 [DEBUG] Starting retrieval for query: 'Mi a bűnszervezet fogalma a Btk. szerint?'
🔍 [DEBUG] Getting query embedding...
🔍 [DEBUG] Query embedding shape: (1, 1536)
🔍 [DEBUG] Searching FAISS index for top 5 matches...
🔍 [DEBUG] Found indices: [18778 60987 19714 14937 59587]
🔍 [DEBUG] Distances: [1.8351086 1.8413956 1.8466256 1.8472317 1.8483465]
🔍 [DEBUG] Converting 5 indices to documents...
🔍 [DEBUG] Processing index 1/5: 18778
🔍 [DEBUG] Mapped to chunk_id: 26176412-37c8-432d-b26e-1a1f46014b5c
🔍 [DEBUG] Found document - doc_id: Pfv.20150/2008/5
🔍 [DEBUG] Text preview: módon - utal, vagy akinek a személye a sajtóközlemény tartalmából felismerhető. A kifogásolt közlés ...
🔍 [DEBUG] Processing index 2/5: 60987
🔍 [DEBUG] Mapped to chunk_id: cc66a91d-0ae6-4320-9c64-0cee56871123
🔍 [DEBUG] Found document - doc_id: G.20951/2006/33
🔍 [DEBUG] Text preview: átvételekor az alperes további enged

In [None]:
# Direct test: inspect the chunk stored at one FAISS position (position 3,
# which contains the bűnszervezet definition) and check whether a bare keyword
# query can surface it.
# NOTE(review): this exact cell appears three times in the notebook; one copy is enough.
import traceback

TARGET_INDEX = 3    # FAISS position under inspection
SEARCH_DEPTH = 50   # number of neighbours requested when looking for TARGET_INDEX

try:
    print(f"🧪 [DEBUG] Direct test: Retrieving document at FAISS index {TARGET_INDEX}...")

    # Resolve the FAISS position to its chunk and fetch the stored row
    target_chunk_id = id_mapping[TARGET_INDEX]
    target_rows = df[df['chunk_id'] == target_chunk_id]
    if target_rows.empty:
        # Fail with a readable message instead of an opaque positional IndexError
        raise LookupError(
            f"chunk_id {target_chunk_id} (FAISS index {TARGET_INDEX}) not found in DataFrame"
        )
    doc_row = target_rows.iloc[0]

    print(f"📄 Document at index {TARGET_INDEX}:")
    print(f"   Doc ID: {doc_row['doc_id']}")
    print(f"   Text preview: {doc_row['text'][:300]}...")

    # Query the index directly, bypassing the retriever class
    print(f"\n🧪 Let's manually test if our retriever can find index {TARGET_INDEX}...")

    test_query = "bűnszervezet"
    query_embedding = embeddings.embed_query(test_query)
    query_vector = np.array([query_embedding], dtype='float32')

    # Ask for many more neighbours than the retriever does, to see whether
    # the target appears anywhere near the top
    distances, indices = faiss_index.search(query_vector, SEARCH_DEPTH)

    print(f"Top {SEARCH_DEPTH} search results for '{test_query}':")
    print(f"Indices: {indices[0]}")
    print(f"Distances: {distances[0]}")

    ranked_indices = list(indices[0])
    if TARGET_INDEX in ranked_indices:
        pos = ranked_indices.index(TARGET_INDEX)
        print(f"✅ Index {TARGET_INDEX} found at position {pos+1} with distance {distances[0][pos]:.4f}")
    else:
        print(f"❌ Index {TARGET_INDEX} NOT found in top {SEARCH_DEPTH} results!")
        # NOTE(review): the recorded distances are nearly uniform (~1.83-1.85),
        # which would also be consistent with the index and the query using
        # different embedding models; verify before drawing conclusions
        print("This alone does not prove the embedding model is at fault; also verify that the index was built with the same embedding model used for the query.")

except Exception as e:
    # Include the exception type: a bare KeyError would otherwise print only the key
    print(f"❌ Error: {type(e).__name__}: {e}")
    traceback.print_exc()


🧪 [DEBUG] Direct test: Retrieving document at FAISS index 3...
📄 Document at index 3:
   Doc ID: Bf.23/2020/67
   Text preview: A módosítás 2019. július 10. napján lépett hatályba. [118] A korábban hatályban volt Btk. 459. § (1) bekezdés I. pontja szerint bűnszervezet a három vagy több személyből álló, hosszabb időre szervezett, összehangoltan működő csoport, amelynek célja ötévi vagy ezt meghaladó szabadságvesztéssel büntet...

🧪 Let's manually test if our retriever can find index 3...
Top 50 search results for 'bűnszervezet':
Indices: [51614  6449 20124 20129 48034 14935 17725 11949 12923  2148 18778  4696
 21549 14937 13853 15161 35022 61039 14860 53445  9368 54404 24744 52184
 52205 12700  2101 21545 21668 52158 19120 35051 19714 19923 55324 19154
 14633 17637 60987 10766 18550 14773 21202 18399 19347 19374 54840 12986
  5351  9961]
Distances: [1.8285353 1.8347057 1.8355167 1.8355167 1.838312  1.838475  1.8428354
 1.8431315 1.8433175 1.8434399 1.8441244 1.8441575 1.8455366 1.845714

In [None]:
# Direct test: inspect the chunk stored at one FAISS position (position 3,
# which contains the bűnszervezet definition) and check whether a bare keyword
# query can surface it.
# NOTE(review): this exact cell appears three times in the notebook; one copy is enough.
import traceback

TARGET_INDEX = 3    # FAISS position under inspection
SEARCH_DEPTH = 50   # number of neighbours requested when looking for TARGET_INDEX

try:
    print(f"🧪 [DEBUG] Direct test: Retrieving document at FAISS index {TARGET_INDEX}...")

    # Resolve the FAISS position to its chunk and fetch the stored row
    target_chunk_id = id_mapping[TARGET_INDEX]
    target_rows = df[df['chunk_id'] == target_chunk_id]
    if target_rows.empty:
        # Fail with a readable message instead of an opaque positional IndexError
        raise LookupError(
            f"chunk_id {target_chunk_id} (FAISS index {TARGET_INDEX}) not found in DataFrame"
        )
    doc_row = target_rows.iloc[0]

    print(f"📄 Document at index {TARGET_INDEX}:")
    print(f"   Doc ID: {doc_row['doc_id']}")
    print(f"   Text preview: {doc_row['text'][:300]}...")

    # Query the index directly, bypassing the retriever class
    print(f"\n🧪 Let's manually test if our retriever can find index {TARGET_INDEX}...")

    test_query = "bűnszervezet"
    query_embedding = embeddings.embed_query(test_query)
    query_vector = np.array([query_embedding], dtype='float32')

    # Ask for many more neighbours than the retriever does, to see whether
    # the target appears anywhere near the top
    distances, indices = faiss_index.search(query_vector, SEARCH_DEPTH)

    print(f"Top {SEARCH_DEPTH} search results for '{test_query}':")
    print(f"Indices: {indices[0]}")
    print(f"Distances: {distances[0]}")

    ranked_indices = list(indices[0])
    if TARGET_INDEX in ranked_indices:
        pos = ranked_indices.index(TARGET_INDEX)
        print(f"✅ Index {TARGET_INDEX} found at position {pos+1} with distance {distances[0][pos]:.4f}")
    else:
        print(f"❌ Index {TARGET_INDEX} NOT found in top {SEARCH_DEPTH} results!")
        # NOTE(review): the recorded distances are nearly uniform (~1.83-1.85),
        # which would also be consistent with the index and the query using
        # different embedding models; verify before drawing conclusions
        print("This alone does not prove the embedding model is at fault; also verify that the index was built with the same embedding model used for the query.")

except Exception as e:
    # Include the exception type: a bare KeyError would otherwise print only the key
    print(f"❌ Error: {type(e).__name__}: {e}")
    traceback.print_exc()


🧪 [DEBUG] Direct test: Retrieving document at FAISS index 3...
📄 Document at index 3:
   Doc ID: Bf.23/2020/67
   Text preview: A módosítás 2019. július 10. napján lépett hatályba. [118] A korábban hatályban volt Btk. 459. § (1) bekezdés I. pontja szerint bűnszervezet a három vagy több személyből álló, hosszabb időre szervezett, összehangoltan működő csoport, amelynek célja ötévi vagy ezt meghaladó szabadságvesztéssel büntet...

🧪 Let's manually test if our retriever can find index 3...
Top 50 search results for 'bűnszervezet':
Indices: [51614  6449 20124 20129 48034 14935 17725 11949 12923  2148 18778  4696
 21549 14937 13853 15161 35022 61039 14860 53445  9368 54404 24744 52184
 52205 12700  2101 21545 21668 52158 19120 35051 19714 19923 55324 19154
 14633 17637 60987 10766 18550 14773 21202 18399 19347 19374 54840 12986
  5351  9961]
Distances: [1.8285353 1.8347057 1.8355167 1.8355167 1.838312  1.838475  1.8428354
 1.8431315 1.8433175 1.8434399 1.8441244 1.8441575 1.8455366 1.845714

In [None]:
# Run the original question through the hybrid retriever, first on its own
# and then as the retriever inside the QA chain.
# NOTE(review): `hybrid_retriever` is not created in any cell visible here;
# it must already exist in the kernel before this cell is run.
test_query = "Mi a bűnszervezet fogalma a Btk. szerint?"

print(f"🧪 Testing HYBRID retriever with: '{test_query}'")
print("=" * 80)

# Retrieval only
hybrid_results = hybrid_retriever.invoke(test_query)

print("\n📄 HYBRID RESULTS:")
for i, doc in enumerate(hybrid_results, 1):
    print(f"\n{i}. Document: {doc.metadata['doc_id']}")
    print(f"   Source: {doc.metadata['source']}")
    print(f"   Distance: {doc.metadata['distance']}")
    print(f"   Text preview: {doc.page_content[:200]}...")

    # Report whether the hit literally mentions the term being asked about
    mentions_term = 'bűnszervezet' in doc.page_content.lower()
    print("   ✅ CONTAINS bűnszervezet!" if mentions_term else "   ❌ No bűnszervezet mention")

# Retrieval + answer generation
print("\n" + "=" * 80)
print("🔗 Testing HYBRID retriever in QA chain...")

hybrid_qa_chain = build_qa_chain(retriever=hybrid_retriever)
result = hybrid_qa_chain.invoke(test_query)

# Show the answer wrapped to a readable width
print(f"\n❓ Query: {test_query}")
print("\n✅ HYBRID QA RESULT:")
print("-" * 50)
print(textwrap.fill(result, width=100))


In [None]:
# Direct test: inspect the chunk stored at one FAISS position (position 3,
# which contains the bűnszervezet definition) and check whether a bare keyword
# query can surface it.
# NOTE(review): this exact cell appears three times in the notebook; one copy is enough.
import traceback

TARGET_INDEX = 3    # FAISS position under inspection
SEARCH_DEPTH = 50   # number of neighbours requested when looking for TARGET_INDEX

try:
    print(f"🧪 [DEBUG] Direct test: Retrieving document at FAISS index {TARGET_INDEX}...")

    # Resolve the FAISS position to its chunk and fetch the stored row
    target_chunk_id = id_mapping[TARGET_INDEX]
    target_rows = df[df['chunk_id'] == target_chunk_id]
    if target_rows.empty:
        # Fail with a readable message instead of an opaque positional IndexError
        raise LookupError(
            f"chunk_id {target_chunk_id} (FAISS index {TARGET_INDEX}) not found in DataFrame"
        )
    doc_row = target_rows.iloc[0]

    print(f"📄 Document at index {TARGET_INDEX}:")
    print(f"   Doc ID: {doc_row['doc_id']}")
    print(f"   Text preview: {doc_row['text'][:300]}...")

    # Query the index directly, bypassing the retriever class
    print(f"\n🧪 Let's manually test if our retriever can find index {TARGET_INDEX}...")

    test_query = "bűnszervezet"
    query_embedding = embeddings.embed_query(test_query)
    query_vector = np.array([query_embedding], dtype='float32')

    # Ask for many more neighbours than the retriever does, to see whether
    # the target appears anywhere near the top
    distances, indices = faiss_index.search(query_vector, SEARCH_DEPTH)

    print(f"Top {SEARCH_DEPTH} search results for '{test_query}':")
    print(f"Indices: {indices[0]}")
    print(f"Distances: {distances[0]}")

    ranked_indices = list(indices[0])
    if TARGET_INDEX in ranked_indices:
        pos = ranked_indices.index(TARGET_INDEX)
        print(f"✅ Index {TARGET_INDEX} found at position {pos+1} with distance {distances[0][pos]:.4f}")
    else:
        print(f"❌ Index {TARGET_INDEX} NOT found in top {SEARCH_DEPTH} results!")
        # NOTE(review): the recorded distances are nearly uniform (~1.83-1.85),
        # which would also be consistent with the index and the query using
        # different embedding models; verify before drawing conclusions
        print("This alone does not prove the embedding model is at fault; also verify that the index was built with the same embedding model used for the query.")

except Exception as e:
    # Include the exception type: a bare KeyError would otherwise print only the key
    print(f"❌ Error: {type(e).__name__}: {e}")
    traceback.print_exc()


🧪 [DEBUG] Direct test: Retrieving document at FAISS index 3...
📄 Document at index 3:
   Doc ID: Bf.23/2020/67
   Text preview: A módosítás 2019. július 10. napján lépett hatályba. [118] A korábban hatályban volt Btk. 459. § (1) bekezdés I. pontja szerint bűnszervezet a három vagy több személyből álló, hosszabb időre szervezett, összehangoltan működő csoport, amelynek célja ötévi vagy ezt meghaladó szabadságvesztéssel büntet...

🧪 Let's manually test if our retriever can find index 3...
Top 50 search results for 'bűnszervezet':
Indices: [51614  6449 20124 20129 48034 14935 17725 11949 12923  2148 18778  4696
 21549 14937 13853 15161 35022 61039 14860 53445  9368 54404 24744 52184
 52205 12700  2101 21545 21668 52158 19120 35051 19714 19923 55324 19154
 14633 17637 60987 10766 18550 14773 21202 18399 19347 19374 54840 12986
  5351  9961]
Distances: [1.8285353 1.8347057 1.8355167 1.8355167 1.838312  1.838475  1.8428354
 1.8431315 1.8433175 1.8434399 1.8441244 1.8441575 1.8455366 1.845714