In [1]:
# Import necessary packages
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
import pickle
import faiss

# Import LangChain and OpenAI components
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import PromptTemplate

# Import our custom retriever classes
from src.rag.retriever import CustomRetriever, RerankingRetriever

# Load environment variables from .env file
load_dotenv()

# Set the working directory to the project root if running from the 'notebooks' folder
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")
    
# Load data and models from paths specified in environment variables
df = pd.read_parquet(os.getenv("PARQUET_PATH"))
faiss_index = faiss.read_index(os.getenv("FAISS_INDEX_PATH"))
with open(os.getenv("ID_MAPPING_PATH"), "rb") as f:
    id_mapping = pickle.load(f)

# Initialize the OpenAI embedding model
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

In [2]:
# 1. Initialize the base CustomRetriever
# We retrieve more documents (k=20) initially to provide a good selection for the reranker to work with.
custom_retriever = CustomRetriever(
    embeddings=embeddings,
    faiss_index=faiss_index,
    id_mapping=id_mapping,
    documents_df=df,
    k=20
)

# 2. Load and create the reranker prompt
with open("src/prompts/reranker_prompt.txt", "r", encoding="utf-8") as f:
    reranker_prompt_template = f.read()

reranker_prompt = PromptTemplate.from_template(reranker_prompt_template)

# 3. Initialize the reranker LLM
# We use an efficient yet cost-effective model (gpt-4o-mini).
# A temperature of 0 ensures deterministic, fact-based operation.
reranker_llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    openai_api_key=os.getenv("OPENAI_API_KEY")
)

# 4. Initialize the RerankingRetriever
# This component ties the process together. The "k" parameter here 
# specifies the final number of documents to be returned to the user.
reranking_retriever = RerankingRetriever(
    retriever=custom_retriever,
    llm=reranker_llm,
    reranker_prompt=reranker_prompt,
    k=5
)

In [None]:
# A specific query asking for a legal definition in Hungarian
query = "Mi a polgári perrendtartásról szóló törvény szerint a felperes és az alperes fogalma, és mi a különbség közöttük?"

# Retrieve relevant documents using the RerankingRetriever
final_docs = reranking_retriever.get_relevant_documents(query)

# Print the results
print(f"Query: {query}\\n")
print("--- Reranked Documents ---\\n")

for i, doc in enumerate(final_docs, 1):
    print(f"📄 Document {i} (ID: {doc.metadata.get('doc_id')})")
    print(f"   Reranker Score: {doc.metadata.get('reranker_score')}")
    print(f"   Reason: {doc.metadata.get('reranker_reason')}")
    print(f"   Original Text: {doc.page_content[:400]}...\\n")

KeyError: 'Input to PromptTemplate is missing variables {\'\\n  "ranked_documents"\'}.  Expected: [\'\\n  "ranked_documents"\', \'documents\', \'k\', \'query\'] Received: [\'query\', \'documents\', \'k\']\nNote: if you intended {\n  "ranked_documents"} to be part of the string and not a variable, please escape it with double curly braces like: \'{{\n  "ranked_documents"}}\'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT '