In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_core.load import dumps, loads
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

from utils_openai import (
    setup_openai_api, create_embeddings, create_llm,
    load_msme_data, create_vectorstore, get_baseline_prompt
)

# Progress marker: this line is only reached if every import in this cell resolved.
print("[OK] Imports successful!")

In [None]:
# --- Credentials and model handles (all built by the utils_openai helpers) ---
api_key = setup_openai_api()
embeddings = create_embeddings(api_key)
llm = create_llm(api_key)

# --- Corpus: documents, their metadata and their ids, read from the CSV ---
docs, metas, ids = load_msme_data('msme.csv')

# --- Vector store over the corpus ---
# NOTE(review): 'msme_t3' and './chroma_db_t3' look like a collection name and
# an on-disk persistence directory; confirm against create_vectorstore.
vectorstore = create_vectorstore(
    docs,
    metas,
    ids,
    embeddings,
    'msme_t3',
    './chroma_db_t3',
)

# Uncompressed baseline retriever, configured with k=5 in its search kwargs.
base_search_kwargs = {'k': 5}
base_retriever = vectorstore.as_retriever(search_kwargs=base_search_kwargs)
print('[OK] Base retriever ready!')

In [None]:
# Document compressor driven by the same `llm` that later answers the question.
# LLMChainExtractor must be imported in the notebook's import cell
# (langchain.retrievers.document_compressors); without that import this cell
# raises NameError on a fresh kernel.
compressor = LLMChainExtractor.from_llm(llm)
print('[OK] Compressor created!')

In [None]:
# Wrap the plain vector-store retriever so that whatever it returns is passed
# through `compressor` before being handed to the caller.
# ContextualCompressionRetriever must be imported in the notebook's import cell
# (langchain.retrievers); without that import this cell raises NameError on a
# fresh kernel.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=base_retriever
)
print('[OK] Compression retriever ready!')

In [None]:
# Prompt template supplied by the shared helper module.
prompt = get_baseline_prompt()

# Input mapping for the chain: the incoming value is sent to the compression
# retriever to produce 'context', and forwarded untouched as 'question'.
chain_inputs = {'context': compression_retriever, 'question': RunnablePassthrough()}

# inputs -> prompt -> LLM -> plain string
compression_rag_chain = chain_inputs | prompt | llm | StrOutputParser()
print('[OK] Compression RAG chain ready!')

In [None]:
def estimate_tokens(text, chars_per_token=4):
    """
    Rough token estimate (acceptable for analysis).

    Applies the OpenAI rule of thumb that 1 token ≈ 4 characters. This is a
    character-count heuristic, not a real tokenizer, so treat the result as an
    order-of-magnitude figure.

    Args:
        text: String whose size should be estimated.
        chars_per_token: Positive integer number of characters assumed per
            token. Defaults to 4, which reproduces the original behaviour.

    Returns:
        int: ``len(text) // chars_per_token`` (floor division, so any
        trailing partial token is dropped).

    Raises:
        ValueError: If ``chars_per_token`` is zero or negative.
    """
    # Fail with a clear message instead of a ZeroDivisionError or a negative count.
    if chars_per_token <= 0:
        raise ValueError("chars_per_token must be a positive integer")
    return len(text) // chars_per_token


In [None]:
def inspect_retrieval(retriever, query, label):
    """
    Run one query through a retriever and print a short size report.

    Args:
        retriever: Retriever exposing the Runnable ``invoke(query)`` method and
            returning an iterable of documents that have ``page_content``.
        query: Query passed to the retriever.
        label: Heading printed above the report to identify the retriever.

    Returns:
        str: The ``page_content`` of every retrieved document, joined with
        blank lines.
    """
    # `invoke` is the Runnable entry point; it replaces the deprecated
    # `get_relevant_documents` call and returns the same list of documents.
    docs = retriever.invoke(query)
    combined_text = "\n\n".join(doc.page_content for doc in docs)

    print(f"\n--- {label} ---")
    print(f"Documents retrieved: {len(docs)}")
    print(f"Estimated tokens: {estimate_tokens(combined_text)}")

    return combined_text


In [None]:
# One probe question, sent through both retrievers so their reports can be
# compared side by side.
query = "How can small businesses access government funding?"

# Baseline: documents exactly as the vector store returns them.
base_context = inspect_retrieval(base_retriever, query, "BASE RETRIEVER (NO COMPRESSION)")

# Same query through the compression wrapper.
compressed_context = inspect_retrieval(compression_retriever, query, "CONTEXTUAL COMPRESSION RETRIEVER")
