### Benchmarking 

In [1]:
import os
import time
import pandas as pd
import numpy as np
import openpyxl
from tqdm import tqdm
from dotenv import load_dotenv
import warnings

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from sklearn.metrics.pairwise import cosine_similarity
import yaml
import json

# from pydantic import BaseModel, Field
from typing import List

from utils import DocumentCategorizer, CostTracker

# Silence the "Error in LangChainTracer" warnings so they do not clutter cell output.
warnings.filterwarnings(action="ignore", message="Error in LangChainTracer")

# --- Configuration ---
# Load environment variables (such as the OpenAI key checked later) from a .env file.
load_dotenv()
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Switch LangChain tracing off through both flags to avoid serialization errors.
os.environ.update({
    "LANGCHAIN_TRACING_V2": "false",
    "LANGCHAIN_TRACING": "false",
})

# Parse the YAML settings that drive the rest of the notebook.
with open('config.yaml', mode='r') as file:
    config = yaml.safe_load(file)

In [2]:
# Golden source: the benchmark questions together with their reference TOC columns.
# Only the first 10 questions are used in this run.
golden_df = pd.read_excel("questions_benchmarking.xlsx").head(10)

golden_df.head()

Unnamed: 0,Question_ID,Question_Text,Reference,TOC_1,TOC_2,TOC_3,TOC_4,TOC_5
0,1,What is the definition of a 'rating system' fo...,Article 142(1)(1),169,4,5,6,7
1,2,What is the minimum asset threshold for a fina...,Article 142(1)(4)(a),169,4,5,6,7
2,3,"According to the IRB approach, for which risk ...",Article 143(2),170,4,5,6,7
3,4,What is the key standard a competent authority...,Article 144(1)(a),171,4,5,6,7
4,5,For an institution to receive permission to us...,Article 144(1)(b),171,4,5,6,7


In [3]:
# Knowledge base: documents stored together with their precomputed embeddings.
# Keep documents below 20,000 tokens, then take the 135:220 row slice of what remains.
docs_df = pd.read_parquet("data/df_with_embeddings.parquet").loc[
    lambda frame: frame['Token_Count'] < 20_000
][135:220]

docs_df.head()

Unnamed: 0,Part,Title,Chapter,Section,Subsection,Part_Heading,Title_Heading,Chapter_Heading,Section_Heading,Subsection_Heading,Token_Count,Ends_With_Dot,Article_Number,Article_Heading,Text,Text_With_Pagebreaks,id,Category_1,Category_2,combined_text,embedding
135,PART THREE,TITLE II,CHAPTER 2,Section 2,,CAPITAL REQUIREMENTS,CAPITAL REQUIREMENTS FOR CREDIT RISK,Standardised approach,Risk weights,,383,True,114,Article 114 - Exposures to central governments...,1. Exposures to central governments and centra...,1. Exposures to central governments and centra...,135,Credit Risk,Risk Weights and Correlations,Part_Heading: CAPITAL REQUIREMENTS\nTitle_Head...,"[-0.0010255621746182442, 0.01177041046321392, ..."
136,PART THREE,TITLE II,CHAPTER 2,Section 2,,CAPITAL REQUIREMENTS,CAPITAL REQUIREMENTS FOR CREDIT RISK,Standardised approach,Risk weights,,704,True,115,Article 115 - Exposures to regional government...,-1. Exposures to regional governments or local...,-1. Exposures to regional governments or local...,136,Credit Risk,Risk Weights and Correlations,Part_Heading: CAPITAL REQUIREMENTS\nTitle_Head...,"[0.004265729337930679, 0.027158882468938828, 0..."
137,PART THREE,TITLE II,CHAPTER 2,Section 2,,CAPITAL REQUIREMENTS,CAPITAL REQUIREMENTS FOR CREDIT RISK,Standardised approach,Risk weights,,475,True,116,Article 116 - Exposures to public sector entities,1. Exposures to public sector entities for whi...,1. Exposures to public sector entities for whi...,137,Credit Risk,Risk Weights and Correlations,Part_Heading: CAPITAL REQUIREMENTS\nTitle_Head...,"[-0.006716115400195122, 0.00666304724290967, 0..."
138,PART THREE,TITLE II,CHAPTER 2,Section 2,,CAPITAL REQUIREMENTS,CAPITAL REQUIREMENTS FOR CREDIT RISK,Standardised approach,Risk weights,,397,True,117,Article 117 - Exposures to multilateral develo...,1. Exposures to multilateral development banks...,1. Exposures to multilateral development banks...,138,Credit Risk,Risk Weights and Correlations,Part_Heading: CAPITAL REQUIREMENTS\nTitle_Head...,"[-0.004944275598973036, 0.0316530279815197, 0...."
139,PART THREE,TITLE II,CHAPTER 2,Section 2,,CAPITAL REQUIREMENTS,CAPITAL REQUIREMENTS FOR CREDIT RISK,Standardised approach,Risk weights,,110,True,118,Article 118 - Exposures to international organ...,Exposures to the following international organ...,Exposures to the following international organ...,139,Credit Risk,Risk Weights and Correlations,Part_Heading: CAPITAL REQUIREMENTS\nTitle_Head...,"[-0.007577173411846161, 0.045905862003564835, ..."


In [4]:
# Classes and functions
# Structured-output schema for the full-context strategy: DocumentRetriever passes this
# model to `llm.with_structured_output(...)` and reads `.ids` from the returned object.
# NOTE(review): the class docstring and the Field description are presumably embedded in
# the schema sent to the model, so treat them as runtime text — confirm before rewording.
class TocList(BaseModel):
    """A data model for a list of Table of Contents IDs."""
    # IDs are strings; the retriever validates them against the `id` column cast to str.
    ids: List[str] = Field(description="An array of document ID strings.")

    
class DocumentRetriever:
    """Retrieve the ``k`` most relevant document IDs for a question.

    Four strategies are available through :meth:`retrieve`:

    * ``'A'`` - cosine similarity between the question embedding and every document embedding.
    * ``'B'`` - category filter only: the first ``k`` documents of the category-matched pool.
    * ``'C'`` - hybrid: category filter, then cosine-similarity ranking inside that pool.
    * ``'D'`` - full context: all documents are put into one LLM prompt and the model picks IDs.

    Every strategy returns document IDs as strings. Costs are recorded on the shared
    ``cost_tracker`` using a rough "one token per four characters" estimate.
    """

    def __init__(self, df, config, cost_tracker, skip_embedding_generation=False):
        """Set up models, caches and (optionally) document embeddings.

        Args:
            df: Documents with at least ``id``, ``Text``, ``Category_1`` and ``Category_2``
                columns; a private copy is stored so the caller's frame is not modified.
            config: Mapping providing ``embedding_model``, ``llm_model``, ``Category_1``,
                ``Category_2`` and (optionally) ``use_cache``.
            cost_tracker: Object exposing ``add_cost(tokens, kind, label)``.
            skip_embedding_generation: When True, reuse the ``embedding`` column already
                present in ``df`` instead of calling the embedding API.
        """
        self.df = df.copy()
        self.config = config
        self.cost_tracker = cost_tracker
        self.llm_cache = {}
        # Per-question category cache; created eagerly so no method needs hasattr() checks.
        self._cat_cache = {}
        self.embedding_model = OpenAIEmbeddings(model=config["embedding_model"])
        self.llm = ChatOpenAI(model=config["llm_model"], temperature=0)
        self.categorizer = DocumentCategorizer(config["Category_1"], config["Category_2"], self.llm, cost_tracker)

        if not skip_embedding_generation:
            print("Generating document embeddings...")
            texts = self.df['Text'].tolist()
            total_tokens = sum(self._estimate_tokens(text) for text in texts)
            self.cost_tracker.add_cost(total_tokens, 'embedding', 'setup_embedding')
            self.df['embedding'] = self.embedding_model.embed_documents(texts)
        else:
            print("Using existing embeddings from dataframe...")
            # Ensure embeddings are in the right format (list of floats)
            if 'embedding' in self.df.columns:
                self.df['embedding'] = self.df['embedding'].apply(lambda x: x if isinstance(x, list) else list(x))

    @staticmethod
    def _estimate_tokens(text):
        """Approximate a token count as one token per four characters."""
        return len(text) // 4

    def _candidate_pool(self, cat1, cat2):
        """Return documents matching the question's categories, most specific first.

        Rows matching both categories come first, followed by the remaining rows that
        match only ``Category_1``; duplicates (by ``id``) are dropped and the index reset.
        """
        same_parent = self.df['Category_1'] == cat1
        exact = self.df[same_parent & (self.df['Category_2'] == cat2)]
        broad = self.df[same_parent]
        return pd.concat([exact, broad]).drop_duplicates(subset=['id']).reset_index(drop=True)

    def _categorize_question(self, question, question_id):
        """Return ``(cat1, cat2)`` for a question, calling the categorizer at most once per id.

        The categorization cost is recorded only on a cache miss.
        """
        if question_id in self._cat_cache:
            return self._cat_cache[question_id]

        cat1, cat2 = self.categorizer.categorize_text(question)
        self.cost_tracker.add_cost(self._estimate_tokens(question), 'llm_input', 'query_categorization')
        self.cost_tracker.add_cost(
            self._estimate_tokens(cat1) + self._estimate_tokens(cat2),
            'llm_output',
            'query_categorization',
        )
        self._cat_cache[question_id] = (cat1, cat2)
        return cat1, cat2

    def retrieve(self, s_name, question, k, q_id):
        """Run one strategy and time it.

        Args:
            s_name: Strategy code, one of ``'A'``, ``'B'``, ``'C'``, ``'D'``.
            question: Question text.
            k: Number of document IDs to return.
            q_id: Question identifier, used as the categorization cache key.

        Returns:
            ``(tocs, latency_ms, cost, categories)`` where ``categories`` is the
            ``[cat1, cat2]`` list for the question (computed for every strategy).

        Raises:
            ValueError: If ``s_name`` is not a known strategy code.
        """
        start = time.time()

        # Always categorize the question once, regardless of strategy, so the categories
        # are consistent across strategies; B and C then hit the cache.
        categories = list(self._categorize_question(question, q_id))

        if s_name == 'A':
            tocs, cost = self.retrieve_by_embedding(question, k)
        elif s_name == 'B':
            tocs, cost, _ = self.retrieve_by_category(question, k, q_id)
        elif s_name == 'C':
            tocs, cost, _ = self.retrieve_hybrid(question, k, q_id)
        elif s_name == 'D':
            tocs, cost = self.retrieve_full_context(question, k)
        else:
            raise ValueError(f"Unknown strategy: {s_name}")

        return tocs, (time.time() - start) * 1000, cost, categories

    def retrieve_by_embedding(self, question, k):
        """Strategy A: rank all documents by cosine similarity to the question embedding.

        Returns:
            ``(ids, cost)`` with the top-``k`` IDs as strings and the query-embedding cost.
        """
        q_emb = self.embedding_model.embed_query(question)
        cost = self.cost_tracker.add_cost(self._estimate_tokens(question), "embedding", "query_embedding")
        sims = cosine_similarity([q_emb], np.array(self.df['embedding'].tolist()))[0]
        top_positions = np.argsort(sims)[::-1][:k]
        # IDs are returned as strings so they compare equal to the golden-source IDs.
        return self.df.iloc[top_positions]['id'].astype(str).tolist(), cost

    def retrieve_by_category(self, question, k, q_id):
        """Strategy B: take the first ``k`` documents of the category-matched pool.

        No embedding or LLM call is made here, so the reported cost is always 0.

        Returns:
            ``(ids, 0, [cat1, cat2])``; ``ids`` is empty when no document matches.
        """
        cat1, cat2 = self._categorize_question(question, q_id)
        pool = self._candidate_pool(cat1, cat2)

        if pool.empty:
            return [], 0, [cat1, cat2]

        return pool['id'].head(k).astype(str).tolist(), 0, [cat1, cat2]

    def retrieve_hybrid(self, question, k, q_id):
        """Strategy C: category filter followed by cosine-similarity ranking.

        The embedding cost is recorded only when the embedding call is actually made,
        i.e. not when the category filter leaves no candidates.

        Returns:
            ``(ids, cost, [cat1, cat2])``; ``([], 0, [cat1, cat2])`` for an empty pool.
        """
        cat1, cat2 = self._categorize_question(question, q_id)
        pool = self._candidate_pool(cat1, cat2)

        if pool.empty:
            return [], 0, [cat1, cat2]

        q_emb = self.embedding_model.embed_query(question)
        cost = self.cost_tracker.add_cost(self._estimate_tokens(question), "embedding", "query_embedding")
        pool['sim'] = cosine_similarity([q_emb], np.array(pool['embedding'].tolist()))[0]

        ranked_ids = pool.sort_values('sim', ascending=False).head(k)['id'].astype(str).tolist()
        return ranked_ids, cost, [cat1, cat2]

    def retrieve_full_context(self, question, k):
        """Strategy D: let the LLM choose ``k`` IDs from a prompt containing every document.

        Returned IDs are validated against the known IDs, de-duplicated (keeping the
        model's order) and, if fewer than ``k`` remain, padded with unused IDs in
        dataframe order so the result is reproducible across kernel restarts.

        Results are cached per question text when ``config['use_cache']`` is truthy;
        a cache hit reports a cost of 0. The cache key does not include ``k``.

        Returns:
            ``(ids, cost)``.
        """
        use_cache = self.config.get('use_cache', False)
        if use_cache and question in self.llm_cache:
            # Hand back a copy so callers cannot mutate the cached list.
            return list(self.llm_cache[question][0]), 0

        ordered_ids = self.df['id'].astype(str).tolist()
        valid_ids = set(ordered_ids)
        context_str = "\n".join(
            f"ID {doc_id}: {text}" for doc_id, text in zip(self.df['id'], self.df['Text'])
        )

        # Configure the LLM to use the Pydantic model for structured output
        structured_llm = self.llm.with_structured_output(TocList)

        # Same prompt text as before (including the 4-space indentation of the lower
        # lines), spelled out explicitly so the whitespace is not editor-dependent.
        prompt = (
            f"Based on the documents provided, select the {k} most relevant document IDs for the following question.\n"
            "    \n"
            "    Documents:\n"
            f"    {context_str}\n"
            "    \n"
            f"    Question: {question}"
        )

        # The 'invoke' method with a structured output model returns a Pydantic object
        raw_ids = structured_llm.invoke(prompt).ids

        # Recreate the JSON string output (before validation) for the cost estimate
        output_str = json.dumps({"ids": raw_ids})

        # Keep only known IDs, drop repeats, preserve the model's ranking.
        tocs = []
        for toc_id in raw_ids:
            toc_id = str(toc_id)
            if toc_id in valid_ids and toc_id not in tocs:
                tocs.append(toc_id)
        tocs = tocs[:k]

        # Pad deterministically (dataframe order) when the model returned too few IDs.
        if len(tocs) < k:
            chosen = set(tocs)
            for candidate in ordered_ids:
                if len(tocs) >= k:
                    break
                if candidate not in chosen:
                    tocs.append(candidate)
                    chosen.add(candidate)

        cost = self.cost_tracker.add_cost(self._estimate_tokens(prompt), 'llm_input', 'query_llm_context') + \
               self.cost_tracker.add_cost(self._estimate_tokens(output_str), 'llm_output', 'query_llm_context')

        if use_cache:
            self.llm_cache[question] = (list(tocs), cost)

        return tocs, cost

class Evaluator:
    """Score retrieved document IDs against a golden source of expected IDs.

    The golden source maps each ``Question_ID`` to the set of IDs found in its TOC
    columns. IDs are compared as strings, matching what the retriever returns.
    """

    def __init__(self, golden_source_df, k, toc_cols=None):
        """Build the question -> golden-ID-set lookup.

        Args:
            golden_source_df: Frame with a ``Question_ID`` column and the TOC columns.
            k: Number of documents requested per query; used as the precision denominator.
            toc_cols: Names of the columns holding golden IDs. Defaults to
                ``TOC_1`` ... ``TOC_5``.
        """
        self.k = k
        self.golden_df = golden_source_df
        if toc_cols is None:
            toc_cols = [f'TOC_{i}' for i in range(1, 6)]
        self.toc_cols = list(toc_cols)

        # itertuples keeps each column's own dtype, so integer columns are not upcast
        # to float just because a sibling TOC column contains a missing value.
        golden_rows = self.golden_df[self.toc_cols].itertuples(index=False, name=None)
        self.golden_map = {
            q_id: {norm for norm in map(self._normalize_id, values) if norm is not None}
            for q_id, values in zip(self.golden_df['Question_ID'], golden_rows)
        }

    @staticmethod
    def _normalize_id(value):
        """Convert one golden-source cell to its string ID, or None when it is missing.

        Integral floats (e.g. ``169.0`` from a column that also holds NaN) become
        ``'169'`` so they still match the retriever's string IDs.
        """
        if pd.isna(value):
            return None
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    def evaluate_results(self, results_df):
        """Add ``golden_tocs``, ``matches`` and ``precision`` columns to ``results_df``.

        Args:
            results_df: Frame with ``question_id`` and ``retrieved_tocs`` (list of IDs)
                columns. It is modified in place and also returned.

        Returns:
            The same frame with the three evaluation columns added. ``precision`` is
            ``matches / k`` (0 when ``k`` is not positive).
        """
        # Sorted so the displayed golden list is stable between runs.
        results_df['golden_tocs'] = results_df['question_id'].map(
            lambda q_id: sorted(self.golden_map.get(q_id, set()))
        )
        # Plain comprehension instead of DataFrame.apply so an empty frame also works.
        results_df['matches'] = [
            len({str(toc) for toc in retrieved}.intersection(golden))
            for retrieved, golden in zip(results_df['retrieved_tocs'], results_df['golden_tocs'])
        ]
        results_df['precision'] = results_df['matches'] / self.k if self.k > 0 else 0
        return results_df


# Fail fast when the OpenAI credentials are missing.
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY environment variable not set. Please create a .env file or set it in your notebook.")

# Initialize components
cost_tracker = CostTracker(config)
llm = ChatOpenAI(model=config["llm_model"], temperature=0)

# Print available columns for debugging
print(f"Available columns in parquet file: {docs_df.columns.tolist()}")
print(f"Number of documents: {len(docs_df)}")

# Check for required columns
if 'combined_text' not in docs_df.columns:
    raise ValueError("No 'combined_text' column found in the parquet file. Please check column names.")
if 'embedding' not in docs_df.columns:
    raise ValueError("No 'embedding' column found in the parquet file.")

# Keep only relevant columns to save memory and avoid duplicates.
# 'combined_text' takes over the 'Text' name that DocumentRetriever reads.
docs_df = docs_df[['id', 'combined_text', 'embedding', 'Category_1', 'Category_2']].copy()
docs_df = docs_df.rename(columns={'combined_text': 'Text'})

# Verify embedding format (convert to list if needed)
first_embedding = docs_df['embedding'].iloc[0]
if isinstance(first_embedding, np.ndarray):
    print("Converting embeddings from numpy arrays to lists...")
    docs_df['embedding'] = docs_df['embedding'].apply(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)

print(f"Sample text (first 200 chars): {docs_df['Text'].iloc[0][:200]}...")
print(f"Embedding dimension: {len(docs_df['embedding'].iloc[0])}")

# Initialize retriever with existing embeddings
skip_embedding_generation = 'embedding' in docs_df.columns
retriever = DocumentRetriever(docs_df, config, cost_tracker, skip_embedding_generation=skip_embedding_generation)

# Run retrieval strategies for all questions, collecting one flat record per
# (question, strategy) pair.
all_results = []
print("\nRunning retrieval strategies for all questions...")

for _, row in tqdm(golden_df.iterrows(), total=len(golden_df), desc="Evaluating Questions"):
    q_id, question = row['Question_ID'], row['Question_Text']

    # Snapshot the running categorization total rather than overwriting it with 0:
    # the per-question cost is the delta, and the tracker keeps its cumulative figure.
    # NOTE(review): assumes cost_breakdown is dict-like (supports .get) — confirm in utils.
    q_cat_cost_before = cost_tracker.cost_breakdown.get('query_categorization', 0)

    # Run each strategy and collect results
    for strategy_code in ['A', 'B', 'C', 'D']:
        tocs, latency, cost, categories = retriever.retrieve(strategy_code, question, config["retrieval_k"], q_id)

        # Only the category-based strategies are charged for categorization, even though
        # the (cached) categorization itself runs once per question for every strategy.
        if strategy_code in ['B', 'C']:
            categorization_cost = cost_tracker.cost_breakdown.get('query_categorization', 0) - q_cat_cost_before
        else:
            categorization_cost = 0

        all_results.append({
            'question_id': q_id,
            'question_text': question,
            'strategy': strategy_code,
            'retrieved_tocs': tocs,
            'latency': latency,
            'cost': cost,
            'categorization_cost': categorization_cost,
            'total_cost': cost + categorization_cost,
            'categories': categories  # Populated for every strategy (same value per question)
        })

    # Clear categorization cache after each question
    if hasattr(retriever, '_cat_cache'):
        retriever._cat_cache.clear()

# Build the results table in one step from the collected records
results_df = pd.DataFrame(all_results)

# Add golden_tocs / matches / precision columns
evaluator = Evaluator(golden_df, config["retrieval_k"])
results_df = evaluator.evaluate_results(results_df)

Available columns in parquet file: ['Part', 'Title', 'Chapter', 'Section', 'Subsection', 'Part_Heading', 'Title_Heading', 'Chapter_Heading', 'Section_Heading', 'Subsection_Heading', 'Token_Count', 'Ends_With_Dot', 'Article_Number', 'Article_Heading', 'Text', 'Text_With_Pagebreaks', 'id', 'Category_1', 'Category_2', 'combined_text', 'embedding']
Number of documents: 85
Converting embeddings from numpy arrays to lists...
Sample text (first 200 chars): Part_Heading: CAPITAL REQUIREMENTS
Title_Heading: CAPITAL REQUIREMENTS FOR CREDIT RISK
Chapter_Heading: Standardised approach
Section_Heading: Risk weights
Article_Heading: Article 114 - Exposures to ...
Embedding dimension: 1536
Using existing embeddings from dataframe...

Running retrieval strategies for all questions...


Evaluating Questions: 100%|████████████████████████████████████████████████████████████| 10/10 [00:29<00:00,  2.95s/it]


In [5]:
# Show the per-question, per-strategy results, including the golden_tocs / matches /
# precision columns added by Evaluator.evaluate_results.
results_df

Unnamed: 0,question_id,question_text,strategy,retrieved_tocs,latency,cost,categorization_cost,total_cost,categories,golden_tocs,matches,precision
0,1,What is the definition of a 'rating system' fo...,A,"[194, 171, 172, 175, 200]",2093.179226,4e-07,0.0,4e-07,"[IRB Approach Requirements, Other]","[7, 6, 5, 169, 4]",0,0.0
1,1,What is the definition of a 'rating system' fo...,B,"[171, 172, 175, 176]",2.470732,0.0,7e-06,7.2e-06,"[IRB Approach Requirements, Other]","[7, 6, 5, 169, 4]",0,0.0
2,1,What is the definition of a 'rating system' fo...,C,"[171, 172, 175, 176]",528.44739,4e-07,7e-06,7.6e-06,"[IRB Approach Requirements, Other]","[7, 6, 5, 169, 4]",0,0.0
3,1,What is the definition of a 'rating system' fo...,D,"[169, 170, 171, 172, 173]",2280.378103,0.0072384,0.0,0.0072384,"[IRB Approach Requirements, Other]","[7, 6, 5, 169, 4]",1,0.2
4,2,What is the minimum asset threshold for a fina...,A,"[169, 175, 177, 174, 171]",2160.121441,8.6e-07,0.0,8.6e-07,"[Credit Risk, IRB Approach Requirements]","[7, 6, 5, 169, 4]",1,0.2
5,2,What is the minimum asset threshold for a fina...,B,"[185, 194, 199, 200, 211]",2.782583,0.0,1.1e-05,1.125e-05,"[Credit Risk, IRB Approach Requirements]","[7, 6, 5, 169, 4]",0,0.0
6,2,What is the minimum asset threshold for a fina...,C,"[169, 177, 174, 195, 142]",360.909224,8.6e-07,1.1e-05,1.211e-05,"[Credit Risk, IRB Approach Requirements]","[7, 6, 5, 169, 4]",1,0.2
7,2,What is the minimum asset threshold for a fina...,D,"[169, 170, 171, 172, 173]",1138.749361,0.007242,0.0,0.007242,"[Credit Risk, IRB Approach Requirements]","[7, 6, 5, 169, 4]",1,0.2
8,3,"According to the IRB approach, for which risk ...",A,"[172, 171, 170, 176, 204]",1405.457973,8e-07,0.0,8e-07,"[IRB Approach Requirements, IRB Approach Permi...","[7, 6, 5, 170, 4]",1,0.2
9,3,"According to the IRB approach, for which risk ...",B,"[175, 171, 172, 176]",2.666235,0.0,1.3e-05,1.26e-05,"[IRB Approach Requirements, IRB Approach Permi...","[7, 6, 5, 170, 4]",0,0.0
