In [1]:
import os
import time
import pandas as pd
import numpy as np
import openpyxl
from tqdm import tqdm
from dotenv import load_dotenv
import warnings

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from sklearn.metrics.pairwise import cosine_similarity
import yaml

from utils import DocumentCategorizer

# Suppress LangChain tracer warnings
warnings.filterwarnings("ignore", message="Error in LangChainTracer")

# --- Configuration ---
load_dotenv()

# Disable LangChain tracing to avoid serialization errors
os.environ["LANGCHAIN_TRACING_V2"] = "false"
os.environ["LANGCHAIN_TRACING"] = "false"

# Load the configuration
with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file)
    
# --- Helper Functions & Setup ---
def create_dummy_golden_file():
    """Creates dummy golden source file for demonstration if it doesn't exist."""
    golden_file = "golden_source.xlsx"
    if not os.path.exists(golden_file):
        print(f"Creating dummy file: {golden_file}")
        # Adjusted TOC numbers to match the format we'll generate (row_index//10+1).(row_index%10+1)
        pd.DataFrame({
            "Question_ID": [f"Q{i}" for i in range(1, 6)],
            "Question_Text": [
                "What are the IRB requirements for PD models?",
                "How is LGD calculated under the SA approach?",
                "What is the difference between PD and in-default LGD?",
                "What are the key components of IRB models?",
                "How does SA handle LGD estimation?"
            ],
            "TOC_1": ["1.1", "1.2", "1.3", "1.4", "1.5"],
            "TOC_2": ["2.1", "2.2", "2.3", "2.4", "2.5"],
            "TOC_3": ["3.1", "3.2", "3.3", "3.4", "3.5"],
            "TOC_4": ["4.1", "4.2", "4.3", "4.4", "4.5"],
            "TOC_5": ["5.1", "5.2", "5.3", "5.4", "5.5"],
        }).to_excel(golden_file, index=False)
        print(f"Note: Please update {golden_file} with actual TOC references from your documents.")

# --- System Components ---

class CostTracker:
    def __init__(self, config):
        self.config, self.total_cost, self.cost_breakdown = config, 0, {
            "setup_categorization": 0, "setup_embedding": 0, "query_categorization": 0,
            "query_embedding": 0, "query_llm_context": 0
        }
    def _calculate_cost(self, tokens, type):
        if type == "embedding": return (tokens / 1_000_000) * self.config['cost_embedding_per_1m_tokens']
        if type == "llm_input": return (tokens / 1_000_000) * self.config['cost_llm_input_per_1m_tokens']
        if type == "llm_output": return (tokens / 1_000_000) * self.config['cost_llm_output_per_1m_tokens']
        return 0
        
    def add_cost(self, tokens, type, component):
        cost = self._calculate_cost(tokens, type)
        self.total_cost += cost
        if component in self.cost_breakdown: self.cost_breakdown[component] += cost
        return cost
        
    def get_summary(self): return {"total_cost": self.total_cost, "breakdown": self.cost_breakdown}

class DocumentRetriever:
    def __init__(self, df, config, cost_tracker, skip_embedding_generation=False):
        self.df, self.config, self.cost_tracker, self.llm_cache = df.copy(), config, cost_tracker, {}
        self.embedding_model = OpenAIEmbeddings(model=config["embedding_model"])
        self.llm = ChatOpenAI(model=config["llm_model"], temperature=0)
        self.categorizer = DocumentCategorizer(config["categories_1"], config["categories_2"], self.llm, cost_tracker)
        
        if not skip_embedding_generation:
            print("Generating document embeddings...")
            texts, total_tokens = self.df['Text'].tolist(), sum(len(t)//4 for t in self.df['Text'])
            self.cost_tracker.add_cost(total_tokens, 'embedding', 'setup_embedding')
            self.df['embedding'] = self.embedding_model.embed_documents(texts)
        else:
            print("Using existing embeddings from dataframe...")
            # Ensure embeddings are in the right format (list of floats)
            if 'embedding' in self.df.columns:
                self.df['embedding'] = self.df['embedding'].apply(lambda x: x if isinstance(x, list) else list(x))
        
    def _categorize_question(self, question, question_id):
        if hasattr(self, '_cat_cache') and question_id in self._cat_cache: return self._cat_cache[question_id]
        input_tokens, (cat1, cat2) = len(question)//4, self.categorizer.categorize_text(question)
        output_tokens = len(cat1)//4 + len(cat2)//4
        self.cost_tracker.add_cost(input_tokens, 'llm_input', 'query_categorization')
        self.cost_tracker.add_cost(output_tokens, 'llm_output', 'query_categorization')
        if not hasattr(self, '_cat_cache'): self._cat_cache = {}
        self._cat_cache[question_id] = (cat1, cat2)
        return cat1, cat2
        
    def retrieve(self, s_name, question, k, q_id):
        start = time.time()
        categories = None  # Initialize categories
        
        if s_name == 'A': 
            tocs, cost = self.retrieve_by_embedding(question, k)
        elif s_name == 'B': 
            tocs, cost, categories = self.retrieve_by_category(question, k, q_id)
        elif s_name == 'C': 
            tocs, cost, categories = self.retrieve_hybrid(question, k, q_id)
        elif s_name == 'D': 
            tocs, cost = self.retrieve_full_context(question, k)
        else: 
            raise ValueError(f"Unknown strategy: {s_name}")
            
        return tocs, (time.time() - start) * 1000, cost, categories
        
    def retrieve_by_embedding(self, question, k):
        q_emb = self.embedding_model.embed_query(question)
        cost = self.cost_tracker.add_cost(len(question)//4, "embedding", "query_embedding")
        sims = cosine_similarity([q_emb], np.array(self.df['embedding'].tolist()))[0]
        return self.df.iloc[np.argsort(sims)[::-1][:k]]['TOC_Number'].tolist(), cost
        
    def retrieve_by_category(self, question, k, q_id):
        cat1, cat2 = self._categorize_question(question, q_id)
        df = self.df[(self.df['Category_1'] == cat1) & (self.df['Category_2'] == cat2)]
        if df.empty: df = self.df[self.df['Category_1'] == cat1]
        return df['TOC_Number'].head(k).tolist(), 0, [cat1, cat2]
        
    def retrieve_hybrid(self, question, k, q_id):
        cost = self.cost_tracker.add_cost(len(question)//4, "embedding", "query_embedding")
        cat1, cat2 = self._categorize_question(question, q_id)
        # --- THIS IS THE CORRECTED LINE ---
        df = pd.concat([
            self.df[(self.df['Category_1']==cat1)&(self.df['Category_2']==cat2)], 
            self.df[self.df['Category_1']==cat1]
        ]).drop_duplicates(subset=['TOC_Number']).reset_index(drop=True)
        # --- END OF CORRECTION ---
        if df.empty: return [], cost, [cat1, cat2]
        q_emb = self.embedding_model.embed_query(question)
        df['sim'] = cosine_similarity([q_emb], np.array(df['embedding'].tolist()))[0]
        return df.sort_values('sim', ascending=False).head(k)['TOC_Number'].tolist(), cost, [cat1, cat2]
        
    def retrieve_full_context(self, question, k):
        if self.config['use_cache'] and question in self.llm_cache: return self.llm_cache[question][0], 0
        context_str = "\n".join([f"TOC {row['TOC_Number']}: {row['Text']}" for _, row in self.df.iterrows()])
        prompt = f"""Given the document context below, identify the TOP {k} `TOC_Number`s most relevant to the user's question. Return only a comma-separated list of TOC numbers (e.g., 1.1, 2.3, 3.2).
CONTEXT:
---
{context_str}
---
QUESTION: "{question}"
Relevant TOC_Numbers:
"""
        res = self.llm.invoke(prompt).content.strip()
        tocs = [t.strip() for t in res.split(',')]
        cost = self.cost_tracker.add_cost(len(prompt)//4, 'llm_input', 'query_llm_context') + \
               self.cost_tracker.add_cost(len(res)//4, 'llm_output', 'query_llm_context')
        if self.config['use_cache']: self.llm_cache[question] = (tocs, cost)
        return tocs, cost

class Evaluator:
    def __init__(self, golden_source_df, k):
        self.k = k
        self.golden_df = golden_source_df
        toc_cols = [f'TOC_{i}' for i in range(1, 6)]
        self.golden_map = {row['Question_ID']: set(row[toc_cols].astype(str).values) for _, row in self.golden_df.iterrows()}
            
    def evaluate_results(self, results_df):
        """Evaluate results from the DataFrame"""
        # Add evaluation metrics
        results_df['golden_tocs'] = results_df['question_id'].map(
            lambda q_id: list(self.golden_map.get(q_id, set()))
        )
        results_df['matches'] = results_df.apply(
            lambda row: len(set(row['retrieved_tocs']).intersection(set(row['golden_tocs']))), axis=1
        )
        results_df['precision'] = results_df['matches'] / self.k if self.k > 0 else 0
        return results_df


# --- Jupyter Notebook Execution Code ---
# Create dummy golden file if needed
create_dummy_golden_file()

# Check for OpenAI API key
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY environment variable not set. Please create a .env file or set it in your notebook.")

# Initialize components
cost_tracker = CostTracker(config)
llm = ChatOpenAI(model=config["llm_model"], temperature=0)

# Read the real parquet file with embeddings
print("Loading document dataset from parquet file...")
docs_df = pd.read_parquet("data/df_with_embeddings.parquet")
docs_df = docs_df.head(20)

# Print available columns for debugging
print(f"Available columns in parquet file: {docs_df.columns.tolist()}")
print(f"Number of documents: {len(docs_df)}")

# Check for required columns
if 'combined_text' not in docs_df.columns:
    raise ValueError("No 'combined_text' column found in the parquet file. Please check column names.")
if 'embedding' not in docs_df.columns:
    raise ValueError("No 'embedding' column found in the parquet file.")

# Keep only relevant columns to save memory and avoid duplicates
docs_df = docs_df[['combined_text', 'embedding']].copy()
docs_df = docs_df.rename(columns={'combined_text': 'Text'})

# Create TOC_Number based on index
docs_df['TOC_Number'] = [f"{i//10+1}.{i%10+1}" for i in range(len(docs_df))]

# Verify embedding format (convert to list if needed)
first_embedding = docs_df['embedding'].iloc[0]
if isinstance(first_embedding, np.ndarray):
    print("Converting embeddings from numpy arrays to lists...")
    docs_df['embedding'] = docs_df['embedding'].apply(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)

print(f"Sample text (first 200 chars): {docs_df['Text'].iloc[0][:200]}...")
print(f"Embedding dimension: {len(docs_df['embedding'].iloc[0])}")

# Load golden source
golden_df = pd.read_excel("golden_source.xlsx")

# Categorize documents
print("\nCategorizing documents...")
categorizer = DocumentCategorizer(config["categories_1"], config["categories_2"], llm, cost_tracker)
categorized_df = categorizer.categorize_dataset(docs_df)
categorized_df.to_parquet("document_dataset_categorized.parquet", index=False)
print("Categorized documents saved to parquet.")

# Initialize retriever with existing embeddings
skip_embedding_generation = 'embedding' in categorized_df.columns
retriever = DocumentRetriever(categorized_df, config, cost_tracker, skip_embedding_generation=skip_embedding_generation)

# Run retrieval strategies for all questions - SIMPLIFIED DATA STRUCTURE
all_results = []  # List of dictionaries instead of nested dict
print("\nRunning retrieval strategies for all questions...")

for _, row in tqdm(golden_df.iterrows(), total=len(golden_df), desc="Evaluating Questions"):
    q_id, question = row['Question_ID'], row['Question_Text']
    
    # Reset categorization cost tracking for this question
    cost_tracker.cost_breakdown['query_categorization'] = 0
    q_cat_cost_before = cost_tracker.cost_breakdown['query_categorization']
    
    # Run each strategy and collect results
    for strategy_code in ['A', 'B', 'C', 'D']:
        tocs, latency, cost, categories = retriever.retrieve(strategy_code, question, config["retrieval_k"], q_id)
        
        # Calculate total cost including categorization if applicable
        if strategy_code in ['B', 'C']:
            total_cost = cost + (cost_tracker.cost_breakdown['query_categorization'] - q_cat_cost_before)
        else:
            total_cost = cost
            
        # Append flat dictionary for this question-strategy combination
        all_results.append({
            'question_id': q_id,
            'question_text': question,
            'strategy': strategy_code,
            'retrieved_tocs': tocs,
            'latency': latency,
            'cost': cost,
            'categorization_cost': cost_tracker.cost_breakdown['query_categorization'] - q_cat_cost_before if strategy_code in ['B', 'C'] else 0,
            'total_cost': total_cost,
            'categories': categories  # Will be None for strategies A and D
        })
    
    # Clear categorization cache after each question
    if hasattr(retriever, '_cat_cache'): 
        retriever._cat_cache.clear()

# Convert to DataFrame with one-liner
results_df = pd.DataFrame(all_results)

# Generate evaluation report
evaluator = Evaluator(golden_df, config["retrieval_k"])
results_df = evaluator.evaluate_results(results_df)

Loading document dataset from parquet file...
Available columns in parquet file: ['Part', 'Title', 'Chapter', 'Section', 'Subsection', 'Part_Heading', 'Title_Heading', 'Chapter_Heading', 'Section_Heading', 'Subsection_Heading', 'Token_Count', 'Ends_With_Dot', 'Article_Number', 'Article_Heading', 'Text', 'Text_With_Pagebreaks', 'combined_text', 'embedding']
Number of documents: 20
Converting embeddings from numpy arrays to lists...
Sample text (first 200 chars): Article_Heading: Article 1 - Scope...
Embedding dimension: 1536

Categorizing documents...


Categorizing Documents: 100%|██████████████████████████████████████████████████████████| 20/20 [00:12<00:00,  1.59it/s]


Categorized documents saved to parquet.
Using existing embeddings from dataframe...

Running retrieval strategies for all questions...


Evaluating Questions: 100%|██████████████████████████████████████████████████████████████| 5/5 [00:09<00:00,  1.98s/it]


In [2]:
results_df

Unnamed: 0,question_id,question_text,strategy,retrieved_tocs,latency,cost,categorization_cost,total_cost,categories,golden_tocs,matches,precision
0,Q1,What are the IRB requirements for PD models?,A,"[1.8, 2.1, 1.2, 2.4, 2.6]",435.259342,2.2e-07,0.0,2.2e-07,,"[4.1, 2.3, 2.2, 1.2, 1.1]",1,0.2
1,Q1,What are the IRB requirements for PD models?,B,"[1.1, 1.2, 1.3, 1.4, 1.5]",630.033493,0.0,2e-06,1.65e-06,"[IRB, PD]","[4.1, 2.3, 2.2, 1.2, 1.1]",2,0.4
2,Q1,What are the IRB requirements for PD models?,C,"[1.8, 2.1, 1.2, 2.4, 2.6]",230.580807,2.2e-07,2e-06,1.87e-06,"[IRB, PD]","[4.1, 2.3, 2.2, 1.2, 1.1]",1,0.2
3,Q1,What are the IRB requirements for PD models?,D,"[1.6, 1.5, 1.4, 1.3, 1.2]",700.692892,0.00154335,0.0,0.00154335,,"[4.1, 2.3, 2.2, 1.2, 1.1]",1,0.2
4,Q2,How is LGD calculated under the SA approach?,A,"[2.5, 2.1, 1.10, 1.6, 2.8]",249.431372,2.2e-07,0.0,2.2e-07,,"[4.1, 3.1, 3.2, 3.3, 1.1]",0,0.0
5,Q2,How is LGD calculated under the SA approach?,B,[1.7],533.830643,0.0,2e-06,1.65e-06,"[SA, LGD]","[4.1, 3.1, 3.2, 3.3, 1.1]",0,0.0
6,Q2,How is LGD calculated under the SA approach?,C,[1.7],215.156078,2.2e-07,2e-06,1.87e-06,"[SA, LGD]","[4.1, 3.1, 3.2, 3.3, 1.1]",0,0.0
7,Q2,How is LGD calculated under the SA approach?,D,"[1.6, 1.5, 1.4, 1.3, 1.2]",737.56671,0.00154335,0.0,0.00154335,,"[4.1, 3.1, 3.2, 3.3, 1.1]",0,0.0
8,Q3,What is the difference between PD and in-defau...,A,"[1.6, 1.2, 2.5, 1.4, 1.9]",260.808706,2.6e-07,0.0,2.6e-07,,"[1.3, 3.1, 1.2, 2.1, 1.1]",1,0.2
9,Q3,What is the difference between PD and in-defau...,B,"[1.1, 1.2, 1.3, 1.4, 1.5]",986.675024,0.0,4e-06,3.75e-06,"[IRB, In-default LGD]","[1.3, 3.1, 1.2, 2.1, 1.1]",3,0.6
