In [3]:
# Import the necessary libraries
import os
from dotenv import load_dotenv
import pandas as pd
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from langchain.evaluation.qa import QAEvalChain

In [4]:
# Load environment variables
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

In [5]:
if not MONGO_URI or not GOOGLE_API_KEY:
    raise ValueError("Please ensure MONGO_URI and GOOGLE_API_KEY are set in your .env file")

print("## Setting Up the Vector Store and LLM")
print("This mirrors the setup in app_history.py")

## Setting Up the Vector Store and LLM
This mirrors the setup in app_history.py


In [6]:
# Initialize embeddings and MongoDB connection
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-exp-03-07",
    task_type="RETRIEVAL_QUERY"
)
client = MongoClient(MONGO_URI)
collection = client["bem"]["flattened_expenses_googleai"]

vector_store = MongoDBAtlasVectorSearch(
    collection=collection,
    embedding=embeddings,
    index_name="receipts_vector_index"
)

In [9]:
# Initialize LLM
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    google_api_key=GOOGLE_API_KEY,
    temperature=0.3
)

print("\n## Creating the Retrieval Chain")
print("This recreates the same chain used in the main application")


## Creating the Retrieval Chain
This recreates the same chain used in the main application


In [10]:
# Contextual retriever chain
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

retriever_prompt = ChatPromptTemplate.from_messages([
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
    ("human", "Given the conversation history, reformulate this as a standalone query about expenses:"),
])

history_aware_retriever = create_history_aware_retriever(
    llm, retriever, retriever_prompt
)

In [11]:
# Answer generation chain
system_prompt = """You are a smart expense assistant. Use these receipts and conversation history:

Receipts:
{context}

Conversation History:
{chat_history}"""

qa_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
retrieval_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

print("\n## Sample Data for Evaluation")
print("First, let's look at what kinds of receipts we have in the database")


## Sample Data for Evaluation
First, let's look at what kinds of receipts we have in the database


In [12]:
# First, let's look at what kinds of receipts we have in the database
sample_docs = vector_store.similarity_search("show me sample receipts", k=3)
print("Sample receipt data:")
for i, doc in enumerate(sample_docs):
    print(f"\nRECEIPT {i+1}:")
    print(doc.page_content[:500] + "..." if len(doc.page_content) > 500 else doc.page_content)

Sample receipt data:

RECEIPT 1:
Expense at Paragon for 2500 AED on 2025-03-11 under Meals category. Description: Meal with family.

RECEIPT 2:
Expense at KFC for 98 AED on 2025-04-10 under Meals category. Description: Farewell party

RECEIPT 3:
Expense at magnati for 24 AED on 2025-01-29 under Meals category. Description: Client lunch meeting with XYZ.


In [13]:
# Define test cases based on what's in the receipts
# Note: You should customize these based on your actual receipt data
test_cases = [
    {
        "question": "What was my most expensive purchase last month?",
        "expected_answer": "Based on the receipts, your most expensive purchase last month was...",
        "chat_history": []
    },
    {
        "question": "How much did I spend on dining in total?",
        "expected_answer": "According to the receipts, you spent a total of...",
        "chat_history": []
    },
    {
        "question": "Did I make any purchases at Target?",
        "expected_answer": "Yes, you made purchases at Target...",
        "chat_history": []
    },
    # Conversation with history
    {
        "question": "What about at Amazon?",
        "expected_answer": "Based on the receipts, you...",
        "chat_history": [HumanMessage(content="Did I make any purchases at Target?"), 
                       AIMessage(content="Yes, you made purchases at Target...")]
    },
    {
        "question": "Which category did I spend the most on?",
        "expected_answer": "Based on the receipts, you spent the most on...",
        "chat_history": []
    }
]

print("\n## Evaluation Function")
print("Creating a function to evaluate the AI's responses")


## Evaluation Function
Creating a function to evaluate the AI's responses


In [14]:
def evaluate_response(response, expected_answer):
    """Evaluate a response against an expected answer using Google's Gemini model"""
    eval_prompt = f"""
    You are evaluating the response of an AI expense assistant against an expected answer pattern.
    Please rate the response on a scale of 1-5 on these criteria:
    
    1. Relevance: Does the response directly address the question?
    2. Accuracy: Is the information provided factually correct based on the expected answer pattern?
    3. Completeness: Does the response provide a complete answer to the question?
    4. Clarity: Is the response clear and easy to understand?
    
    Expected answer pattern: {expected_answer}
    Actual response: {response}
    
    For each criterion, provide a score (1-5) and a brief explanation.
    Then give an overall score (1-5) and summary assessment.
    
    Format your response as:
    Relevance: [score] - [explanation]
    Accuracy: [score] - [explanation]
    Completeness: [score] - [explanation]
    Clarity: [score] - [explanation]
    Overall: [score] - [summary]
    """
    
    evaluation = llm.invoke(eval_prompt)
    return evaluation.content

print("\n## Run Evaluation")
print("Running the evaluation on our test cases")

results = []

for i, test_case in enumerate(test_cases):
    print(f"\n--- Evaluating Test Case {i+1} ---")
    print(f"Question: {test_case['question']}")
    
    # Generate response using the retrieval chain
    response = retrieval_chain.invoke({
        "input": test_case["question"],
        "chat_history": test_case["chat_history"]
    })
    
    actual_answer = response["answer"]
    print(f"\nAI Response: {actual_answer}")
    
    # Extract context documents
    context_docs = response["context"]
    context_summary = f"Retrieved {len(context_docs)} documents"
    
    # Evaluate the response
    evaluation = evaluate_response(actual_answer, test_case["expected_answer"])
    print(f"\nEvaluation: \n{evaluation}")
    
    # Store results
    results.append({
        "question": test_case["question"],
        "expected_answer": test_case["expected_answer"],
        "actual_answer": actual_answer,
        "evaluation": evaluation,
        "context_summary": context_summary
    })

print("\n## Results Summary")

# Extract overall scores from evaluations
scores = []
for result in results:
    eval_lines = result["evaluation"].split('\n')
    overall_line = [line for line in eval_lines if line.startswith("Overall:")]
    if overall_line:
        score_text = overall_line[0].split('-')[0].replace("Overall:", "").strip()
        try:
            score = float(score_text)
            scores.append(score)
        except ValueError:
            print(f"Could not parse score from: {overall_line[0]}")
            scores.append(None)
    else:
        scores.append(None)


## Run Evaluation
Running the evaluation on our test cases

--- Evaluating Test Case 1 ---
Question: What was my most expensive purchase last month?

AI Response: Last month (April 2025) your most expensive purchase was office supplies from NASCO for 400 AED.

Evaluation: 
Relevance: 5 - The response directly addresses the question of the most expensive purchase last month.

Accuracy: 5 - Assuming the provided information about the NASCO purchase is correct, the response accurately identifies the most expensive purchase.

Completeness: 5 - The response provides the month, vendor, item purchased, and cost, which constitutes a complete answer to the question.

Clarity: 5 - The response is clear, concise, and easy to understand.  It uses simple language and provides all the necessary information in a straightforward manner.

Overall: 5 - The AI assistant's response is excellent. It accurately and completely answers the question in a clear and concise way, perfectly matching the expected 

In [15]:
# Create a summary DataFrame
summary_df = pd.DataFrame({
    "Question": [result["question"] for result in results],
    "Overall Score": scores,
})

In [16]:
# Display summary statistics
print("\n--- Evaluation Summary ---")
print(f"Average Score: {sum(filter(None, scores))/len(list(filter(None, scores)))}")
print(f"Best Score: {max(filter(None, scores))}")
print(f"Worst Score: {min(filter(None, scores))}")
print("\nScores by Question:")
print(summary_df)

print("\n## Detailed Analysis")
print("Analyzing the performance for each evaluation criterion")


--- Evaluation Summary ---
Average Score: 4.0
Best Score: 5.0
Worst Score: 3.0

Scores by Question:
                                          Question  Overall Score
0  What was my most expensive purchase last month?            5.0
1         How much did I spend on dining in total?            4.0
2              Did I make any purchases at Target?            3.0
3                            What about at Amazon?            3.0
4          Which category did I spend the most on?            5.0

## Detailed Analysis
Analyzing the performance for each evaluation criterion


In [17]:
# Extract all criteria scores
criteria_scores = {
    "Relevance": [],
    "Accuracy": [],
    "Completeness": [],
    "Clarity": []
}

for result in results:
    eval_lines = result["evaluation"].split('\n')
    
    for criterion in criteria_scores.keys():
        criterion_line = [line for line in eval_lines if line.startswith(f"{criterion}:")]
        if criterion_line:
            score_text = criterion_line[0].split('-')[0].replace(f"{criterion}:", "").strip()
            try:
                score = float(score_text)
                criteria_scores[criterion].append(score)
            except ValueError:
                print(f"Could not parse {criterion} score from: {criterion_line[0]}")
                criteria_scores[criterion].append(None)
        else:
            criteria_scores[criterion].append(None)

In [18]:
# Create a summary of criterion averages
criterion_averages = {}
for criterion, scores in criteria_scores.items():
    filtered_scores = list(filter(None, scores))
    if filtered_scores:
        criterion_averages[criterion] = sum(filtered_scores) / len(filtered_scores)
    else:
        criterion_averages[criterion] = None

print("\n--- Performance by Criterion ---")
for criterion, avg in criterion_averages.items():
    print(f"{criterion}: {avg:.2f}")

print("\n## Recommendations for Improvement")
print("Based on the evaluation results, here are some recommendations for improving the AI Expense Assistant:")

# Find the lowest-scoring criterion
lowest_criterion = min(criterion_averages.items(), key=lambda x: x[1] if x[1] is not None else float('inf'))

# Find the lowest-scoring question
lowest_question_idx = scores.index(min(filter(None, scores)))
lowest_question = results[lowest_question_idx]["question"]
lowest_answer = results[lowest_question_idx]["actual_answer"]

print(f"\n--- Areas for Improvement ---")
print(f"Lowest scoring criterion: {lowest_criterion[0]} (avg: {lowest_criterion[1]:.2f})")
print(f"\nLowest scoring question: '{lowest_question}'")
print(f"Response: '{lowest_answer}'")

improvement_prompt = f"""
Based on the evaluation results, please provide specific recommendations for improving the AI Expense Assistant. Focus on:

1. The lowest scoring criterion: {lowest_criterion[0]} (avg: {lowest_criterion[1]:.2f})
2. The lowest scoring question: '{lowest_question}' 
3. Ways to improve the response: '{lowest_answer}'
4. General recommendations for the system as a whole

Format your response as specific, actionable recommendations.
"""

recommendations = llm.invoke(improvement_prompt)
print(f"\n--- Recommendations ---\n{recommendations.content}")

print("\n## Context Analysis")
print("Looking at how well the system is retrieving relevant context")

# Run a detailed analysis of retrieved context for the first test case
test_response = retrieval_chain.invoke({
    "input": test_cases[0]["question"],
    "chat_history": test_cases[0]["chat_history"]
})

print(f"Question: {test_cases[0]['question']}")
print(f"\nRetrieved {len(test_response['context'])} documents")

context_analysis_prompt = f"""
Analyze how relevant the retrieved context is to the question: '{test_cases[0]['question']}'

Context documents retrieved:
{[doc.page_content[:200] + '...' for doc in test_response['context']]}

For each document snippet, rate its relevance to the question on a scale of 1-5 (5 being most relevant).
Then provide an overall assessment of the retrieval quality and specific suggestions for improvement.
"""

context_analysis = llm.invoke(context_analysis_prompt)
print(f"\nContext Relevance Analysis:\n{context_analysis.content}")

print("\n## Conclusion")
print("This evaluation has assessed the AI Expense Assistant on multiple criteria across various test cases.")
print("Based on the results, we've identified strengths, weaknesses, and areas for improvement in the system.")


--- Performance by Criterion ---
Relevance: 4.20
Accuracy: 4.00
Completeness: 4.00
Clarity: 5.00

## Recommendations for Improvement
Based on the evaluation results, here are some recommendations for improving the AI Expense Assistant:

--- Areas for Improvement ---
Lowest scoring criterion: Accuracy (avg: 4.00)

Lowest scoring question: 'What was my most expensive purchase last month?'
Response: 'Last month (April 2025) your most expensive purchase was office supplies from NASCO for 400 AED.'

--- Recommendations ---
## Recommendations for Improving the AI Expense Assistant

**1. Improving Accuracy (Avg: 4.00)**

* **Data Integration and Validation:** Implement robust integration with all relevant expense data sources (bank accounts, credit cards, expense reports).  Validate data integrity regularly to ensure the system operates on accurate and up-to-date information.  Consider automated data cleaning processes to handle inconsistencies or missing values.
* **Transaction Categorizati