In [1]:
# Import necessary modules
import os
from dotenv import load_dotenv
import sys
from langchain.evaluation.qa import QAEvalChain
from langchain_core.messages import HumanMessage, AIMessage

In [2]:
# Import components from app_history.py
# We'll import directly from your original file to ensure we're using the same components
# Make sure app_history.py is in the same directory or in your Python path
try:
    # Try to import without modifying the file
    sys.path.append('.')  # Add current directory to path
    from app_history import (
        llm, retrieval_chain, vector_store
    )
    print("✅ Successfully imported components from app_history.py")
except ImportError:
    # If direct import fails, we'll recreate the necessary components
    print("⚠️ Could not import directly from app_history.py")
    print("Recreating necessary components...")
    
    # Load environment variables
    load_dotenv()
    MONGO_URI = os.getenv("MONGO_URI")
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
    
    if not MONGO_URI or not GOOGLE_API_KEY:
        raise ValueError("Please ensure MONGO_URI and GOOGLE_API_KEY are set in your .env file")
    
    # Recreate the components from app_history.py
    from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
    from langchain_mongodb import MongoDBAtlasVectorSearch
    from pymongo import MongoClient
    from langchain.chains import create_retrieval_chain, create_history_aware_retriever
    from langchain.chains.combine_documents import create_stuff_documents_chain
    from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
    
    # Initialize embeddings and MongoDB connection
    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/gemini-embedding-exp-03-07",
        task_type="RETRIEVAL_QUERY"
    )
    client = MongoClient(MONGO_URI)
    collection = client["bem"]["flattened_expenses_googleai"]
    
    vector_store = MongoDBAtlasVectorSearch(
        collection=collection,
        embedding=embeddings,
        index_name="receipts_vector_index"
    )
    
    # Initialize LLM
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-pro",
        google_api_key=GOOGLE_API_KEY,
        temperature=0.3
    )
    
    # Contextual retriever chain
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})
    
    retriever_prompt = ChatPromptTemplate.from_messages([
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
        ("human", "Given the conversation history, reformulate this as a standalone query about expenses:"),
    ])
    
    history_aware_retriever = create_history_aware_retriever(
        llm, retriever, retriever_prompt
    )
    
    # Answer generation chain
    system_prompt = """You are a smart expense assistant. Use these receipts and conversation history:
    
    Receipts:
    {context}
    
    Conversation History:
    {chat_history}"""
    
    qa_prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ])
    
    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
    retrieval_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
    
    print("✅ Successfully recreated components")

2025-05-08 10:55:39.055 
  command:

    streamlit run c:\Users\akash\OneDrive\Desktop\Agents\agents\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-05-08 10:55:39.058 Session state does not function when running a script without `streamlit run`


✅ Successfully imported components from app_history.py


In [3]:
print("\n--- Exploring the expense data ---")
# Sample a few receipts to understand what data we have
sample_docs = vector_store.similarity_search("show sample receipts", k=2)
print(f"Found {len(sample_docs)} sample receipts")
print("Sample receipt excerpt:")
if sample_docs:
    print(sample_docs[0].page_content[:300] + "...")


--- Exploring the expense data ---
Found 2 sample receipts
Sample receipt excerpt:
Expense at Paragon for 2500 AED on 2025-03-11 under Meals category. Description: Meal with family....


In [4]:
print("\n--- Defining test cases ---")
# These should be customized based on your actual receipt data
question_answers = [
    {
        'question': "What was my most expensive purchase last month?",
        'answer': 'The most expensive purchase last month was X for $Y.',
        'chat_history': []
    },
    {
        'question': "How much did I spend on dining?",
        'answer': 'You spent $X on dining expenses.',
        'chat_history': []
    },
    {
        'question': "Do I have any receipts from Target?",
        'answer': 'Yes/No, you have X receipts from Target.',
        'chat_history': []
    },
    {
        'question': "What about Amazon?",
        'answer': 'You have X purchases from Amazon.',
        'chat_history': [
            HumanMessage(content="Do I have any receipts from Target?"),
            AIMessage(content="Yes, you have several receipts from Target.")
        ]
    },
    {
        'question': "Which category did I spend the most on?",
        'answer': 'You spent the most on X category, totaling $Y.',
        'chat_history': []
    }
]

print(f"Created {len(question_answers)} test cases")


--- Defining test cases ---
Created 5 test cases


In [5]:
# Generate predictions using the retrieval chain
print("\n--- Generating answers ---")
predictions = []

for i, qa_pair in enumerate(question_answers):
    print(f"Processing question {i+1}: {qa_pair['question']}")
    
    # Get prediction from the retrieval chain
    response = retrieval_chain.invoke({
        "input": qa_pair["question"],
        "chat_history": qa_pair.get("chat_history", [])
    })
    
    # Extract the answer
    prediction = {"question": qa_pair["question"], "result": response["answer"]}
    predictions.append(prediction)
    
    print(f"Answer: {prediction['result'][:100]}...")


--- Generating answers ---
Processing question 1: What was my most expensive purchase last month?
Answer: Last month (April 2025) your most expensive purchase was office supplies from NASCO for 400 AED....
Processing question 2: How much did I spend on dining?
Answer: You spent a total of 2782 AED on dining....
Processing question 3: Do I have any receipts from Target?
Answer: No, based on the receipts I have, there are no expenses recorded from Target.  I have receipts from ...
Processing question 4: What about Amazon?
Answer: I don't see any receipts from Amazon in your expense history.  Would you like me to check for someth...
Processing question 5: Which category did I spend the most on?
Answer: Your top spending category is Meals, totaling 2658 AED....


In [6]:
# Create an evaluation chain using LLM
print("\n--- Evaluating answers ---")
eval_chain = QAEvalChain.from_llm(llm)

# Grade the predictions
graded_outputs = eval_chain.evaluate(
    question_answers, 
    predictions,
    question_key="question",
    prediction_key="result",
    answer_key="answer"
)


--- Evaluating answers ---


In [10]:
# Print the graded outputs safely
print("\n--- Evaluation Results ---")
for i, (qa_pair, prediction, graded_output) in enumerate(zip(question_answers, predictions, graded_outputs)):
    print(f"\nQuestion {i+1}: {qa_pair['question']}")
    print(f"Expected: {qa_pair['answer']}")
    print(f"Actual: {prediction['result'][:100]}..." if len(prediction['result']) > 100 else f"Actual: {prediction['result']}")
    
    # Try to extract evaluation text from possible keys
    eval_text = graded_output.get("text") or graded_output.get("result") or str(graded_output)
    print(f"Evaluation: {eval_text}")
    print("-" * 80)



--- Evaluation Results ---

Question 1: What was my most expensive purchase last month?
Expected: The most expensive purchase last month was X for $Y.
Actual: Last month (April 2025) your most expensive purchase was office supplies from NASCO for 400 AED.
Evaluation: {'results': 'GRADE: INCORRECT'}
--------------------------------------------------------------------------------

Question 2: How much did I spend on dining?
Expected: You spent $X on dining expenses.
Actual: You spent a total of 2782 AED on dining.
Evaluation: {'results': 'GRADE: INCORRECT'}
--------------------------------------------------------------------------------

Question 3: Do I have any receipts from Target?
Expected: Yes/No, you have X receipts from Target.
Actual: No, based on the receipts I have, there are no expenses recorded from Target.  I have receipts from ...
Evaluation: {'results': 'GRADE: INCORRECT'}
--------------------------------------------------------------------------------

Question 4: What a

In [11]:
# Calculate success rate
correct_answers = sum(
    1 for output in graded_outputs
    if "correct" in (output.get("text") or output.get("result") or str(output)).lower()
)
total_questions = len(graded_outputs)
success_rate = correct_answers / total_questions if total_questions > 0 else 0

print(f"\n--- Summary ---")
print(f"Total questions: {total_questions}")
print(f"Correct answers: {correct_answers}")
print(f"Success rate: {success_rate:.2%}")



--- Summary ---
Total questions: 5
Correct answers: 5
Success rate: 100.00%


In [12]:
feedback_prompt = f"""
You are evaluating an AI expense assistant that answered {total_questions} questions with a success rate of {success_rate:.2%}.

Here are the questions and evaluations:
{[{
  "question": qa["question"],
  "expected": qa["answer"],
  "actual": pred["result"],
  "evaluation": (grade.get("text") or grade.get("result") or str(grade))
} for qa, pred, grade in zip(question_answers, predictions, graded_outputs)]}

Based on these results, provide:
1. Overall assessment of the assistant's performance
2. Specific strengths observed
3. Common error patterns or weaknesses
4. Recommendations for improving the assistant

Be specific and actionable in your recommendations.
"""


In [13]:
feedback = llm.invoke(feedback_prompt)
print(feedback.content)

## Overall Assessment

Despite a reported 100% success rate, the AI expense assistant performed poorly.  All five evaluated questions were marked "INCORRECT" according to the provided evaluations. This discrepancy suggests a serious flaw in the automatic evaluation process.  The assistant demonstrates some understanding of the questions and provides relevant information, but it fails to consistently meet the expected response format or provide entirely accurate information.

## Specific Strengths Observed

* **Understanding User Intent:** The assistant correctly interprets the intent behind each question. It understands queries about highest expenses, specific vendor expenses, and category spending.
* **Information Retrieval:**  The assistant seems to access and process expense data, retrieving relevant information like vendor names, amounts, and categories.
* **Natural Language Generation:** The assistant generates human-readable responses that are generally well-structured and gramma