In [19]:
# --- Setup and Imports ---
import os
import sys
import pandas as pd
import json
import whisper
from openai import OpenAI 
import toml
import time
from jiwer import wer # New library for WER calculation
import re

# --- Configuration ---
# Paths are relative to the notebook's working directory.
TEST_DIR = "../data/test_audio"
GROUND_TRUTH_FILE = "../data/ground_truth.csv"
OUTPUT_FILE = "final_evaluation_results.csv" 
SECRETS_FILE = "../.streamlit/secrets.toml"

# Load the DeepSeek API key from the Streamlit secrets file.
# A missing/unreadable file (OSError) or malformed TOML (toml.TomlDecodeError)
# is reported through the same fatal path as a missing key, instead of
# surfacing as an unrelated traceback.
try:
    secrets = toml.load(SECRETS_FILE)
    DEEPSEEK_API_KEY = secrets.get("DEEPSEEK_API_KEY") # .get returns None when the key is absent
    if not DEEPSEEK_API_KEY:
        raise KeyError("DEEPSEEK_API_KEY not set")
except (KeyError, OSError, toml.TomlDecodeError) as e:
    print(f"FATAL ERROR: API Key not found. Please check {e}")
    # Non-zero status so a scripted run (e.g. nbconvert) registers the failure.
    sys.exit(1)

# DeepSeek is reached through the OpenAI client by overriding base_url.
llm_client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")

# Load Whisper Model (Same as the app, using 'small' model)
print("Loading Whisper Model...")
model = whisper.load_model("small")


Loading Whisper Model...


In [25]:
# --- Helper Functions ---

def calculate_accuracy(expected_json_str, actual_json_str):
    """Score a model output against the ground truth (Metrics 2 & 3).

    Both arguments are JSON strings that should decode to a list of objects
    with the keys 'item', 'amount' and 'category'.

    Args:
        expected_json_str: Ground-truth JSON string. Escaped quotes (\\") are
            unescaped and surrounding quotes/whitespace are stripped before
            parsing.
        actual_json_str: JSON string produced by the pipeline. Surrounding
            whitespace and one layer of wrapping double quotes are stripped
            before parsing.

    Returns:
        A tuple ``(final_json_accuracy, slot_accuracy, status_msg)``:
        * final_json_accuracy -- 1.0 when every slot of every row matches,
          otherwise 0.0.
        * slot_accuracy -- fraction of matching slots (3 per row); 1.0 when
          both lists are empty.
        * status_msg -- "PASS: ..." / "FAIL: ..." description.
        Unparseable input, non-list/non-object structure, or a differing
        number of rows all yield ``(0.0, 0.0, "FAIL: ...")``.
    """
    # Cleaning happens inside the try so that a non-string input (e.g. a NaN
    # cell from the ground-truth CSV) is reported as a failure, not a crash.
    try:
        # 1. Clean the model output.
        actual_clean = actual_json_str.strip()
        if actual_clean.startswith('"') and actual_clean.endswith('"'):
            actual_clean = actual_clean.strip('"')

        # 2. Clean the ground truth.
        expected_clean = expected_json_str.strip().replace('\\"', '"').strip('\'" \n\t')

        expected = json.loads(expected_clean)
        actual = json.loads(actual_clean)
    except json.JSONDecodeError as e:
        return 0.0, 0.0, f"FAIL: Invalid JSON Output ({e})" # Returns 0.0 for both metrics
    except Exception as e:
        return 0.0, 0.0, f"FAIL: Unknown Error ({e})"

    # Valid JSON is not necessarily a list of objects (the model may emit a
    # single object, a string, null, ...). Reject those explicitly instead of
    # failing later on len()/.get().
    for parsed in (expected, actual):
        if not isinstance(parsed, list) or not all(isinstance(entry, dict) for entry in parsed):
            return 0.0, 0.0, "FAIL: Invalid JSON Output (expected a list of objects)"

    # --- Comparison Logic ---
    if len(expected) != len(actual):
        return 0.0, 0.0, "FAIL: Length Mismatch"

    # Align rows by item name. The key is lower-cased because the comparison
    # below is case-insensitive; a case-sensitive sort could pair up
    # different rows (e.g. "apple"/"Banana" vs "Apple"/"banana").
    expected_sorted = sorted(expected, key=lambda row: _normalize_label(row.get('item')))
    actual_sorted = sorted(actual, key=lambda row: _normalize_label(row.get('item')))

    correct_fields = 0
    total_fields = 0

    for exp_row, act_row in zip(expected_sorted, actual_sorted):
        total_fields += 3 # item, amount, category

        # 1. Item and category: case-insensitive string comparison.
        if _normalize_label(exp_row.get('item')) == _normalize_label(act_row.get('item')):
            correct_fields += 1
        if _normalize_label(exp_row.get('category')) == _normalize_label(act_row.get('category')):
            correct_fields += 1

        # 2. Amount: compare as floats rounded to 2 decimals (25.0 == 25.00).
        try:
            exp_amount = round(float(exp_row.get('amount', 0)), 2)
            act_amount = round(float(act_row.get('amount', 0)), 2)
        except (TypeError, ValueError):
            # Non-numeric amount (including JSON null): the slot stays
            # counted in total_fields, i.e. it is scored as incorrect.
            continue

        if exp_amount == act_amount:
            correct_fields += 1

    # Calculate Metrics
    final_json_accuracy = 1.0 if (correct_fields == total_fields) else 0.0
    slot_accuracy = correct_fields / total_fields if total_fields > 0 else 1.0

    status_msg = "PASS: Perfect Match" if final_json_accuracy == 1.0 else "FAIL: Partial Match"

    return final_json_accuracy, slot_accuracy, status_msg


def _normalize_label(value):
    """Return a lower-cased string form of a slot value; None becomes ''."""
    return '' if value is None else str(value).lower()


def extract_expenses_llm(text):
    """Send a transcript to DeepSeek and return the extracted expenses as a JSON string.

    Uses the module-level ``llm_client`` and the same prompt as app.py.

    Args:
        text: The transcript to extract expenses from.

    Returns:
        The model's reply with surrounding whitespace and any markdown code
        fences removed. The reply is NOT validated as JSON here. If the API
        call or response handling raises, a warning is printed and the string
        "[]" is returned.
    """
    categories_list = ["Food", "Transport", "Utilities", "Retail", "Entertainment", "Personal Care", "Other"]
    # NOTE(review): the second few-shot example below uses the category
    # "Bills", which is not in categories_list (closest allowed value:
    # "Utilities"). Left unchanged because this prompt is meant to mirror
    # app.py -- fix both places together.
    system_prompt = f"""
    You are an expert expense tracking assistant.
    Your task is to extract item, amount, and category from the user's input.
    You MUST choose a category from this list: {', '.join(categories_list)}.
    If the expense is clear, such as "Electronics" or "Gifts", but is not in the list, use 'Retail'.
    If an appropriate category is not found, use 'Other'.

    Before outputting the final JSON, follow these steps:
    1. Analyze the user's speech and correct any obvious transcription errors (e.g., 'black for instance' -> 'breakfast').
    2. Extract all transactions.
    3. Output the result ONLY in the specified JSON format.
    
    Here are examples (Few-Shot Learning):
    Input: "I took a taxi for 15.50 and grabbed a snack for 12.00."
    Output: [{{"item": "Taxi", "amount": 15.50, "category": "Transport"}}, {{"item": "Snack", "amount": 12.00, "category": "Food"}}]

    Input: "Paid my electricity bill, it was 88 dollars."
    Output: [{{"item": "Electricity Bill", "amount": 88.0, "category": "Bills"}}]

    Input: "I didn't spend anything today, just went home."
    Output: []

    Now, process the user's input below.
    Output ONLY the raw, valid JSON list based on the examples. DO NOT include any introductory text, concluding remarks, or markdown code blocks (like ```json).
    """

    try:
        response = llm_client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
            temperature=0.1
        )
        # Strip BEFORE checking for a fence: a reply that starts with a
        # newline or space followed by ``` would otherwise keep its fence
        # and later fail JSON parsing.
        json_str = response.choices[0].message.content.strip()
        if json_str.startswith("```"):
            json_str = re.sub(r"```(json|JSON)?", "", json_str).strip()

        return json_str
    except Exception as e:
        # Best-effort fallback is kept, but the failure is made visible:
        # a silent "[]" is indistinguishable from a real "no expenses" answer.
        print(f"WARNING: LLM extraction failed, returning empty list: {e}")
        return "[]"

In [26]:
# --- Main Evaluation Loop ---

# Load Ground Truth Data. The file is pipe-separated (sep='|') because the
# expected_json column itself contains commas; malformed lines are skipped.
gt_df = pd.read_csv(GROUND_TRUTH_FILE, sep='|', encoding='utf-8', on_bad_lines='skip') 
test_results = []
skipped_files = []  # ground-truth rows whose audio file is missing

print(f"Starting evaluation of {len(gt_df)} cases...")


for index, row in gt_df.iterrows():
    filename = row['filename']
    audio_path = os.path.join(TEST_DIR, filename)
    
    print(f"\n[{index+1}/{len(gt_df)}] Processing {filename}...")
    
    if not os.path.exists(audio_path):
        print(f"ERROR: Audio file not found at {audio_path}")
        skipped_files.append(str(filename))
        continue
    
    # perf_counter is monotonic, so the latency cannot be distorted by
    # system clock adjustments during the run.
    start_time = time.perf_counter()
    
    # --- 1. Run Pipeline ---
    pipeline_error = None
    try:
        # Transcribe (Whisper)
        transcript_result = model.transcribe(audio_path)
        transcript = transcript_result["text"].strip()
        
        # Extract (DeepSeek)
        actual_json_str = extract_expenses_llm(transcript)
    
    except Exception as e:
        pipeline_error = e
        transcript = f"TRANSCRIPTION_ERROR: {e}"
        actual_json_str = "[]"
    
    end_time = time.perf_counter()
    
    # --- 2. Calculate Metrics ---
    
    if pipeline_error is not None:
        # Do not score the placeholder "[]": for negative test cases
        # (expected "[]") a crashed pipeline would otherwise count as a
        # perfect match and inflate the averages.
        final_acc, slot_acc = 0.0, 0.0
        status_msg = f"FAIL: Pipeline Error ({pipeline_error})"
    else:
        # Calculate Final and Slot Accuracy (Metrics 2 & 3)
        final_acc, slot_acc, status_msg = calculate_accuracy(row['expected_json'], actual_json_str)

    # --- 3. Save Results ---
    test_results.append({
        "Case ID": index + 1,
        "Audio File": filename,
        "Status": status_msg,
        "Latency (s)": round(end_time - start_time, 2), 
        "Final JSON Acc": final_acc, 
        "Slot Acc": slot_acc, 
        "Transcript": transcript,
        "Actual JSON": actual_json_str, 
        "json_expected": row['expected_json'],  
        "Notes": row['evaluation_notes']
    })

# --- Final Output and Analysis Setup ---
final_df = pd.DataFrame(test_results)
final_df.to_csv(OUTPUT_FILE, index=False)

print("\n\n--- Evaluation Summary ---")
if skipped_files:
    # Skipped rows are excluded from the averages below; report them so the
    # reader knows the averages cover fewer cases than the ground truth.
    print(f"Skipped (audio file missing): {len(skipped_files)} -> {', '.join(skipped_files)}")

if final_df.empty:
    # An empty frame has no metric columns; indexing them would raise KeyError.
    print("No cases were evaluated; check TEST_DIR and the ground-truth file.")
else:
    # Global Averages
    avg_latency = final_df['Latency (s)'].mean()
    avg_final_acc = final_df['Final JSON Acc'].mean()
    avg_slot_acc = final_df['Slot Acc'].mean()

    print(f"Total Cases: {len(final_df)}")
    print(f"Average System Latency (s): {avg_latency:.2f} ")
    print(f"Average Final JSON Accuracy: {avg_final_acc:.2%} ")
    print(f"Average Slot Accuracy: {avg_slot_acc:.2%} ")


    # --- Display Final Table  ---

    # Show full cell contents (transcripts / JSON) and up to 500 rows.
    pd.set_option('display.max_colwidth', None)

    pd.set_option('display.max_rows', 500) 

    display_cols = ["Case ID", "Final JSON Acc", "Slot Acc", "Transcript", "Actual JSON", "json_expected", "Latency (s)", "Status", "Notes"]
    display(final_df[display_cols])

    # Display failures for quick analysis (Qualitative Check)
    print("\n--- Failed Cases ---")
    display(final_df[final_df['Final JSON Acc'] < 1.0][display_cols])

Starting evaluation of 20 cases...

[1/20] Processing 01.m4a...





[2/20] Processing 02.m4a...





[3/20] Processing 03.m4a...





[4/20] Processing 04.m4a...





[5/20] Processing 05.m4a...





[6/20] Processing 06.m4a...





[7/20] Processing 07.m4a...





[8/20] Processing 08.m4a...





[9/20] Processing 09.m4a...





[10/20] Processing 10.m4a...





[11/20] Processing 11.m4a...





[12/20] Processing 12.m4a...





[13/20] Processing 13.m4a...





[14/20] Processing 14.m4a...





[15/20] Processing 15.m4a...





[16/20] Processing 16.m4a...





[17/20] Processing 17.m4a...





[18/20] Processing 18.m4a...





[19/20] Processing 19.m4a...





[20/20] Processing 20.m4a...






--- Evaluation Summary ---
Total Cases: 20
Average System Latency (s): 4.13 
Average Final JSON Accuracy: 75.00% 
Average Slot Accuracy: 80.83% 


Unnamed: 0,Case ID,Final JSON Acc,Slot Acc,Transcript,Actual JSON,json_expected,Latency (s),Status,Notes
0,1,1.0,1.0,The lunch was $25.,"[{""item"": ""Lunch"", ""amount"": 25.0, ""category"": ""Food""}]","[{""item"": ""Lunch"", ""amount"": 25.00, ""category"": ""Food""}]",7.66,PASS: Perfect Match,Baseline: Simple single item.
1,2,1.0,1.0,"I grabbed two things, $5 on coffee and then I paid $50 for the weeks groceries.","[{""item"": ""Coffee"", ""amount"": 5.0, ""category"": ""Food""}, {""item"": ""Groceries"", ""amount"": 50.0, ""category"": ""Food""}]","[{""item"": ""Coffee"", ""amount"": 5.00, ""category"": ""Food""}, {""item"": ""Groceries"", ""amount"": 50.00, ""category"": ""Food""}]",3.9,PASS: Perfect Match,Clear multi-item transaction.
2,3,1.0,1.0,"Today I had breakfast, it was $5. Lunch was $12 and then I grabbed a coffee which was around $5.","[\n {""item"": ""Breakfast"", ""amount"": 5.0, ""category"": ""Food""},\n {""item"": ""Lunch"", ""amount"": 12.0, ""category"": ""Food""},\n {""item"": ""Coffee"", ""amount"": 5.0, ""category"": ""Food""}\n]","[{""item"": ""Breakfast"", ""amount"": 5.00, ""category"": ""Food""}, {""item"": ""Lunch"", ""amount"": 12.00, ""category"": ""Food""}, {""item"": ""Coffee"", ""amount"": 5.00, ""category"": ""Food""}]",4.48,PASS: Perfect Match,Clear multi-item transaction.
3,4,1.0,1.0,My train ticket cost $33.75 and my bus pass was $50.,"[{""item"": ""Train Ticket"", ""amount"": 33.75, ""category"": ""Transport""}, {""item"": ""Bus Pass"", ""amount"": 50.0, ""category"": ""Transport""}]","[{""item"": ""Train Ticket"", ""amount"": 33.75, ""category"": ""Transport""}, {""item"": ""Bus Pass"", ""amount"": 50.00, ""category"": ""Transport""}]",4.19,PASS: Perfect Match,Complex numbers and two items.
4,5,1.0,1.0,I bought a new computer mouse for 60 bucks and the keyboard for 89 dollars.,"[{""item"": ""Computer Mouse"", ""amount"": 60.0, ""category"": ""Retail""}, {""item"": ""Keyboard"", ""amount"": 89.0, ""category"": ""Retail""}]","[{""item"": ""Computer Mouse"", ""amount"": 60.00, ""category"": ""Retail""}, {""item"": ""Keyboard"", ""amount"": 89.00, ""category"": ""Retail""}]",4.38,PASS: Perfect Match,"Multiple items, testing 'bucks' unit."
5,6,1.0,1.0,"It's cold outside, I hope it doesn't snow tomorrow. I just want to finish this project.",[],[],2.73,PASS: Perfect Match,Negative test: No spending mentioned.
6,7,1.0,1.0,I paid my friend back $10 for the movie ticket and I still owe her 5 more for gas.,"[{""item"": ""Movie Ticket"", ""amount"": 10.0, ""category"": ""Entertainment""}, {""item"": ""Gas"", ""amount"": 5.0, ""category"": ""Transport""}]","[{""item"": ""Movie Ticket"", ""amount"": 10.00, ""category"": ""Entertainment""}, {""item"": ""Gas"", ""amount"": 5.00, ""category"": ""Transport""}]",3.96,PASS: Perfect Match,Testing multiple categories in one go.
7,8,0.0,0.833333,The groceries cost $103 and later I spent $9 on smoothies.,"[{""item"": ""Groceries"", ""amount"": 103.0, ""category"": ""Food""}, {""item"": ""Smoothies"", ""amount"": 9.0, ""category"": ""Food""}]","[{""item"": ""Groceries"", ""amount"": 103.00, ""category"": ""Food""}, {""item"": ""Smoothie"", ""amount"": 9.00, ""category"": ""Food""}]",4.65,FAIL: Partial Match,"Long sentence, two items, large numbers."
8,9,1.0,1.0,"I spent nothing today, absolutely zero, but I did transfer $50 to my savings account.",[],[],2.62,PASS: Perfect Match,"Negative test: Transfer, not expense."
9,10,1.0,1.0,"10 bucks for a beer and a burger, $5 for a subway and $30 for a gift.","[{""item"": ""Beer and Burger"", ""amount"": 10.0, ""category"": ""Food""}, {""item"": ""Subway"", ""amount"": 5.0, ""category"": ""Transport""}, {""item"": ""Gift"", ""amount"": 30.0, ""category"": ""Retail""}]","[{""item"": ""Beer and Burger"", ""amount"": 10.00, ""category"": ""Food""}, {""item"": ""Subway"", ""amount"": 5.00, ""category"": ""Transport""}, {""item"": ""Gift"", ""amount"": 30.00, ""category"": ""Retail""}]",4.54,PASS: Perfect Match,Mixing units (bucks/dollars).



--- Failed Cases ---


Unnamed: 0,Case ID,Final JSON Acc,Slot Acc,Transcript,Actual JSON,json_expected,Latency (s),Status,Notes
7,8,0.0,0.833333,The groceries cost $103 and later I spent $9 on smoothies.,"[{""item"": ""Groceries"", ""amount"": 103.0, ""category"": ""Food""}, {""item"": ""Smoothies"", ""amount"": 9.0, ""category"": ""Food""}]","[{""item"": ""Groceries"", ""amount"": 103.00, ""category"": ""Food""}, {""item"": ""Smoothie"", ""amount"": 9.00, ""category"": ""Food""}]",4.65,FAIL: Partial Match,"Long sentence, two items, large numbers."
13,14,0.0,0.0,What a new book and a prick of gum today at the campus store.,"[{""item"": ""Book"", ""amount"": 0.0, ""category"": ""Retail""}, {""item"": ""Pack of gum"", ""amount"": 0.0, ""category"": ""Food""}]",[],3.48,FAIL: Length Mismatch,"Negative test: No amount mentioned, should return empty."
15,16,0.0,0.0,I got a hair cut for $32 and a tip for fire.,"[{""item"": ""Hair cut"", ""amount"": 32.0, ""category"": ""Personal Care""}]","[{""item"": ""Hair Cut"", ""amount"": 32.00, ""category"": ""Personal Care""}, {""item"": ""Tip"", ""amount"": 5.00, ""category"": ""Other""}]",3.69,FAIL: Length Mismatch,OOD (whispering/gasp) test.
17,18,0.0,0.166667,My favorite was $6 and my dinner cost $20.,"[{""item"": ""Favorite"", ""amount"": 6.0, ""category"": ""Other""}, {""item"": ""Dinner"", ""amount"": 20.0, ""category"": ""Food""}]","[{""item"": ""Coffee"", ""amount"": 6.00, ""category"": ""Food""}, {""item"": ""Dinner"", ""amount"": 20.00, ""category"": ""Food""}]",3.41,FAIL: Partial Match,Noisy background test.
18,19,0.0,0.166667,I bought a pierce high phone for $70 and mine for $15.,"[{""item"": ""Pierce High Phone"", ""amount"": 70.0, ""category"": ""Retail""}, {""item"": ""Mine"", ""amount"": 15.0, ""category"": ""Retail""}]","[{""item"": ""Headphones"", ""amount"": 70.00, ""category"": ""Retail""}, {""item"": ""Lunch"", ""amount"": 15.00, ""category"": ""Food""}]",5.22,FAIL: Partial Match,Noisy background test.
