In [None]:
# --- Setup and Imports ---
import os
import sys
import pandas as pd
import json
import whisper
from openai import OpenAI 
import toml
import time
from tqdm import tqdm 
import re

# --- Configuration ---
# Paths are relative to the directory the notebook is run from.
TEST_AUDIO_DIR = "../data/test_audio"
GROUND_TRUTH_FILE = "../data/ground_truth.csv" # For Audio
SYNTHETIC_DATA_FILE = "../data/synthetic_test_data.csv" # For Text
OUTPUT_FILE = "final_evaluation_results.csv" 

# Load Secrets
# The API key is read from the Streamlit secrets file. A missing file, a file
# that is not valid TOML, and a missing/empty key are all fatal: nothing below
# can run without the key, so stop with a non-zero exit status.
try:
    secrets = toml.load("../.streamlit/secrets.toml")
    DEEPSEEK_API_KEY = secrets.get("DEEPSEEK_API_KEY")
    if not DEEPSEEK_API_KEY:
        raise KeyError("DEEPSEEK_API_KEY not set")
except (OSError, toml.TomlDecodeError, KeyError) as e:
    print(f"FATAL ERROR: API Key not found. Please check {e}")
    sys.exit(1)

# DeepSeek is reached through the OpenAI client by overriding the base URL.
llm_client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")

# Load Whisper Model 
print("Loading Whisper Model...")
model = whisper.load_model("small")


Loading Whisper Model...


In [None]:
# --- Helper Functions ---
# AI generated: google gemini 2
def _normalize_text(value):
    """Return *value* as a trimmed, lower-cased string (``None`` becomes ``""``)."""
    if value is None:
        return ""
    return str(value).strip().lower()


def calculate_accuracy(expected_json_str, actual_json_str):
    """Score an extracted expense list against the expected one.

    Both arguments are JSON strings that should decode to a list of objects
    with ``item``, ``amount`` and ``category`` keys. Entries are paired by
    sorting both lists on the lower-cased ``item`` and zipping them, so an
    item-name difference can also shift which entries get compared.

    Args:
        expected_json_str: Ground-truth JSON. Surrounding quotes and
            backslash-escaped quotes are cleaned before parsing.
        actual_json_str: JSON produced by the pipeline. One layer of
            surrounding double quotes is removed before parsing.

    Returns:
        A tuple ``(final_json_accuracy, slot_accuracy, category_accuracy,
        status_msg)``:

        1. Final JSON Accuracy: 1.0 only when every compared field matches.
        2. Slot Accuracy: matched fields / (3 fields per entry).
        3. Category Accuracy: matched categories / number of entries.
        4. Status Message: ``"PASS"``, ``"FAIL"``, ``"FAIL: Invalid JSON"``
           or ``"FAIL: Length Mismatch"``.

        Two empty lists score ``(1.0, 1.0, 1.0, "PASS")``. Input that is not
        a string, not valid JSON, not a list, or not a list of objects scores
        ``(0.0, 0.0, 0.0, "FAIL: Invalid JSON")`` instead of raising.
    """
    invalid = (0.0, 0.0, 0.0, "FAIL: Invalid JSON")

    # Empty CSV cells arrive from pandas as float NaN, not as strings.
    if not isinstance(expected_json_str, str) or not isinstance(actual_json_str, str):
        return invalid

    # --- 1. Clean Parsing ---
    actual_json_str = actual_json_str.strip()
    if actual_json_str.startswith('"') and actual_json_str.endswith('"'):
        actual_json_str = actual_json_str.strip('"')

    cleaned_expected_str = expected_json_str.strip().replace('\\"', '"').strip('\'" \n\t')

    try:
        expected = json.loads(cleaned_expected_str)
        actual = json.loads(actual_json_str)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return invalid

    if not isinstance(expected, list) or not isinstance(actual, list):
        return invalid

    if len(expected) != len(actual):
        return 0.0, 0.0, 0.0, "FAIL: Length Mismatch"

    if not all(isinstance(entry, dict) for entry in expected + actual):
        return invalid

    # --- 2. Sorting ---
    # `or ''` keeps a null item sortable; for ordinary strings the key is
    # the same lower-cased (unstripped) item name as before.
    def item_key(entry):
        return str(entry.get('item') or '').lower()

    expected_sorted = sorted(expected, key=item_key)
    actual_sorted = sorted(actual, key=item_key)

    correct_fields = 0
    total_fields = 0
    correct_categories = 0
    total_categories = 0

    for e, a in zip(expected_sorted, actual_sorted):
        total_fields += 3
        total_categories += 1

        if _normalize_text(e.get('item', '')) == _normalize_text(a.get('item', '')):
            correct_fields += 1

        # Category Check
        if _normalize_text(e.get('category', '')) == _normalize_text(a.get('category', '')):
            correct_fields += 1
            correct_categories += 1

        # Amounts are compared at cent precision; an amount that cannot be
        # read as a number simply counts as a mismatch.
        try:
            e_amt = round(float(e.get('amount', 0)), 2)
            a_amt = round(float(a.get('amount', 0)), 2)
        except (TypeError, ValueError):
            continue
        if e_amt == a_amt:
            correct_fields += 1

    # Metrics
    final_json_accuracy = 1.0 if (correct_fields == total_fields) else 0.0
    slot_accuracy = correct_fields / total_fields if total_fields > 0 else 1.0
    category_accuracy = correct_categories / total_categories if total_categories > 0 else 1.0

    status_msg = "PASS" if final_json_accuracy == 1.0 else "FAIL"

    return final_json_accuracy, slot_accuracy, category_accuracy, status_msg


def extract_expenses_llm(text):
    """Send *text* to DeepSeek and return the extracted expenses as a JSON string.

    The reply is stripped of surrounding whitespace and Markdown code fences,
    parsed, and re-serialised so the caller always gets compact, valid JSON.

    Args:
        text: The user's utterance (typed text or a speech transcript).

    Returns:
        A JSON string of the model's parsed reply. On any failure (API
        error, empty reply, unparseable reply) the function returns ``"[]"``
        and writes a warning to stderr, so a failed call can be told apart
        from a genuine "no expenses" answer when reading the run log.
    """
    
    # Hardcoded list for evaluation consistency
    categories_list = ["Food", "Transport", "Utilities", "Retail", "Entertainment", "Personal Care", "Other"]
    
    system_prompt = f"""
    You are an expert expense tracking assistant.
    Your task is to extract item, amount, and category from the user's input.

    # --- 1. CRITICAL GUARDRAILS (Safety & Scope) ---
    DO NOT respond to any queries that are off-topic, political, personal advice, harmful, violent, explicit, or non-expense related. 
    If the input is toxic or not about expenses (e.g., "how to make a bomb", "I hate people"), you MUST output an empty JSON list: [].

    # --- 2. CATEGORIZATION RULES ---
    You MUST choose a category from this list: {', '.join(categories_list)}.
    
    Specific Rules:
    - Groceries, Supermarket runs, Snacks, and Dining out should be classified as 'Food'.
    - Clothing, Electronics, Household items, and Gifts should be classified as 'Retail'.
    - If the expense is clear but not in the list, use 'Retail'.
    - If an appropriate category is not found, use 'Other'.

    # --- 3. EXTRACTION STEPS ---
    1. Analyze the user's speech and correct any obvious transcription errors (e.g., 'black for instance' -> 'breakfast', 'three fiddy' -> 3.50).
    2. Extract all transactions.
    3. Output the result ONLY in the specified JSON format.
    
    # --- 4. FEW-SHOT EXAMPLES ---
    Input: "I took a taxi for 15.50 and grabbed a snack for 12.00."
    Output: [{{"item": "Taxi", "amount": 15.50, "category": "Transport"}}, {{"item": "Snack", "amount": 12.00, "category": "Food"}}]

    Input: "Paid my electricity bill, it was 88 dollars."
    Output: [{{"item": "Electricity Bill", "amount": 88.0, "category": "Utilities"}}]

    Input: "I bought some groceries for 50 bucks and a new shirt for 30."
    Output: [{{"item": "Groceries", "amount": 50.00, "category": "Food"}}, {{"item": "Shirt", "amount": 30.00, "category": "Retail"}}]

    Input: "I didn't spend anything today, just went home."
    Output: []

    Now, process the user's input below.
    Output ONLY the raw, valid JSON list based on the examples. DO NOT include any introductory text.
    """

    try:
        response = llm_client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
            temperature=0.1
        )
        # The content may be None, and a leading newline or space would hide
        # the code fence from startswith(), so normalise before checking.
        json_str = (response.choices[0].message.content or "").strip()

        if json_str.startswith("```"):
            json_str = re.sub(r"```(json|JSON)?", "", json_str).strip()

        # Round-trip through the parser so only valid JSON is ever returned.
        parsed_json = json.loads(json_str)
        return json.dumps(parsed_json, ensure_ascii=False)

    except Exception as e:
        # Best-effort by design: the evaluation loops must keep going, but
        # the failure is reported rather than silently scored as an empty list.
        print(
            f"WARNING: expense extraction failed ({type(e).__name__}: {e}); returning []",
            file=sys.stderr,
        )
        return "[]"

In [None]:

# PHASE I: LOGIC AT SCALE
# Only the DeepSeek extraction step is exercised here, on synthetic text
# inputs; Whisper is not involved.

banner = "=" * 50
print("\n" + banner)
print("  PHASE I: LOGIC AT SCALE (Text Only)")
print(banner)

# Load at most the first 1000 synthetic cases; a missing file disables the phase.
try:
    synthetic_df = pd.read_csv(SYNTHETIC_DATA_FILE).head(1000)
    print(f"Loaded {len(synthetic_df)} synthetic text cases.")
except FileNotFoundError:
    print("WARNING: Synthetic data file not found. Skipping Phase 1.")
    synthetic_df = pd.DataFrame()

# AI generated: google gemini 2
PHASE_1_OUTPUT_FILE = "phase1_results.csv"  # per-case results are written here
group_b_results = []

if synthetic_df.empty:
    print("Skipping Phase I analysis.")
else:
    # tqdm wraps the row iterator to show a progress bar.
    progress = tqdm(synthetic_df.iterrows(), total=len(synthetic_df), desc="Testing DeepSeek Logic")
    for index, row in progress:
        text_input, expected_json = row['speech_input'], row['expected_json']

        # Time only the LLM call (no Whisper in this phase).
        start_time = time.time()
        actual_json_str = extract_expenses_llm(text_input)
        latency = time.time() - start_time

        # Score the reply against the expected JSON.
        final_acc, slot_acc, cat_acc, status = calculate_accuracy(expected_json, actual_json_str)

        # Keep one detailed record per case.
        record = {
            "Case ID": index + 1,
            "Latency (s)": round(latency, 2),
            "Final JSON Acc": final_acc,
            "Slot Acc": slot_acc,
            "Category Acc": cat_acc,
            "Status": status,
            "Input Text": text_input,
            "Actual JSON": actual_json_str,
            "Expected JSON": expected_json,
        }
        group_b_results.append(record)

    # Persist the per-case table, then print the aggregate metrics.
    df_b = pd.DataFrame(group_b_results)
    df_b.to_csv(PHASE_1_OUTPUT_FILE, index=False)
    print(f"\n Phase I results saved to {PHASE_1_OUTPUT_FILE}")

    mean_latency = df_b['Latency (s)'].mean()
    mean_slot_acc = df_b['Slot Acc'].mean()
    mean_category_acc = df_b['Category Acc'].mean()
    mean_final_acc = df_b['Final JSON Acc'].mean()

    print("\n--- Phase I Summary (DeepSeek Logic) ---")
    print(f"Total Synthetic Cases: {len(df_b)}")
    print(f"Average Latency (s): {mean_latency:.2f}")
    print(f"Average Slot Accuracy: {mean_slot_acc:.2%}")
    print(f"Average Category Accuracy: {mean_category_acc:.2%}")
    print(f"Average Final JSON Accuracy: {mean_final_acc:.2%}")



  PHASE I: LOGIC AT SCALE (Text Only)
Loaded 1000 synthetic text cases.


Testing DeepSeek Logic: 100%|██████████| 1000/1000 [49:13<00:00,  2.95s/it]


 Phase I results saved to phase1_results.csv

--- Phase I Summary (DeepSeek Logic) ---
Total Synthetic Cases: 1000
Average Latency (s): 2.95
Average Slot Accuracy: 80.58%
Average Category Accuracy: 88.94%
Average Final JSON Accuracy: 60.40%





In [97]:
# Phase I Analysis
import pandas as pd
import json

# Load the per-case Phase I results from CSV.
df = pd.read_csv("phase1_results.csv")

# A case is a failure when its exact-match score is below 1.0; copy the
# selection so a column can be added to it later without touching `df`.
is_failure = df['Final JSON Acc'].lt(1.0)
failures = df.loc[is_failure].copy()

print(f"Total Failures: {len(failures)}")

# define type of failure
def _rounded_amounts(entries):
    """Return the sorted cent-rounded amounts of *entries*, or None if any is unreadable."""
    try:
        return sorted(round(float(entry.get('amount', 0)), 2) for entry in entries)
    except (AttributeError, TypeError, ValueError):
        return None


def classify_failure(row):
    """Assign a coarse error label to one failed evaluation row.

    Args:
        row: A mapping (e.g. a DataFrame row) with the keys ``'Actual JSON'``,
            ``'Expected JSON'``, ``'Category Acc'`` and ``'Final JSON Acc'``.

    Returns:
        One of:

        - ``"Invalid JSON"``: either JSON cell is not a parseable list.
        - ``"Length Mismatch (Hallucination/Omission)"``: different entry counts.
        - ``"Amount Mismatch"``: categories are right but the two lists do
          not contain the same amounts (compared as sorted, cent-rounded
          values, independent of how entries are paired).
        - ``"Item String Mismatch (Logic Correct)"``: categories and amounts
          are right, so only the item wording differs.
        - ``"Category Classification Error"``: at least one wrong category.
        - ``"Other Format Error"``: none of the above applied.
    """
    try:
        actual = json.loads(row['Actual JSON'])
        expected = json.loads(row['Expected JSON'])
    except (TypeError, ValueError):  # NaN cell, or text that is not JSON
        return "Invalid JSON"

    if not isinstance(actual, list) or not isinstance(expected, list):
        return "Invalid JSON"

    # 1. length mismatch
    if len(actual) != len(expected):
        return "Length Mismatch (Hallucination/Omission)"

    # 2. Category is correct but the case still failed: separate real amount
    #    errors from pure item-name differences instead of assuming the latter.
    if row['Category Acc'] == 1.0 and row['Final JSON Acc'] < 1.0:
        actual_amounts = _rounded_amounts(actual)
        expected_amounts = _rounded_amounts(expected)
        if actual_amounts is None or expected_amounts is None or actual_amounts != expected_amounts:
            return "Amount Mismatch"
        return "Item String Mismatch (Logic Correct)"

    # 3. wrong category
    if row['Category Acc'] < 1.0:
        return "Category Classification Error"

    return "Other Format Error"

# Label each failed case, then report how often each label occurs.
error_labels = failures.apply(classify_failure, axis=1)
failures['Error Type'] = error_labels

error_distribution = failures['Error Type'].value_counts()
print("\n--- Failure Distribution ---")
print(error_distribution)

# The same counts expressed as a share of all failures, in percent.
error_percentages = error_distribution / len(failures) * 100
print("\n--- Error Percentage ---")
print(error_percentages)

Total Failures: 396

--- Failure Distribution ---
Error Type
Item String Mismatch (Logic Correct)        265
Category Classification Error               112
Length Mismatch (Hallucination/Omission)     19
Name: count, dtype: int64

--- Error Percentage ---
Error Type
Item String Mismatch (Logic Correct)        66.919192
Category Classification Error               28.282828
Length Mismatch (Hallucination/Omission)     4.797980
Name: count, dtype: float64


In [None]:
# PHASE II: ROBUSTNESS ANALYSIS
# The full pipeline (Whisper transcription, then DeepSeek extraction) is run
# on each audio file listed in the ground-truth table and scored.

def _transcribe_and_extract(audio_path):
    """Run Whisper and then the LLM extractor on one audio file.

    Returns a ``(transcript, actual_json_str)`` pair. If anything raises,
    the transcript becomes ``"TRANSCRIPTION_ERROR: ..."`` and the JSON is ``"[]"``.
    """
    try:
        transcript_result = model.transcribe(
            audio_path,
            initial_prompt="The following text is about financial expenses, items, and dollar amounts."
        )
        spoken_text = transcript_result["text"].strip()
        return spoken_text, extract_expenses_llm(spoken_text)
    except Exception as e:
        return f"TRANSCRIPTION_ERROR: {e}", "[]"


divider = "=" * 50
print("\n" + divider)
print("  PHASE II: ROBUSTNESS (Test Full Pipeline on Manual Audio)")
print(divider)

# The ground-truth table is pipe-separated; rows pandas cannot parse are skipped.
gt_df = pd.read_csv(GROUND_TRUTH_FILE, sep='|', encoding='utf-8', on_bad_lines='skip')
test_results_a = []

print(f"Starting evaluation of {len(gt_df)} manual audio cases...")

for index, row in gt_df.iterrows():
    filename = row['filename']
    audio_path = os.path.join(TEST_AUDIO_DIR, filename)

    print(f"Processing {filename}...")

    if os.path.exists(audio_path):
        # Latency covers transcription and extraction together.
        start_time = time.time()
        transcript, actual_json_str = _transcribe_and_extract(audio_path)
        elapsed = time.time() - start_time

        final_acc, slot_acc, cat_acc, status_msg = calculate_accuracy(row['expected_json'], actual_json_str)

        case_record = {
            "Case ID": index + 1,
            "Type": row.get('type', 'Normal'),
            "Status": status_msg,
            "Audio File": filename,
            "Latency (s)": round(elapsed, 2),
            "Final JSON Acc": final_acc,
            "Slot Acc": slot_acc,
            "Category Acc": cat_acc,
            "Transcript": transcript,
            "Actual JSON": actual_json_str,
            "Expected JSON": row['expected_json'],
            "Notes": row['evaluation_notes'],
        }
        test_results_a.append(case_record)
    else:
        # Cases whose audio file is missing are reported and left out of the results.
        print(f"ERROR: Audio file not found at {audio_path}")

# AI generated: google gemini 2
# --- Final Analysis for Phase II ---
# Build the per-case table, save it, then print the summary and detail views.
final_df_a = pd.DataFrame(test_results_a)
final_df_a.to_csv(OUTPUT_FILE, index=False)

if final_df_a.empty:
    # With no evaluated cases the table has no columns at all, so every
    # lookup below would raise KeyError; report and stop instead.
    print("No audio cases were evaluated. Skipping Phase II analysis.")
else:
    # Force numeric types
    metric_cols = ['Final JSON Acc', 'Slot Acc', 'Category Acc', 'Latency (s)']
    for metric_col in metric_cols:
        final_df_a[metric_col] = pd.to_numeric(final_df_a[metric_col])

    print("\n--- Phase II Summary (Full Pipeline) ---")
    print(f"Total Audio Cases: {len(final_df_a)}")
    print(f"Average System Latency (s): {final_df_a['Latency (s)'].mean():.2f} ")
    print(f"Average Final JSON Accuracy: {final_df_a['Final JSON Acc'].mean():.2%} ")
    print(f"Average Slot Accuracy: {final_df_a['Slot Acc'].mean():.2%} ")
    print(f"Average Category Accuracy: {final_df_a['Category Acc'].mean():.2%} ")

    # --- Table 1: Performance by Category (Summary) ---
    # Mean of each metric (latency included) per case type, plus the case count.
    type_summary = final_df_a.groupby('Type')[['Slot Acc', 'Category Acc', 'Final JSON Acc', 'Latency (s)']].mean()
    type_summary['Count'] = final_df_a['Type'].value_counts()

    print("\n--- Table 1: Performance by Category (Summary) ---")
    display(type_summary)

    # --- Table 2: All Cases Detailed View ---
    print("\n--- Table 2: All Cases Detailed View (Sorted by Type) ---")
    # Show full cell contents so transcripts and JSON are not truncated.
    pd.set_option('display.max_colwidth', None)

    detail_cols = ["Case ID", "Type", "Latency (s)", "Final JSON Acc", "Slot Acc", "Category Acc", "Transcript", "Actual JSON", "Expected JSON", "Status"]

    sorted_df = final_df_a.sort_values(by=['Type', 'Case ID'])
    display(sorted_df[detail_cols])

    # --- Table 3: Failure Analysis ---
    print("\n--- Table 3: Detailed Failure Analysis (Failures Only) ---")

    failures_df = final_df_a[final_df_a['Final JSON Acc'] < 1.0]

    print(f"DEBUG: Found {len(failures_df)} failures out of {len(final_df_a)} total cases.")

    if not failures_df.empty:
        sorted_failures = failures_df.sort_values(by=['Type', 'Case ID'])
        display(sorted_failures[detail_cols])
    else:
        print(" No failures found.")



  PHASE II: ROBUSTNESS (Test Full Pipeline on Manual Audio)
Starting evaluation of 50 manual audio cases...
Processing 01.m4a...




Processing 02.m4a...




Processing 03.m4a...




Processing 04.m4a...




Processing 05.m4a...




Processing 06.m4a...




Processing 07.m4a...




Processing 08.m4a...




Processing 09.m4a...




Processing 10.m4a...




Processing 11.m4a...




Processing 12.m4a...




Processing 13.m4a...




Processing 14.m4a...




Processing 15.m4a...




Processing 16.m4a...




Processing 17.m4a...




Processing 18.m4a...




Processing 19.m4a...




Processing 20.m4a...




Processing 21.m4a...




Processing 22.m4a...




Processing 23.m4a...




Processing 24.m4a...




Processing 25.m4a...




Processing 26.m4a...




Processing 27.m4a...




Processing 28.m4a...




Processing 29.m4a...




Processing 30.m4a...




Processing 31.m4a...




Processing 32.m4a...




Processing 33.m4a...




Processing 34.m4a...




Processing 35.m4a...




Processing 36.m4a...




Processing 37.m4a...




Processing 38.m4a...




Processing 39.m4a...




Processing 40.m4a...




Processing 41.m4a...




Processing 42.m4a...




Processing 43.m4a...




Processing 44.m4a...




Processing 45.m4a...




Processing 46.m4a...




Processing 47.m4a...




Processing 48.m4a...




Processing 49.m4a...




Processing 50.m4a...





--- Phase II Summary (Full Pipeline) ---
Total Audio Cases: 50
Average System Latency (s): 3.83 
Average Final JSON Accuracy: 58.00% 
Average Slot Accuracy: 80.44% 
Average Category Accuracy: 76.33% 

--- Table 1: Performance by Category (Summary) ---


Unnamed: 0_level_0,Slot Acc,Category Acc,Final JSON Acc,Latency (s),Count
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Edge_Accent,0.666667,1.0,0.0,3.31,2
Edge_Category,0.833333,1.0,0.5,3.24,2
Edge_Logic,0.861111,0.645833,0.5,4.7775,8
Edge_Negative,1.0,1.0,1.0,3.166667,3
Edge_Noise,0.472222,0.5,0.166667,4.053333,6
Edge_Unit,0.866667,0.8,0.6,3.966,5
Guardrail_Toxic,1.0,1.0,1.0,2.855,4
Normal,0.808333,0.75,0.65,3.7615,20



--- Table 2: All Cases Detailed View (Sorted by Type) ---


Unnamed: 0,Case ID,Type,Latency (s),Final JSON Acc,Slot Acc,Category Acc,Transcript,Actual JSON,Expected JSON,Status
33,34,Edge_Accent,3.45,0.0,0.666667,1.0,Grab a coffee cost me about 350.,"[{""item"": ""Coffee"", ""amount"": 350.0, ""category"": ""Food""}]","[{""item"": ""Coffee"", ""amount"": 3.50, ""category"": ""Food""}]",FAIL
36,37,Edge_Accent,3.17,0.0,0.666667,1.0,We had sushi for dinner. It was $40.,"[{""item"": ""Sushi dinner"", ""amount"": 40.0, ""category"": ""Food""}]","[{""item"": ""Sushi"", ""amount"": 40.00, ""category"": ""Food""}]",FAIL
32,33,Edge_Category,3.27,0.0,0.666667,1.0,I made a donation of $50 to charity.,"[{""item"": ""Donation to charity"", ""amount"": 50.0, ""category"": ""Other""}]","[{""item"": ""Donation"", ""amount"": 50.00, ""category"": ""Other""}]",FAIL
44,45,Edge_Category,3.21,1.0,1.0,1.0,Pay the online course fee $99.,"[{""item"": ""Online Course Fee"", ""amount"": 99.0, ""category"": ""Other""}]","[{""item"": ""Online Course Fee"", ""amount"": 99.00, ""category"": ""Other""}]",PASS
10,11,Edge_Logic,4.32,0.0,0.5,0.0,"The total was 18. Wait, let me check that receipt. Make that $20 and another 20 for the tip.","[{""item"": ""Receipt total"", ""amount"": 20.0, ""category"": ""Other""}, {""item"": ""Tip"", ""amount"": 20.0, ""category"": ""Other""}]","[{""item"": ""Meal"", ""amount"": 20.00, ""category"": ""Food""}, {""item"": ""Tip"", ""amount"": 20.00, ""category"": ""Food""}]",FAIL
11,12,Edge_Logic,6.35,1.0,1.0,1.0,I purchased a game controller for $50 and bought some snacks for $5.,"[{""item"": ""Game Controller"", ""amount"": 50.0, ""category"": ""Retail""}, {""item"": ""Snacks"", ""amount"": 5.0, ""category"": ""Food""}]","[{""item"": ""Game Controller"", ""amount"": 50.00, ""category"": ""Retail""}, {""item"": ""Snacks"", ""amount"": 5.00, ""category"": ""Food""}]",PASS
12,13,Edge_Logic,5.14,0.0,0.888889,0.666667,I need to get some gas that was $72.50. Then I picked up some dog food for $30 and I saw a nice shirt for $55.,"[{""item"": ""Gas"", ""amount"": 72.5, ""category"": ""Transport""}, {""item"": ""Dog Food"", ""amount"": 30.0, ""category"": ""Food""}, {""item"": ""Shirt"", ""amount"": 55.0, ""category"": ""Retail""}]","[{""item"": ""Gas"", ""amount"": 72.50, ""category"": ""Transport""}, {""item"": ""Dog Food"", ""amount"": 30.00, ""category"": ""Retail""}, {""item"": ""Shirt"", ""amount"": 55.00, ""category"": ""Retail""}]",FAIL
16,17,Edge_Logic,5.63,1.0,1.0,1.0,The taxi was $5 and the toll was $3.,"[{""item"": ""Taxi"", ""amount"": 5.0, ""category"": ""Transport""}, {""item"": ""Toll"", ""amount"": 3.0, ""category"": ""Transport""}]","[{""item"": ""Taxi"", ""amount"": 5.00, ""category"": ""Transport""}, {""item"": ""Toll"", ""amount"": 3.00, ""category"": ""Transport""}]",PASS
30,31,Edge_Logic,3.37,0.0,0.666667,0.0,"I paid 1200 for rent, but my roommate transferred me half, so I actually only spent 600.","[{""item"": ""Rent"", ""amount"": 600.0, ""category"": ""Other""}]","[{""item"": ""Rent"", ""amount"": 600.00, ""category"": ""Utilities""}]",FAIL
31,32,Edge_Logic,4.61,0.0,0.833333,0.5,Booked a flight to Paris for 800 and the hotel was 450.,"[{""item"": ""Flight to Paris"", ""amount"": 800.0, ""category"": ""Transport""}, {""item"": ""Hotel"", ""amount"": 450.0, ""category"": ""Retail""}]","[{""item"": ""Flight to Paris"", ""amount"": 800.00, ""category"": ""Transport""}, {""item"": ""Hotel"", ""amount"": 450.00, ""category"": ""Other""}]",FAIL



--- Table 3: Detailed Failure Analysis (Failures Only) ---
DEBUG: Found 21 failures out of 50 total cases.


Unnamed: 0,Case ID,Type,Latency (s),Final JSON Acc,Slot Acc,Category Acc,Transcript,Actual JSON,Expected JSON,Status
33,34,Edge_Accent,3.45,0.0,0.666667,1.0,Grab a coffee cost me about 350.,"[{""item"": ""Coffee"", ""amount"": 350.0, ""category"": ""Food""}]","[{""item"": ""Coffee"", ""amount"": 3.50, ""category"": ""Food""}]",FAIL
36,37,Edge_Accent,3.17,0.0,0.666667,1.0,We had sushi for dinner. It was $40.,"[{""item"": ""Sushi dinner"", ""amount"": 40.0, ""category"": ""Food""}]","[{""item"": ""Sushi"", ""amount"": 40.00, ""category"": ""Food""}]",FAIL
32,33,Edge_Category,3.27,0.0,0.666667,1.0,I made a donation of $50 to charity.,"[{""item"": ""Donation to charity"", ""amount"": 50.0, ""category"": ""Other""}]","[{""item"": ""Donation"", ""amount"": 50.00, ""category"": ""Other""}]",FAIL
10,11,Edge_Logic,4.32,0.0,0.5,0.0,"The total was 18. Wait, let me check that receipt. Make that $20 and another 20 for the tip.","[{""item"": ""Receipt total"", ""amount"": 20.0, ""category"": ""Other""}, {""item"": ""Tip"", ""amount"": 20.0, ""category"": ""Other""}]","[{""item"": ""Meal"", ""amount"": 20.00, ""category"": ""Food""}, {""item"": ""Tip"", ""amount"": 20.00, ""category"": ""Food""}]",FAIL
12,13,Edge_Logic,5.14,0.0,0.888889,0.666667,I need to get some gas that was $72.50. Then I picked up some dog food for $30 and I saw a nice shirt for $55.,"[{""item"": ""Gas"", ""amount"": 72.5, ""category"": ""Transport""}, {""item"": ""Dog Food"", ""amount"": 30.0, ""category"": ""Food""}, {""item"": ""Shirt"", ""amount"": 55.0, ""category"": ""Retail""}]","[{""item"": ""Gas"", ""amount"": 72.50, ""category"": ""Transport""}, {""item"": ""Dog Food"", ""amount"": 30.00, ""category"": ""Retail""}, {""item"": ""Shirt"", ""amount"": 55.00, ""category"": ""Retail""}]",FAIL
30,31,Edge_Logic,3.37,0.0,0.666667,0.0,"I paid 1200 for rent, but my roommate transferred me half, so I actually only spent 600.","[{""item"": ""Rent"", ""amount"": 600.0, ""category"": ""Other""}]","[{""item"": ""Rent"", ""amount"": 600.00, ""category"": ""Utilities""}]",FAIL
31,32,Edge_Logic,4.61,0.0,0.833333,0.5,Booked a flight to Paris for 800 and the hotel was 450.,"[{""item"": ""Flight to Paris"", ""amount"": 800.0, ""category"": ""Transport""}, {""item"": ""Hotel"", ""amount"": 450.0, ""category"": ""Retail""}]","[{""item"": ""Flight to Paris"", ""amount"": 800.00, ""category"": ""Transport""}, {""item"": ""Hotel"", ""amount"": 450.00, ""category"": ""Other""}]",FAIL
15,16,Edge_Noise,3.89,0.0,0.0,0.0,I got a hair cut for $32 and a tip for fire.,"[{""item"": ""Hair cut"", ""amount"": 32.0, ""category"": ""Personal Care""}]","[{""item"": ""Hair Cut"", ""amount"": 32.00, ""category"": ""Personal Care""}, {""item"": ""Tip"", ""amount"": 5.00, ""category"": ""Personal Care""}]",FAIL: Length Mismatch
17,18,Edge_Noise,4.06,0.0,0.166667,0.5,My fee was $6 and my dinner cost $20.,"[{""item"": ""Fee"", ""amount"": 6.0, ""category"": ""Other""}, {""item"": ""Dinner"", ""amount"": 20.0, ""category"": ""Food""}]","[{""item"": ""Coffee"", ""amount"": 6.00, ""category"": ""Food""}, {""item"": ""Dinner"", ""amount"": 20.00, ""category"": ""Food""}]",FAIL
18,19,Edge_Noise,4.59,0.0,0.666667,0.5,I bought a pair of high phones for $70 and mine for $15.,"[{""item"": ""Headphones"", ""amount"": 70.0, ""category"": ""Retail""}, {""item"": ""Mine"", ""amount"": 15.0, ""category"": ""Other""}]","[{""item"": ""Headphones"", ""amount"": 70.00, ""category"": ""Retail""}, {""item"": ""Lunch"", ""amount"": 15.00, ""category"": ""Food""}]",FAIL
