In [1]:
from fact_finder import find_facts
import json
from rich import print
import nest_asyncio
from glob import glob
from os.path import basename
from rich.progress import track

nest_asyncio.apply()

In [2]:
# Get transcripts and structured JSON (gold data or ground truth)
transcripts = glob("financial_dataset/transcripts/*.txt")
gold_data = glob("financial_dataset/structured_data/*.json")

ground_truth = []

for gold_file in track(
    gold_data, description="Processing gold data", total=len(gold_data)
):
    base_name = basename(gold_file).replace(".json", "")
    base_name = base_name.split("_")[-3:]
    transcript_file = f"financial_dataset/transcripts/transcript_client_profile_{base_name[0]}_{base_name[1]}_{base_name[2]}.txt"

    if transcript_file not in transcripts:
        print(
            f"Skipping {gold_file} because it doesn't have a corresponding transcript"
        )
        continue

    with open(gold_file, "r") as f:
        gold_data = json.loads(f.read())
    with open(transcript_file, "r") as f:
        transcript = f.read()

    ground_truth.append((transcript, gold_data))

Output()

In [3]:
from concurrent.futures import ThreadPoolExecutor


def process_single_transcript(transcript_data):
    transcript, gold_data = transcript_data
    facts = find_facts(transcript)
    return facts


raw_results = []
predicted = []
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [
        executor.submit(process_single_transcript, item) for item in ground_truth
    ]
    for future in track(
        futures,
        description="Running FactFinder",
        total=len(ground_truth),
        refresh_per_second=5,
    ):
        facts = future.result()
        raw_results.append(facts)
        predicted.append(facts.model_dump_json(indent=2, exclude_none=True))

Output()

In [6]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.test_case import LLMTestCase
from deepeval import evaluate

correctness = GEval(
    name="Correctness",
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should also heavily penalize incorrect facts.",
        "Omission of detail is allowed and ignore null values.",
    ],
    evaluation_params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
)

In [7]:
test_cases = []
for actual_output, (_, expected_output) in zip(predicted, ground_truth):
    test_case = LLMTestCase(
        input="Do the values in the actual output correctly matched with the expected output?",
        actual_output=actual_output,
        expected_output=json.dumps(expected_output, indent=2, allow_nan=False),
    )
    test_cases.append(test_case)

In [8]:
evaluate(test_cases=test_cases, metrics=[correctness])

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 13 test case(s) in parallel: |██████████|100% (13/13) [Time Taken: 00:04,  3.21test case/s]



Metrics Summary

  - ✅ Correctness (GEval) (score: 0.5447819177347677, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: Actual Output is mostly consistent with Expected Output but has discrepancies: frequencies are incorrectly labeled as 'Annual' instead of 'Annually'; pension details are missing; mortgage-free status contradicts outstanding mortgage in Expected Output., error: None)

For test case:

  - input: Do the values in the actual output correctly matched with the expected output?
  - actual output: {
  "personal_details": {
    "client1": {
      "first_name": "Michael"
    },
    "client2": {
      "first_name": "Amber"
    }
  },
  "employment": {
    "client1": {
      "occupation": "Doctor",
      "employer": "Waters-Gonzales"
    },
    "client2": {
      "occupation": "Firefighter",
      "employer": "Wyatt-Torres"
    }
  },
  "current_address": {},
  "previous_addresses": [],
  "dependants": [],
  "incomes": [
    {
      "owner": "Michael",
      "na




EvaluationResult(test_results=[TestResult(name='test_case_1', success=True, metrics_data=[MetricData(name='Correctness (GEval)', threshold=0.5, success=True, score=0.5447819177347677, reason="Actual Output is mostly consistent with Expected Output but has discrepancies: frequencies are incorrectly labeled as 'Annual' instead of 'Annually'; pension details are missing; mortgage-free status contradicts outstanding mortgage in Expected Output.", strict_mode=False, evaluation_model='gpt-4o', error=None, evaluation_cost=0.00388, verbose_logs='Criteria:\nNone \n \nEvaluation Steps:\n[\n    "Check whether the facts in \'actual output\' contradicts any facts in \'expected output\'",\n    "You should also heavily penalize incorrect facts.",\n    "Omission of detail is allowed and ignore null values."\n]')], conversational=False, multimodal=False, input='Do the values in the actual output correctly matched with the expected output?', actual_output='{\n  "personal_details": {\n    "client1": {\n 