This notebook contains the code to evaluate the correctness of the fact find task.

I have used GEval to evaluate the correctness of the output. It's a technique to evaluate the correctness of the output by comparing the output with the ground truth with LLM-as-a-judge.

In [1]:
from fact_finder import find_facts
import json
from rich import print
import nest_asyncio
from glob import glob
from os.path import basename
from rich.progress import track

nest_asyncio.apply()

In [2]:
# Get transcripts and structured JSON (gold data or ground truth)
transcripts = glob("financial_dataset/transcripts/*.txt")
gold_data = glob("financial_dataset/structured_data/*.json")

ground_truth = []

for gold_file in track(
    gold_data, description="Processing gold data", total=len(gold_data)
):
    base_name = basename(gold_file).replace(".json", "")
    base_name = base_name.split("_")[-3:]
    transcript_file = f"financial_dataset/transcripts/transcript_client_profile_{base_name[0]}_{base_name[1]}_{base_name[2]}.txt"

    if transcript_file not in transcripts:
        print(
            f"Skipping {gold_file} because it doesn't have a corresponding transcript"
        )
        continue

    with open(gold_file, "r") as f:
        gold_data = json.loads(f.read())
    with open(transcript_file, "r") as f:
        transcript = f.read()

    ground_truth.append((transcript, gold_data))

Output()

In [3]:
from concurrent.futures import ThreadPoolExecutor


def process_single_transcript(transcript_data):
    transcript, gold_data = transcript_data
    facts = find_facts(transcript)
    return facts


raw_results = []
predicted = []
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [
        executor.submit(process_single_transcript, item) for item in ground_truth
    ]
    for future in track(
        futures,
        description="Running FactFinder",
        total=len(ground_truth),
        refresh_per_second=5,
    ):
        facts = future.result()
        raw_results.append(facts)
        predicted.append(facts.model_dump_json(indent=2, exclude_none=True))

Output()

In [4]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.test_case import LLMTestCase
from deepeval import evaluate

correctness = GEval(
    name="Correctness",
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should also heavily penalize incorrect facts.",
        "Omission of detail is allowed and ignore null values.",
    ],
    evaluation_params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
)



In [5]:
test_cases = []
for actual_output, (_, expected_output) in zip(predicted, ground_truth):
    test_case = LLMTestCase(
        input="Do the values in the actual output correctly matched with the expected output?",
        actual_output=actual_output,
        expected_output=json.dumps(expected_output, indent=2, allow_nan=False),
    )
    test_cases.append(test_case)

In [6]:
evaluate(test_cases=test_cases, metrics=[correctness])

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 13 test case(s) in parallel: |██████████|100% (13/13) [Time Taken: 00:05,  2.46test case/s]



Metrics Summary

  - ✅ Correctness (GEval) (score: 0.5112169743418594, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The Actual Output does not contradict Expected Output, but contains inconsistencies such as the income name and the risk tolerance differing between the two. Additionally, the omission of the first client's details and loans mortgages is not noted., error: None)

For test case:

  - input: Do the values in the actual output correctly matched with the expected output?
  - actual output: {
  "personal_details": {},
  "employment": {
    "client1": {
      "employment_status": "Retired"
    },
    "client2": {
      "occupation": "Accountant",
      "employer": "Pollard PLC"
    }
  },
  "previous_addresses": [],
  "dependants": [],
  "incomes": [
    {
      "owner": "Carol",
      "name": "Employment",
      "amount": 66431.0,
      "frequency": "Annual"
    }
  ],
  "expenses": {
    "loan_repayments": [],
    "housing_expenses": [
      {
        "o




EvaluationResult(test_results=[TestResult(name='test_case_12', success=True, metrics_data=[MetricData(name='Correctness (GEval)', threshold=0.5, success=True, score=0.5112169743418594, reason="The Actual Output does not contradict Expected Output, but contains inconsistencies such as the income name and the risk tolerance differing between the two. Additionally, the omission of the first client's details and loans mortgages is not noted.", strict_mode=False, evaluation_model='gpt-4o', error=None, evaluation_cost=0.0034649999999999998, verbose_logs='Criteria:\nNone \n \nEvaluation Steps:\n[\n    "Check whether the facts in \'actual output\' contradicts any facts in \'expected output\'",\n    "You should also heavily penalize incorrect facts.",\n    "Omission of detail is allowed and ignore null values."\n]')], conversational=False, multimodal=False, input='Do the values in the actual output correctly matched with the expected output?', actual_output='{\n  "personal_details": {},\n  "emp