# DeepEval Usage Demo (End-to-End)

This notebook shows how to use a Crucible-generated DeepEval export and run an evaluation loop.

## What this covers
1. Load exported `deepeval` config JSON from Crucible.
2. Generate actual outputs by calling your app endpoint.
3. Build `LLMTestCase` objects.
4. Run DeepEval metrics and inspect results.


In [None]:
# If needed, uncomment and run once:
# %pip install -U deepeval requests pandas


In [None]:
import json
from pathlib import Path

import requests

from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric, HallucinationMetric


## Usage Demo: Load Crucible export

Set the path to the `.json` file downloaded from Crucible (output format: `deepeval`).

In [None]:
# Locate the project root: the working directory itself when it contains
# `backend/`, otherwise its parent (the notebook is then assumed to sit one
# level below the root).
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR if (NOTEBOOK_DIR / 'backend').exists() else NOTEBOOK_DIR.parent
DOWNLOADS_DIR = PROJECT_ROOT / 'downloads'

# Option 1: set explicit filename (leave as None to fall through to Option 2)
EXPLICIT_EXPORT_PATH = None
# EXPLICIT_EXPORT_PATH = DOWNLOADS_DIR / 'crucible_rag_deepeval_YYYYMMDD_HHMMSS.json'

# Option 2: auto-pick latest deepeval export
candidates = sorted(DOWNLOADS_DIR.glob('crucible_*_deepeval_*.json'))


def _export_timestamp_key(path):
    """Rank an export by the timestamp suffix that follows `_deepeval_` in its name.

    Sorting whole paths would let the free-form segment between `crucible_` and
    `_deepeval_` outrank the timestamp; the full name is only a tie-breaker.
    """
    return (path.stem.rsplit('_deepeval_', 1)[-1], path.name)


if EXPLICIT_EXPORT_PATH is not None:
    # An explicit choice always wins over auto-discovery.
    DEEPEVAL_EXPORT_PATH = Path(EXPLICIT_EXPORT_PATH)
elif candidates:
    DEEPEVAL_EXPORT_PATH = max(candidates, key=_export_timestamp_key)
else:
    # Placeholder path so the error below names the directory that was searched.
    DEEPEVAL_EXPORT_PATH = DOWNLOADS_DIR / 'missing-deepeval-export.json'

if not DEEPEVAL_EXPORT_PATH.exists():
    raise FileNotFoundError(f'Update DEEPEVAL_EXPORT_PATH first: {DEEPEVAL_EXPORT_PATH}')

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale encoding.
export_payload = json.loads(DEEPEVAL_EXPORT_PATH.read_text(encoding='utf-8'))
test_specs = export_payload['dataset']['test_cases']
len(test_specs)


## Usage Demo: Call your app to get actual outputs

In [None]:
APP_API_URL = 'http://localhost:8000/chat'  # change to your app endpoint

def call_app(question: str):
    """POST `question` to the app endpoint and return `(answer, contexts)`.

    The request body is `{'question': question}` with a 30 second timeout, and
    HTTP error statuses raise via `raise_for_status()`.

    When the JSON response is a dict containing an `'answer'` key, that value is
    returned together with the `'contexts'` value (an empty list when the key is
    missing or falsy). Any other response shape is returned as its `str()` form
    with an empty context list.
    """
    response = requests.post(APP_API_URL, json={'question': question}, timeout=30)
    response.raise_for_status()
    payload = response.json()

    # Guard clause: anything that is not an `{'answer': ...}` dict is passed
    # through verbatim as text.
    if not isinstance(payload, dict) or 'answer' not in payload:
        return str(payload), []

    answer = payload['answer']
    contexts = payload.get('contexts') or []
    return answer, contexts


In [None]:
# Build one LLMTestCase per exported spec, using the app's live answer as the
# actual output.
cases = []

for spec in test_specs:
    actual_output, retrieved_context = call_app(spec['input'])

    # call_app may hand back a non-list context value; wrap it so the test case
    # always receives a list.
    context_chunks = retrieved_context if isinstance(retrieved_context, list) else [str(retrieved_context)]

    case = LLMTestCase(
        input=spec['input'],
        actual_output=actual_output,
        expected_output=spec.get('expected_output', ''),
        retrieval_context=context_chunks,
        # HallucinationMetric (configured in the evaluation cell) reads `context`,
        # not `retrieval_context`; without it the metric rejects the test case
        # for a missing parameter. The retrieved chunks are used as the
        # reference context here.
        # NOTE(review): if your export carries a ground-truth context per spec,
        # prefer that here — confirm against your export schema.
        context=context_chunks,
    )
    cases.append(case)

len(cases)


## Configure metrics and evaluate

In [None]:
# Thresholds handed to each metric; tune these to tighten or relax the run.
ANSWER_RELEVANCY_THRESHOLD = 0.7
HALLUCINATION_THRESHOLD = 0.5

answer_relevancy = AnswerRelevancyMetric(threshold=ANSWER_RELEVANCY_THRESHOLD)
hallucination = HallucinationMetric(threshold=HALLUCINATION_THRESHOLD)
metrics = [answer_relevancy, hallucination]

# Score every collected test case with every configured metric.
evaluation_result = evaluate(test_cases=cases, metrics=metrics)
evaluation_result


## Optional: Save a simple report

In [None]:
# Write the string form of the evaluation result to <project root>/outputs,
# creating the directory on first use.
OUT_DIR = PROJECT_ROOT / 'outputs'
OUT_DIR.mkdir(parents=True, exist_ok=True)

report_path = OUT_DIR / 'deepeval_summary.txt'
# Explicit UTF-8: the result text can include non-ASCII characters from model
# outputs, which the platform's default encoding may be unable to encode.
report_path.write_text(str(evaluation_result), encoding='utf-8')
report_path
