In [None]:
import os
# List the kernel's current working directory. The relative paths used in
# later cells (e.g. "configs/deepeval_config.yaml") resolve against it.
os.listdir()

In [None]:
from dotenv import load_dotenv

# Read a local .env file (if present) and export its variables to os.environ.
load_dotenv()

# Re-assigning os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") is a
# no-op when the key is present and fails with an opaque
# "TypeError: str expected, not NoneType" when it is absent. Check explicitly
# and fail with an actionable message instead.
if os.getenv("OPENAI_API_KEY") is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

In [None]:
from src.deepeval_engine.config_loader import build_runtime_config
from typing import List, Dict, Any
import json

from src.deepeval_engine.model_runner import ModelRunner
from deepeval.test_case import LLMTestCase
from src.deepeval_engine.deepeval_evaluator import DeepEvalEvaluator

In [None]:
# YAML file with the evaluation settings, relative to the working directory.
config_path = "configs/deepeval_config.yaml"
# Build the runtime configuration from that file. The first positional argument
# is passed as None; its meaning is defined in config_loader (TODO confirm).
runtime_cfg = build_runtime_config(None, config_path=config_path)

In [None]:
# Display the resolved configuration for a visual sanity check.
runtime_cfg

In [None]:
# `or {}` covers both a missing "dataset" key and an explicit null/empty value,
# so the .get("path") below never runs on None.
dataset_info = runtime_cfg.get("dataset") or {}
# None when the dataset section has no "path" entry.
dataset_path = dataset_info.get("path")
dataset_path

In [None]:
def load_prompts_from_file(path: str) -> List[Dict[str, Any]]:
    """Load evaluation prompts from a JSON file.

    Args:
        path: Path to a UTF-8 encoded JSON file whose top-level value is a
            list of objects, e.g. ``[{"input": "...", "expected": "..."}]``.

    Returns:
        The parsed list of prompt dictionaries, unchanged.

    Raises:
        ValueError: If the top-level JSON value is not a list, if any entry of
            the list is not a JSON object, or if the file is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        OSError: If the file cannot be opened.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError("prompts file must be a JSON list of {input, expected?}")
    # Callers use dict access (`entry.get(...)`) on every item, so reject
    # malformed entries here with a message that points at the offending index.
    for index, entry in enumerate(data):
        if not isinstance(entry, dict):
            raise ValueError(
                f"prompts file entry {index} must be a JSON object, "
                f"got {type(entry).__name__}"
            )
    return data

In [None]:
# Read the prompt entries from the configured dataset file and display them.
prompts = load_prompts_from_file(dataset_path)
prompts

In [None]:
# Model identifier and provider come straight from the runtime config;
# a missing key raises KeyError here rather than later.
model_name = runtime_cfg["model_name"]
provider = runtime_cfg["provider"]
print(f"Model: {model_name} | Provider: {provider}")

In [None]:
# Runner for the configured model/provider.
# NOTE(review): despite the name, this runner is used in the next step to
# generate the responses under evaluation. Confirm whether it is also meant
# to act as the judge.
judge_model = ModelRunner(model_name=model_name, provider=provider)
judge_model

In [None]:
# Generation parameters live under the "generation" section of the runtime
# config and are coerced to int / float before use.
generation_cfg = runtime_cfg["generation"]
max_tokens = int(generation_cfg["max_tokens"])
temperature = float(generation_cfg["temperature"])

# One prompt string per dataset entry; entries without "input" yield "".
prompt_inputs = [entry.get("input", "") for entry in prompts]
responses = judge_model.generate_batch(
    prompt_inputs,
    max_tokens=max_tokens,
    temperature=temperature,
)
responses

In [None]:
# Pair every prompt with its generated response and wrap the pair in an
# LLMTestCase plus a small metadata record.
test_cases = []

for i, (p, out) in enumerate(zip(prompts, responses), 1):
    retrieval_context = p.get("retrieval_context")
    tc = LLMTestCase(
        input=p.get("input", ""),
        # Missing outputs / expectations are normalised to empty strings.
        actual_output=out or "",
        expected_output=p.get("expected") or "",
        retrieval_context=retrieval_context,
    )

    metadata = {
        # Fall back to a 1-based positional id when the prompt carries none.
        "sample_id": p.get("id", f"sample_{i}"),
        "protected_attributes": {
            "category": p.get("category", "general"),
            "difficulty": p.get("difficulty", "unknown"),
        },
    }
    test_cases.append({"test_case": tc, "metadata": metadata})

# zip() stops at the shorter sequence, so a short `responses` would silently
# drop prompts from the evaluation. Surface that instead of under-reporting.
if len(test_cases) != len(prompts):
    raise ValueError(
        f"expected one response per prompt, but only {len(test_cases)} of "
        f"{len(prompts)} prompts received a response"
    )

test_cases

In [None]:
class _SimpleConfig:
    """Minimal stand-in for a configuration object.

    Exposes exactly two attributes: ``model`` (with ``model_id`` set to None)
    and ``dataset`` (with ``name`` set to "evaluation_dataset").
    """

    def __init__(self) -> None:
        # Throwaway classes whose class-level attributes carry the values.
        model_stub = type("obj", (object,), {"model_id": None})
        dataset_stub = type("obj", (object,), {"name": "evaluation_dataset"})
        self.model = model_stub()
        self.dataset = dataset_stub()


# Anonymous holder exposing the stub configuration as `.config`.
config_manager = type(
    "obj",
    (object,),
    {"config": _SimpleConfig()},
)()

In [None]:
# Instantiate the evaluator with the stub config manager plus the output
# directory and thresholds taken from the runtime config.
evaluator = DeepEvalEvaluator(
    config_manager=config_manager,
    output_dir=runtime_cfg["output_dir"],
    metric_thresholds=runtime_cfg["thresholds"],
)

In [None]:
# Metric selection/settings from the runtime config, shown for inspection
# before they are handed to the evaluator.
metrics_config = runtime_cfg["metrics_config"]
metrics_config

In [None]:
# Score every prepared test case with the configured metrics.
# `results` is serialised with json.dump afterwards, so it is presumably a
# JSON-compatible structure (TODO confirm).
results = evaluator.run_evaluation(test_cases_data=test_cases, metrics_config=metrics_config)
results

In [None]:
# Persist the raw evaluation results next to the evaluator's other artifacts.
results_full_path = os.path.join(runtime_cfg["output_dir"], "deepeval_results.json")
# open(..., "w") does not create missing parent directories, so make sure the
# target directory exists first ("." covers an empty output_dir).
os.makedirs(os.path.dirname(results_full_path) or ".", exist_ok=True)
with open(results_full_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    json.dump(results, f, indent=4, ensure_ascii=False)

## Gatekeeper Implementation

In [1]:
from src.deepeval_engine.gatekeeper import _load_summary, _load_suite_yaml, evaluate_gate, _compare_value

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# NOTE(review): the file name embeds a run timestamp, so this path only exists
# for that particular run. Update it when re-running the evaluation.
summary_path = "artifacts/deepeval_results/deepeval_summary_20251112_130250.json"
# NOTE(review): "suits" may be a typo for "suites". Confirm against the actual
# directory name before changing it.
suite_path = "suits/suite_core.yaml"

# Load the evaluation summary and the gate definition via the gatekeeper's
# private helpers.
summary = _load_summary(summary_path)
suite = _load_suite_yaml(suite_path)

In [3]:
# `or {}` also normalises an explicit null/empty value to an empty dict, so
# both names are safe to iterate and index below.
metric_summaries = summary.get("metric_summaries", {}) or {}
metrics_cfg = suite.get("metrics", {}) or {}

In [4]:
# Pick the first metric of the suite and the first of its thresholds so the
# comparison can be inspected by hand. Nothing is assigned when the suite
# has no metrics, and stat_name / threshold stay unset when the metric
# defines no thresholds.
if metrics_cfg:
    metric_name, cfg = next(iter(metrics_cfg.items()))
    comparison = (cfg or {}).get("comparison", "gte")
    metric_stats = metric_summaries[metric_name]
    thresholds = (cfg or {}).get("thresholds", {}) or {}
    if thresholds:
        stat_name, threshold = next(iter(thresholds.items()))

In [5]:
# The statistic and threshold captured from the first metric of the suite.
stat_name, threshold

('average_score', 0.99)

In [6]:
# This tests membership among the top-level keys of metric_summaries, which
# are metric names. Statistic names such as "average_score" live one level
# down, in the per-metric dict (metric_stats), so the result here is False.
stat_name in metric_summaries

False

In [7]:
# Per-metric statistics for the selected metric; this is where stat_name is found.
metric_stats

{'average_score': 0.965,
 'pass_rate': 100.0,
 'min_score': 0.909,
 'max_score': 1.0,
 'total_evaluated': 4}

In [8]:
# Run the gate comparison for the selected metric/statistic. Both the observed
# value and the threshold are coerced to float first. The recorded output is a
# (bool, message) tuple.
_compare_value(
    metric_name,
    stat_name,
    float(metric_stats[stat_name]),
    float(threshold),
    comparison
)

(False, 'Knowledge Retention.average_score: 0.965 < threshold 0.990')