In [None]:
# GENERAL CRITERIA EVAL BENCHMARK

from src.metrics import Metrics
import pandas as pd
from dotenv import load_dotenv
from tabulate import tabulate
load_dotenv()
import os

# Model name and API key are read from the process environment
# (load_dotenv() has already been called above). Either value is None
# when the corresponding variable is unset.
model = os.environ.get("MODEL")
api_key = os.environ.get("OPENAI_API_KEY")

# Evaluator whose criteria_eval() is exercised by the benchmark loop.
metrics = Metrics(api_key=api_key, model=model)

# Benchmark cases; the loop reads the "assistant", "criterion" and
# "satisfies" columns of this file.
df = pd.read_csv("eval_benchmark.csv")

# Confusion-matrix buckets; each one collects the benchmark rows that
# landed in it so they can be inspected afterwards.
results = {
    name: []
    for name in (
        "true_positives",
        "false_positives",
        "true_negatives",
        "false_negatives",
    )
}

# Maps (evaluator said True, evaluator agrees with label) -> bucket name.
bucket_for = {
    (True, True): "true_positives",
    (True, False): "false_positives",
    (False, True): "true_negatives",
    (False, False): "false_negatives",
}

for _, row in df.iterrows():
    # One criterion per call, so only the first entry of "results" is read.
    full_result = metrics.criteria_eval(
        content=row["assistant"],
        criteria=[row["criterion"]],
    )
    result = full_result["results"][0]["result"]

    expected = row["satisfies"]

    # NOTE(review): only the literal True counts as a positive prediction;
    # any other value (False, None, a string, ...) is treated as negative.
    # Confirm criteria_eval always yields a plain bool here.
    predicted_positive = result is True
    agrees = True if result == expected else False
    results[bucket_for[(predicted_positive, agrees)]].append(row)

    print(f"Assistant: {row['assistant']} | Criterion: {row['criterion']} | Result: {result} | Expected: {expected}")

# Confusion-matrix counts, taken from the size of each bucket.
tp, fp, tn, fn = (
    len(results[name])
    for name in (
        "true_positives",
        "false_positives",
        "true_negatives",
        "false_negatives",
    )
)

# Precision: share of positive predictions that were correct.
# Falls back to 0 when nothing was predicted positive.
if (tp + fp) > 0:
    precision = tp / (tp + fp)
else:
    precision = 0

# Recall: share of expected positives that were found.
# Falls back to 0 when there were no expected positives.
if (tp + fn) > 0:
    recall = tp / (tp + fn)
else:
    recall = 0

# F1: harmonic mean of precision and recall; 0 when both are 0.
if (precision + recall) > 0:
    f1 = 2 * precision * recall / (precision + recall)
else:
    f1 = 0

# Raw counts first, then the derived scores rendered with two decimals.
table = [
    [label, count]
    for label, count in (
        ("True Positives", tp),
        ("False Positives", fp),
        ("True Negatives", tn),
        ("False Negatives", fn),
    )
]
table += [
    [label, f"{score:.2f}"]
    for label, score in (
        ("Precision", precision),
        ("Recall", recall),
        ("F1 Score", f1),
    )
]

summary = tabulate(table, headers=["Metric","Value"], tablefmt="fancy_grid")
print(summary)

In [None]:
# CLAIM CHECK EXAMPLE

from src.data_sources import DataSource
from src.metrics import Metrics
from dotenv import load_dotenv
load_dotenv()
import os

# Model name and API key are read from the process environment
# (load_dotenv() has already been called above); None when unset.
# NOTE(review): MODEL is the same variable the criteria benchmark cell
# reads, but this cell pairs it with ANTHROPIC_API_KEY - confirm that the
# configured model matches this key.
model = os.environ.get("MODEL")
api_key = os.environ.get("ANTHROPIC_API_KEY")

metrics = Metrics(api_key=api_key, model=model)

# Text whose claims will be checked.
with open("test_content.txt", encoding="utf-8") as f:
    test_content = f.read()

# Pages passed to claim_check as the web data source.
reference_urls = [
    "https://www.cdc.gov/diabetes/healthy-eating/diabetes-meal-planning.html",
    "https://www.cdc.gov/diabetes/hcp/clinical-guidance/index.html",
    "https://www.who.int/news-room/fact-sheets/detail/diabetes",
]

result = metrics.claim_check(
    content=test_content,
    data_source=DataSource.WEB,
    urls=reference_urls,
)

print(result)