In [5]:
import os
from dotenv import load_dotenv
from pathlib import Path
import openai
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

# Load environment variables from .env file in the same directory
env_path = Path().absolute() / '.env'
load_dotenv(dotenv_path=env_path)

# Configure OpenAI
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
MODEL_NAME = os.getenv('OPENAI_MODEL', 'gpt-4')

if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in environment variables")

# Set OpenAI API key globally
openai.api_key = OPENAI_API_KEY

print("OpenAI API key loaded successfully")

OpenAI API key loaded successfully


In [6]:
# Create correctness evaluation metric
correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should also heavily penalize omission of detail",
        "Vague language, or contradicting OPINIONS, are OK"
    ],
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT
    ]
)

In [7]:
# Create test case
test_case = LLMTestCase(
    input="The dog chased the cat up the tree, who ran up the tree?",
    actual_output="It depends, some might consider the cat, while others might argue the dog.",
    expected_output="The cat."
)

# Run correctness evaluation
correctness_metric.measure(test_case)
print(f"Correctness Score: {correctness_metric.score}")
print(f"Reason: {correctness_metric.reason}")

Correctness Score: 0.24546227667201054
Reason: The actual output allows for interpretation and does not provide the specific fact that the cat ran up the tree as mentioned in the expected output.
