# Imports and Config

In [1]:
import os
import json
from deepeval.metrics import FaithfulnessMetric, HallucinationMetric
from deepeval import evaluate
from deepeval.test_case import LLMTestCase

In [2]:
# The API key is supplied through the environment (for example, exported in the
# shell that launched Jupyter) and is never written into the notebook itself.
# setdefault leaves an already-exported key untouched instead of overwriting it
# with an empty string; it only falls back to "" so the variable is still defined
# when nothing was exported.
os.environ.setdefault("OPENAI_API_KEY", "")

In [3]:
# Name of the model handed to both metrics as their `model` argument.
model_to_use: str = "o4-mini"

# Base Data

In [4]:
# Reference listing record for the property. It is serialized to JSON further down
# and passed to the test cases as their context. Keyword order here is the key
# order of the resulting dict, so it also fixes the order of the serialized JSON.
property_listing = dict(
    security_deposit=2000,
    monthly_rent=2500,
    pet_policy=["Small dogs allowed (under 25 lbs.)"],
    smoking_policy="Non-smoking property",
    amenities=[
        "Swimming pool", "Fitness center", "In-unit laundry",
        "Central air conditioning", "Balcony/Patio",
        "Smart home features", "Assigned parking",
    ],
    lease_term="12 months",
    property_type="Apartment",
    bedrooms=2,
    bathrooms=2,
    square_feet=1200,
    furnished=False,
    utilities_included=["Water", "Trash removal"],
    laundry="In-unit washer/dryer",
    parking="One assigned space",
    neighborhood="Downtown",
    year_built=2020,
    accessibility=["Elevator", "Wheelchair ramp"],
    appliances=[
        "Stainless steel refrigerator", "Gas range",
        "Dishwasher", "Microwave",
    ],
    flooring="Hardwood",
    availability_date="2024-03-01",
    application_fee=75,
    property_services=["24/7 maintenance", "Package reception", "On-site management"],
    outdoor_spaces=["Community garden", "Rooftop terrace"],
    security_features=["Keyless entry", "Security cameras", "Intercom system"],
    transportation=["0.5 miles to subway station", "Bike share station on-site"],
    name="CityView Luxury Apartments",
    phone="555-123-4567",
    management_company="Urban Living Properties",
    website="https://cityviewluxury.com",
    address="123 Main St, Metropolis, NY 10001",
)

In [5]:
# JSON text form of the listing; the test cases take their context as a list of
# strings, so the dict is serialized once here and reused.
property_listing_str: str = json.dumps(property_listing)

In [6]:
# Faithfulness sample: the reply quotes the phone number 000-000-0000, while the
# listing's "phone" field is 555-123-4567. The listing is supplied as
# `retrieval_context`.
factual_test_case = LLMTestCase(
    input=(
        "P. S. My roommate will give you very positive references "
        "for the indoor-only felines."
    ),
    actual_output=(
        "A member of our team will be reaching out shortly to confirm this date and time. "
        "If you need immediate assistance or would like to confirm the requested tour, "
        "you can contact us directly at 000-000-0000."
    ),
    retrieval_context=[property_listing_str],
)

In [7]:
# Hallucination sample: the reply says cats are allowed, while the listing's
# "pet_policy" only mentions small dogs under 25 lbs. The listing is supplied as
# `context` (not `retrieval_context`) for this test case.
hallu_test_case = LLMTestCase(
    input=(
        "I've been staying with a best friend in CityView. "
        "Looking for a place of my own for me and the 2 indoor cats. "
        "I am retired and want to have my own furnishings, TV choices, etc."
    ),
    actual_output=(
        "Hi there! Thanks for getting in touch with CityView Luxury Apartments. "
        "We're happy to let you know that we allow cats. "
        "We provide a variety of services and amenities and are eager to assist you in any way we can. "
        "When would be a good time for you to visit our property for a tour "
        "and chat about any questions you may have?"
    ),
    context=[property_listing_str],
)

# Eval Metrics

In [8]:
# Faithfulness metric judged by `model_to_use`, with a 0.7 threshold.
# include_reason=True requests a textual reason alongside the score.
fact_metric = FaithfulnessMetric(
    model=model_to_use,
    threshold=0.7,
    include_reason=True,
)

In [9]:
# Hallucination metric judged by `model_to_use`, with a 0.7 threshold.
# include_reason=True requests a textual reason alongside the score.
# NOTE(review): in the recorded run a score of 1.0 FAILED against this threshold,
# so the score looks like "lower is better" and 0.7 acts as a ceiling - confirm
# that such a lenient ceiling is intended.
hallu_metric = HallucinationMetric(
    model=model_to_use,
    threshold=0.7,
    include_reason=True,
)

# Factual Evaluation

In [10]:
# Score the faithfulness sample. The call stays the last expression of the cell
# so that the returned result is displayed.
evaluate(
    test_cases=[factual_test_case],
    metrics=[fact_metric],
)

Evaluating 1 test case(s) in parallel: |██████████████████████████████████████████████████████████████████████████████████████████████████|100% (1/1) [Time Taken: 00:11, 11.63s/test case]



Metrics Summary

  - ❌ Faithfulness (score: 0.5, threshold: 0.7, strict: False, evaluation model: o4-mini, reason: The score is 0.50 because the actual output misstates the contact phone number as 000-000-0000 rather than the correct 555-123-4567 from the retrieval context., error: None)

For test case:

  - input: P. S. My roommate will give you very positive references for the indoor-only felines.
  - actual output: A member of our team will be reaching out shortly to confirm this date and time. If you need immediate assistance or would like to confirm the requested tour, you can contact us directly at 000-000-0000.
  - expected output: None
  - context: None
  - retrieval context: ['{"security_deposit": 2000, "monthly_rent": 2500, "pet_policy": ["Small dogs allowed (under 25 lbs.)"], "smoking_policy": "Non-smoking property", "amenities": ["Swimming pool", "Fitness center", "In-unit laundry", "Central air conditioning", "Balcony/Patio", "Smart home features", "Assigned parking"], "




EvaluationResult(test_results=[TestResult(name='test_case_0', success=False, metrics_data=[MetricData(name='Faithfulness', threshold=0.7, success=False, score=0.5, reason='The score is 0.50 because the actual output misstates the contact phone number as 000-000-0000 rather than the correct 555-123-4567 from the retrieval context.', strict_mode=False, evaluation_model='o4-mini', error=None, evaluation_cost=0.0106359, verbose_logs='Truths (limit=None):\n[\n    "The security deposit is 2000.",\n    "The monthly rent is 2500.",\n    "Small dogs under 25 lbs. are allowed under the pet policy.",\n    "The smoking policy designates this as a non-smoking property.",\n    "The amenities include a swimming pool.",\n    "The amenities include a fitness center.",\n    "The amenities include in-unit laundry.",\n    "The amenities include central air conditioning.",\n    "The amenities include a balcony or patio.",\n    "The amenities include smart home features.",\n    "The amenities include assign

# Hallucination Evaluation

In [11]:
# Score the hallucination sample. The call stays the last expression of the cell
# so that the returned result is displayed.
evaluate(
    test_cases=[hallu_test_case],
    metrics=[hallu_metric],
)

Evaluating 1 test case(s) in parallel: |██████████████████████████████████████████████████████████████████████████████████████████████████|100% (1/1) [Time Taken: 00:05,  5.60s/test case]



Metrics Summary

  - ❌ Hallucination (score: 1.0, threshold: 0.7, strict: False, evaluation model: o4-mini, reason: The score is 1.00 because the output incorrectly states that cats are allowed while the context only permits small dogs under 25 lbs., error: None)

For test case:

  - input: I've been staying with a best friend in CityView. Looking for a place of my own for me and the 2 indoor cats. I am retired and want to have my own furnishings, TV choices, etc.
  - actual output: Hi there! Thanks for getting in touch with CityView Luxury Apartments. We're happy to let you know that we allow cats. We provide a variety of services and amenities and are eager to assist you in any way we can. When would be a good time for you to visit our property for a tour and chat about any questions you may have?
  - expected output: None
  - context: ['{"security_deposit": 2000, "monthly_rent": 2500, "pet_policy": ["Small dogs allowed (under 25 lbs.)"], "smoking_policy": "Non-smoking property", "




EvaluationResult(test_results=[TestResult(name='test_case_0', success=False, metrics_data=[MetricData(name='Hallucination', threshold=0.7, success=False, score=1.0, reason='The score is 1.00 because the output incorrectly states that cats are allowed while the context only permits small dogs under 25 lbs.', strict_mode=False, evaluation_model='o4-mini', error=None, evaluation_cost=0.0033682, verbose_logs='Verdicts:\n[\n    {\n        "verdict": "no",\n        "reason": "The context specifies a pet policy of only small dogs (under 25 lbs.) and does not mention cats. The actual output incorrectly states that cats are allowed."\n    }\n]')], conversational=False, multimodal=False, input="I've been staying with a best friend in CityView. Looking for a place of my own for me and the 2 indoor cats. I am retired and want to have my own furnishings, TV choices, etc.", actual_output="Hi there! Thanks for getting in touch with CityView Luxury Apartments. We're happy to let you know that we allow