In [23]:
import json
import pandas as pd
from deepeval.test_case import LLMTestCase
from deepeval.test_case import LLMTestCaseParams
from deepeval.metrics import GEval
from deepeval import assert_test

# Directory that holds the experiment result files.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
EXP_RESULT_DIR = "/Users/zhangran/Desktop/BP@UnitedStates/Code/D2D_Data2Dashboard/exp_result"

# Result file of the "sys" run. json.load returns the parsed document; the code
# below indexes it with string keys, so a JSON object (dict) is expected.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform locale.
with open(f"{EXP_RESULT_DIR}/exp01_d2insight_sys_stimhartnow_result.json", "r", encoding="utf-8") as f:
    output_sys = json.load(f)

# Result file of the "gpt4o_domain" run, parsed the same way.
with open(f"{EXP_RESULT_DIR}/exp01_d2insight_gpt4o_domain_stimhartnow_result.json", "r", encoding="utf-8") as f:
    output_gpt4o = json.load(f)

# Flatten the system output: its three analysis sections become one
# space-separated passage, in this fixed order.
sys_analysis = output_sys["analysis"]["analysis"]
insight_sys = " ".join(
    sys_analysis[section]
    for section in ("descriptive", "predictive", "domain_related")
)

# Flatten the GPT-4o output the same way: one "insight" string per topic,
# joined with single spaces in this fixed order.
gpt4o_insights = output_gpt4o["insights"]
insight_gpt4o = " ".join(
    gpt4o_insights[topic]["insight"]
    for topic in (
        "customer_retention",
        "acquisition_channels",
        "customer_demographics",
        "financial_analysis",
        "contract_analysis",
        "premium_customers",
    )
)

# 3. Build one reference-free test case per model (no expected_output is set).
#    Both cases share the same dataset description as their input, so the two
#    outputs are judged against an identical prompt.
dataset_description = "Customer Relationship Management dataset with features like acquisition channel, retention, churn, periods active, etc."

test_case_sys = LLMTestCase(input=dataset_description, actual_output=insight_sys)
test_case_gpt4o = LLMTestCase(input=dataset_description, actual_output=insight_gpt4o)

In [24]:
# 4. GEval metrics. Every metric is judged on the test case's input and actual
#    output only — no expected_output is involved.
def _geval_metric(name, criteria):
    """Build a GEval metric that looks at INPUT and ACTUAL_OUTPUT only."""
    return GEval(
        name=name,
        criteria=criteria,
        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    )


insightful = _geval_metric(
    "Insightful",
    "Does the output offer a deep or non-obvious understanding? Does it connect patterns or trends that aren't immediately apparent?",
)

novelty = _geval_metric(
    "Novelty",
    "Does the output go beyond generic interpretation? Would it surprise or teach something new to a domain expert?",
)

# NOTE(review): the variable is called `domain_relevance` but the metric is
# reported under the name "Depth" — confirm which label is intended.
domain_relevance = _geval_metric(
    "Depth",
    "Does the analysis demonstrate deep domain expertise in the specific domain?",
)

In [25]:
# 5. (Disabled) Assertion-based evaluation run, kept for reference only.
# print("\n=== Insight Evaluation Report ===")
# assert_test(test_case_sys, [insightful, novelty, domain_relevance])

In [None]:
from deepeval import evaluate

# Score the system-generated insight on all three GEval metrics.
results_sys = evaluate(
    test_cases=[test_case_sys],
    metrics=[insightful, novelty, domain_relevance],
)

# ── pretty-print ───────────────────────────────
# Iterating the result object yields (field name, value) pairs; the recorded
# output shows the label "test_results". Only list-valued fields hold per-test
# results. Anything else (None, or a scalar such as a string) is reported as
# empty instead of being iterated element by element, which would crash.
for label, test_results in results_sys:
    print(f"\n==== Label: {label} ====")

    if not isinstance(test_results, list):
        print("⚠️  No results for this label.\n")
        continue

    for test_result in test_results:
        print(f"Input:  {test_result.input}")
        print(f"Output: {test_result.actual_output[:300]}...")

        # Tolerate a result that carries no metric data at all.
        for metric in test_result.metrics_data or []:
            # The score can be None (e.g. when the metric errored); show a
            # placeholder instead of failing on the float format, and fall
            # back to the error text when there is no reason.
            score_text = "n/a" if metric.score is None else f"{metric.score:.2f}"
            detail = metric.reason if metric.reason is not None else metric.error
            print(f"{metric.name:<25}: {score_text}  |  {detail}")





Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.34s/test case]



Metrics Summary

  - ✅ Insightful (GEval) (score: 0.8991645704395317, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output provides deeper insights by analyzing churn rates, customer value profiles, and acquisition strategies, which are not immediately obvious from the input. It connects patterns between acquisition channels, customer demographics, and retention likelihood, revealing relationships not directly mentioned in the input. The output challenges assumptions by highlighting the importance of internal channels and premium status in revenue contribution, offering a unique perspective on CRM strategies., error: None)
  - ✅ Novelty (GEval) (score: 0.577571325829566, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output provides a detailed analysis of the dataset, highlighting the importance of acquisition channels, customer demographics, and retention strategies. However, it largely reiterates common CRM insights without offering nove





==== Label: test_results ====
Input:  Customer Relationship Management dataset with features like acquisition channel, retention, churn, periods active, etc.
Output: The dataset provides a comprehensive view of customer interactions and financial metrics within a CRM framework. Notably, the 'Current Customer (Y/N)' column indicates a significant churn rate, with many customers no longer active. The 'Revenue, Total' and 'Expected CLV' columns show a wide variance...
Insightful (GEval)       : 0.90  |  The output provides deeper insights by analyzing churn rates, customer value profiles, and acquisition strategies, which are not immediately obvious from the input. It connects patterns between acquisition channels, customer demographics, and retention likelihood, revealing relationships not directly mentioned in the input. The output challenges assumptions by highlighting the importance of internal channels and premium status in revenue contribution, offering a unique perspective on CRM 

In [27]:
# Score the GPT-4o-generated insight on the same three GEval metrics.
results_gpt4o = evaluate(
    test_cases=[test_case_gpt4o],
    metrics=[insightful, novelty, domain_relevance],
)

# ── pretty-print ───────────────────────────────
# Iterating the result object yields (field name, value) pairs; the recorded
# output shows the label "test_results". Only list-valued fields hold per-test
# results. Anything else (None, or a scalar such as a string) is reported as
# empty instead of being iterated element by element, which would crash.
for label, test_results in results_gpt4o:
    print(f"\n==== Label: {label} ====")

    if not isinstance(test_results, list):
        print("⚠️  No results for this label.\n")
        continue

    for test_result in test_results:
        print(f"Input:  {test_result.input}")
        print(f"Output: {test_result.actual_output[:300]}...")

        # Tolerate a result that carries no metric data at all.
        for metric in test_result.metrics_data or []:
            # The score can be None (e.g. when the metric errored); show a
            # placeholder instead of failing on the float format, and fall
            # back to the error text when there is no reason.
            score_text = "n/a" if metric.score is None else f"{metric.score:.2f}"
            detail = metric.reason if metric.reason is not None else metric.error
            print(f"{metric.name:<25}: {score_text}  |  {detail}")

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.86s/test case]



Metrics Summary

  - ✅ Insightful (GEval) (score: 0.8361800300238462, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The Actual Output identifies deeper insights such as potential issues with customer retention strategies, reliance on external channels for acquisition, and the opportunity for growth in premium customers, which are not immediately apparent from the Input., error: None)
  - ❌ Novelty (GEval) (score: 0.4345197488302194, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output provides some insights like the reliance on external channels and potential growth in premium customers, but these are not particularly novel or surprising to a domain expert. The analysis lacks depth and does not introduce new concepts or findings beyond a generic interpretation of the dataset., error: None)
  - ✅ Depth (GEval) (score: 0.8143466290986604, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output accurately discusses customer r





==== Label: test_results ====
Input:  Customer Relationship Management dataset with features like acquisition channel, retention, churn, periods active, etc.
Output: The majority of customers have terminated their contracts, indicating potential issues with customer retention strategies. Most customers are acquired through external channels, suggesting a reliance on external marketing or partnerships for customer acquisition. The majority of customers are small ...
Insightful (GEval)       : 0.84  |  The Actual Output identifies deeper insights such as potential issues with customer retention strategies, reliance on external channels for acquisition, and the opportunity for growth in premium customers, which are not immediately apparent from the Input.
Novelty (GEval)          : 0.43  |  The output provides some insights like the reliance on external channels and potential growth in premium customers, but these are not particularly novel or surprising to a domain expert. The analysis 