In [35]:
# This script shows how to call `run_batch_analysis_from_folder` in a pipeline
# and save / print the results with detailed history visualization

import os
import sys
from glob import glob
from typing import List
import json
import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns
from deepeval.metrics import GEval
from deepeval import assert_test
from deepeval.test_case import LLMTestCase
from deepeval.test_case import LLMTestCaseParams
from deepeval import evaluate

# ------------------------------------------------------------------
# 1. Make sure Python can find your agent package / module
#    (adapt the path if your repo layout is different)
# ------------------------------------------------------------------
# Notebooks do not define `__file__`, so the *string* "__file__" is used as a
# placeholder file name: abspath() resolves it against the current working
# directory, and the two dirname() calls then yield the parent of that
# directory (presumably the repository root -- confirm for your layout).
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath("__file__")))
# Guard the append so re-running this cell does not pile up duplicate entries.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

from agents.d2insight_agent_sys import run_domain_detector   # noqa: E402

In [None]:
def load_dataset_dict(dataset_json_path: str) -> dict:
    """Load and parse a dataset JSON file.

    Args:
        dataset_json_path: Path to the JSON file to read.

    Returns:
        The parsed JSON content, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's default locale encoding.
    with open(dataset_json_path, "r", encoding="utf-8") as f:
        return json.load(f)

In [13]:
def run_batch_analysis_from_folder(folder_path: str, max_cycles: int = 5, json: bool = False) -> List[dict]:
    """
    Run the domain detector on every dataset file in a folder.

    Args:
        folder_path: Path to the folder containing the dataset files
        max_cycles: Max improvement cycles per dataset (forwarded to
            `run_domain_detector`)
        json: If True, pick up ``*.json`` files instead of ``*.csv`` files.
            NOTE: inside this function the name shadows the `json` module;
            the parameter name is kept for backward compatibility.

    Returns:
        A list with one summary dict per file (sorted by path), each holding
        the keys ``filename``, ``domain``, ``core_concepts``, ``descriptive``,
        ``predictive``, ``domain_related``, ``final_scores``,
        ``num_iterations`` and ``history``. Wrap it in ``pd.DataFrame(...)``
        if a tabular view is needed.
    """
    extension = "json" if json else "csv"
    # glob() returns paths in filesystem-dependent order; sort them so that
    # positional access to the results (e.g. `results[0]`) is reproducible.
    paths = sorted(glob(os.path.join(folder_path, f"*.{extension}")))

    print(f"📁 Found {len(paths)} {extension.upper()} files in {folder_path}")

    all_results = []

    for path in paths:
        filename = os.path.basename(path)
        print(f"\n📂 Analyzing: {filename}")
        result = run_domain_detector(path, max_cycles=max_cycles)

        # `or {}` / `or []` also cover keys that are present but set to None,
        # which a plain `.get(key, default)` would pass through and crash on.
        analysis = result.get("analysis") or {}
        insights = analysis.get("analysis") or {}

        all_results.append({
            "filename": filename,
            "domain": analysis.get("domain", "Unknown"),
            "core_concepts": ", ".join(analysis.get("core_concepts") or []),
            "descriptive": insights.get("descriptive", ""),
            "predictive": insights.get("predictive", ""),
            "domain_related": insights.get("domain_related", ""),
            "final_scores": result.get("scores", {}),
            "num_iterations": result.get("iteration", 0),
            "history": result.get("history", [])
        })

    return all_results

In [14]:
# Folder with the sample datasets, relative to the notebook's working directory.
notebook_folder = "../data/sample/"

# Analyse every CSV file found in that folder, allowing up to 5 improvement
# cycles per dataset.
insights_sys = run_batch_analysis_from_folder(
    folder_path=notebook_folder,
    max_cycles=5,
)

📁 Found 1 CSV files in ../data/sample/

📂 Analyzing: flag-1.csv
Successfully read CSV: 500 rows, 14 columns
Data profile built successfully
Starting analysis with max_cycles=5
domain_node {'profile': {'raw': {'n_rows': 500, 'n_cols': 14, 'columns': {'category': {'dtype': 'object', 'unique_ratio': 0.01, 'sample': ['Software', 'Hardware', 'Hardware', 'Hardware', 'Hardware']}, 'state': {'dtype': 'object', 'unique_ratio': 0.004, 'sample': ['Closed', 'Closed', 'Closed', 'Resolved', 'Closed']}, 'closed_at': {'dtype': 'object', 'unique_ratio': 1.0, 'sample': ['2023-01-03 11:04:00.000000000', '2023-01-11 01:17:39.128189467', '2023-01-21 03:18:58.590910419', '2023-01-05 17:54:36.886511927', '2023-01-06 16:52:00.000000000']}, 'opened_at': {'dtype': 'object', 'unique_ratio': 1.0, 'sample': ['2023-01-02 11:04:00', '2023-01-03 10:19:00', '2023-01-04 06:37:00', '2023-01-04 06:53:00', '2023-01-05 16:52:00']}, 'closed_by': {'dtype': 'object', 'unique_ratio': 0.01, 'sample': ['Charlie Whitherspoon', 'B

In [32]:
# Merge the three insight sections of the first analysed file into a single
# space-separated string.
insights_sys_full = " ".join(
    insights_sys[0][section]
    for section in ("descriptive", "predictive", "domain_related")
)

In [34]:
# Define GEval metrics
def _build_geval_metric(name, criteria):
    """Create a GEval metric that is judged on the test case's input and actual output."""
    return GEval(
        name=name,
        criteria=criteria,
        # A fresh list per metric, exactly as when each metric is written out by hand.
        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    )


insightful = _build_geval_metric(
    "Insightfulness",
    "Does the output offer a deep or non-obvious understanding?",
)
novelty = _build_geval_metric(
    "Novelty",
    "Does the output go beyond generic interpretation?",
)
depth = _build_geval_metric(
    "Domain Depth",
    "Does the analysis demonstrate deep domain expertise?",
)

In [39]:
# Parsed content of the flag-1 sample's JSON file (a dict, despite the name).
json_file = load_dataset_dict(dataset_json_path="../data/sample/flag-1.json")

In [43]:
# Reference insights from the dataset JSON, one per line
# ("gt" presumably stands for ground truth -- confirm).
gt_insight = "\n".join(json_file["insights"])

# The reference insights go in as `input`; the agent's combined insights are
# the `actual_output` being judged.
test_case = LLMTestCase(input=gt_insight, actual_output=insights_sys_full)

# Evaluate
results = evaluate(test_cases=[test_case], metrics=[insightful, novelty, depth])



Metrics Summary

  - ❌ Insightfulness [GEval] (score: 0.332082129433083, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The response does not directly address the primary idea of the input, which centers on hardware incidents, especially printer malfunctions in Australia, and the specific issue with 'Printer546'. Instead, it provides generic ITSM analysis, mentioning software, personnel, and general incident management concepts without offering nuanced or novel insights into the specific hardware and printer-related trends described. The output lacks a focused, value-adding perspective on the core input details., error: None)
  - ❌ Novelty [GEval] (score: 0.30774961594644334, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The response provides a generic analysis of IT service management data, mentioning incident categories, personnel, and management processes, but fails to address the specific details from the input such as the predominance of ha

In [45]:
# `results` is iterated as (field name, field value) pairs; the captured run
# of this cell printed the label "test_results".
for label, test_case_list in results:
    print(f"\n==== Label: {label} ====")

    if test_case_list is None:
        print("⚠️  No results for this label.\n")
        continue

    # A scalar field (e.g. a string) must not be iterated element by element;
    # show it as-is instead.
    if not isinstance(test_case_list, (list, tuple)):
        print(test_case_list)
        continue

    # The loop variable is deliberately not called `test_case`, so the
    # LLMTestCase built in the evaluation cell is not overwritten.
    for test_result in test_case_list:
        print(f"Input:  {test_result.input}")
        output_preview = (test_result.actual_output or "")[:300]
        print(f"Output: {output_preview}...")

        for metric in test_result.metrics_data or []:
            # A metric that errored may carry no score (assumed Optional in
            # deepeval -- confirm); ":.2f" on None would raise.
            score_text = "n/a" if metric.score is None else f"{metric.score:.2f}"
            print(f"{metric.name:<25}: {score_text}  |  {metric.reason}")


==== Label: test_results ====
Input:  The Hardware incidents is significantly higher in volume than others
Specific hardware issues related to Printer Malfunctioning are predominantly mentioned in incident descriptions
Most of the hardware incidents are related to printer issues
Most of the hardware incidents are occurring in the Australia location
There is not a significant increase in hardware incidents over time, it is relatively stable and higher than others.
Printer with id 'Printer546' is causing the most issues
Output: The dataset provides a snapshot of IT service management incidents, focusing on categories like Software and Hardware, with a predominant state of 'Closed'. The data reveals a high frequency of incidents in Australia, managed by a few key personnel such as Fred Luddy and Beth Anglin. The priority le...
Insightfulness [GEval]   : 0.33  |  The response does not directly address the primary idea of the input, which centers on hardware incidents, especially printer m