### EXP 01: All Data Pipeline

In [8]:
# This notebook shows how to call `run_batch_analysis_from_folder` in a pipeline,
# save / display the results, and visualize the detailed detection history.

import os
import sys
from glob import glob
from typing import List
import json
import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------------------------------------------------
# 1. Make sure Python can find the agent package / module
#    (adapt the path if your repo layout is different)
# ------------------------------------------------------------------
# The project root is taken to be the parent of the current working
# directory. (The previous form resolved the *string literal* "__file__"
# against the working directory and stripped two path components, which
# yields exactly the same directory, just less obviously.)
ROOT_DIR = os.path.dirname(os.path.abspath(os.getcwd()))

# Guard the insertion so re-running this cell does not pile up duplicate
# entries on sys.path.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

from agents.d2insight_agent_sys import run_domain_detector   # noqa: E402

In [5]:
def _score_as_number(value):
    """Return ``value`` as a finite float, or ``None`` if it is not one."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    # NaN / infinity cannot be rounded to a star count or placed on the axis.
    if number != number or number in (float("inf"), float("-inf")):
        return None
    return number


def _star_bar(value, max_score):
    """Render ``value`` as a ``max_score``-wide bar of filled/empty stars."""
    number = _score_as_number(value)
    if number is None:
        return "n/a"
    # Round to the nearest whole star and clamp into [0, max_score] so the
    # bar always has exactly ``max_score`` characters.
    filled = min(max(int(round(number)), 0), max_score)
    return "★" * filled + "☆" * (max_score - filled)


def visualize_history(result, max_score=4):
    """
    Visualize the domain detection history across iterations.

    Prints a per-iteration summary (domain, scores rendered as star bars,
    analysis snippet) and, when more than one iteration carries scores,
    draws a line chart of every score metric over the iterations.

    Args:
        result: The result from run_domain_detector. Only the "history" key
            is read; it is expected to be a list of dict entries with the
            optional keys "iteration", "domain", "scores" and
            "analysis_head".
        max_score: Upper bound of the score scale, used for the star bars,
            the "(x/max)" suffix and the y-axis of the chart.

    Returns:
        None. Output is printed and, if applicable, shown as a figure.
    """
    if not result or "history" not in result or not result["history"]:
        print("No history data available")
        return

    history = result["history"]

    # Print a nice tabular summary
    print("\n=== DOMAIN DETECTION HISTORY ===")

    for index, entry in enumerate(history):
        iteration = entry.get("iteration", index)
        print(f"\n📊 ITERATION {iteration}")
        print(f"📌 Domain: {entry.get('domain', 'Unknown')}")

        # Format scores if they exist
        scores = entry.get("scores") or {}
        if scores:
            print("📈 Scores:")
            for score_name, score_value in scores.items():
                stars = _star_bar(score_value, max_score)
                print(f"   {str(score_name).ljust(15)}: {stars} ({score_value}/{max_score})")

        # Print analysis snippet
        analysis_head = entry.get("analysis_head", "")
        if analysis_head:
            print(f"📝 Analysis: {analysis_head}...")

    # Plot score evolution only if more than one iteration has scores.
    scored_entries = [
        (index, entry) for index, entry in enumerate(history) if entry.get("scores")
    ]
    if len(scored_entries) <= 1:
        return

    # Each metric keeps its own x values: a metric that is missing from some
    # iteration would otherwise end up with fewer y values than the shared
    # x axis, which makes the plot call fail.
    series = {}
    ticks = []
    for index, entry in scored_entries:
        # Same fallback as the printed summary: position within the history.
        iteration = entry.get("iteration", index)
        ticks.append(iteration)
        for score_name, score_value in entry["scores"].items():
            number = _score_as_number(score_value)
            if number is None:
                continue  # non-numeric scores cannot be plotted
            xs, ys = series.setdefault(score_name, ([], []))
            xs.append(iteration)
            ys.append(number)

    fig, ax = plt.subplots(figsize=(10, 6))

    # Plot each score metric
    for score_name, (xs, ys) in series.items():
        ax.plot(xs, ys, marker='o', label=score_name)

    ax.set_title("Evolution of Scores Across Iterations")
    ax.set_xlabel("Iteration")
    ax.set_ylabel(f"Score (0-{max_score})")
    ax.set_ylim(0, max_score + 0.5)
    ax.grid(True, linestyle='--', alpha=0.7)
    if series:
        ax.legend()
    ax.set_xticks(ticks)  # Set x-axis ticks to the plotted iterations
    # Lay out last so the final ticks and labels are taken into account.
    fig.tight_layout()
    plt.show()

def run_and_visualize(csv_path, max_cycles=5):
    """
    Run domain detector and visualize results with history.

    Calls run_domain_detector on the file, prints the iteration history via
    visualize_history, then prints the final domain, core concepts and the
    three analysis sections found under result["analysis"].

    Args:
        csv_path: Path to the CSV file
        max_cycles: Maximum number of improvement cycles

    Returns:
        The complete result from the domain detector, unchanged.
    """
    print(f"🚀 Running domain detector on {csv_path} with max_cycles={max_cycles}")

    # Run the domain detector (keyword form, matching the batch runner)
    result = run_domain_detector(csv_path, max_cycles=max_cycles)

    # Visualize history
    visualize_history(result)

    # Print final results. visualize_history already tolerates an empty
    # result, so apply the same tolerance here instead of failing on None.
    if result and "analysis" in result:
        analysis = result["analysis"] or {}
        print("\n=== FINAL RESULTS ===")
        print(f"✅ Domain: {analysis.get('domain', 'Unknown')}")
        # str() each concept so a non-string entry cannot break the join.
        concepts = analysis.get('core_concepts') or []
        print(f"✅ Core Concepts: {', '.join(map(str, concepts))}")

        if "analysis" in analysis:
            sections = analysis["analysis"] or {}
            for heading, key in (
                ("\n📊 DESCRIPTIVE ANALYSIS:", "descriptive"),
                ("\n🔮 PREDICTIVE ANALYSIS:", "predictive"),
                ("\n🌐 DOMAIN-RELATED ANALYSIS:", "domain_related"),
            ):
                print(heading)
                print(sections.get(key, ""))

    return result

### Running the Pipeline

In [6]:
# Column layout of the batch summary; also applied when no CSV was found so
# callers always receive the same schema.
_BATCH_RESULT_COLUMNS = [
    "filename",
    "domain",
    "core_concepts",
    "descriptive",
    "predictive",
    "domain_related",
    "final_scores",
    "num_iterations",
    "history",
]


def run_batch_analysis_from_folder(folder_path: str, max_cycles: int = 5) -> pd.DataFrame:
    """
    Run domain detector on all CSV files in a folder.

    Files matching ``*.csv`` directly inside ``folder_path`` are processed in
    sorted path order, one run_domain_detector call per file.

    Args:
        folder_path: Path to the folder containing CSV files
        max_cycles: Max improvement cycles per dataset

    Returns:
        DataFrame with one row per CSV file and the columns filename, domain,
        core_concepts, descriptive, predictive, domain_related, final_scores,
        num_iterations and history. The columns are present even when the
        folder contains no CSV file (the frame is then empty).
    """
    # glob returns matches in arbitrary order; sort for reproducible rows.
    csv_paths = sorted(glob(os.path.join(folder_path, "*.csv")))
    print(f"📁 Found {len(csv_paths)} CSV files in {folder_path}")

    all_results = []

    for path in csv_paths:
        filename = os.path.basename(path)
        print(f"\n📂 Analyzing: {filename}")
        # `or {}` keeps the extraction below working if the detector (or one
        # of the nested keys) yields None instead of a dict.
        result = run_domain_detector(path, max_cycles=max_cycles) or {}

        # Extract relevant fields safely
        analysis = result.get("analysis") or {}
        sections = analysis.get("analysis") or {}
        concepts = analysis.get("core_concepts") or []

        all_results.append({
            "filename": filename,
            "domain": analysis.get("domain", "Unknown"),
            # str() each concept so a non-string entry cannot break the join.
            "core_concepts": ", ".join(map(str, concepts)),
            "descriptive": sections.get("descriptive", ""),
            "predictive": sections.get("predictive", ""),
            "domain_related": sections.get("domain_related", ""),
            "final_scores": result.get("scores", {}),
            "num_iterations": result.get("iteration", 0),
            "history": result.get("history", []),
        })

    return pd.DataFrame(all_results, columns=_BATCH_RESULT_COLUMNS)

In [9]:
# Analyse every CSV under ../dataset, allowing up to 5 improvement cycles per file.
results_df = run_batch_analysis_from_folder(folder_path="../dataset", max_cycles=5)

# Persist the summary table in the working directory, without the row index.
results_df.to_csv(path_or_buf="batch_results.csv", index=False)

📁 Found 3 CSV files in ../dataset

📂 Analyzing: Finance_survey_data.csv
Successfully read CSV: 40 rows, 24 columns
Data profile built successfully
Starting analysis with max_cycles=5
domain_node {'profile': {'raw': {'n_rows': 40, 'n_cols': 24, 'columns': {'gender': {'dtype': 'object', 'unique_ratio': 0.05, 'sample': ['Female', 'Female', 'Male', 'Male', 'Female']}, 'age': {'dtype': 'int64', 'unique_ratio': 0.35, 'sample': ['34', '23', '30', '22', '24'], 'min': np.int64(21), 'max': np.int64(35), 'mean': np.float64(27.8), 'std': np.float64(3.5605)}, 'Investment_Avenues': {'dtype': 'object', 'unique_ratio': 0.05, 'sample': ['Yes', 'Yes', 'Yes', 'Yes', 'No']}, 'Mutual_Funds': {'dtype': 'int64', 'unique_ratio': 0.15, 'sample': ['1', '4', '3', '2', '2'], 'min': np.int64(1), 'max': np.int64(7), 'mean': np.float64(2.55), 'std': np.float64(1.1972)}, 'Equity_Market': {'dtype': 'int64', 'unique_ratio': 0.15, 'sample': ['2', '3', '6', '1', '1'], 'min': np.int64(1), 'max': np.int64(6), 'mean': np.fl

In [10]:
# Display the batch summary table (one row per analysed CSV file).
results_df

Unnamed: 0,filename,domain,core_concepts,descriptive,predictive,domain_related,final_scores,num_iterations,history
0,Finance_survey_data.csv,Personal Finance,"Budgeting Strategies, Investment Diversificati...",The dataset provides a comprehensive overview ...,Given the current investment preferences and f...,"In the context of personal finance, the data h...","{'correctness': 4, 'relevance': 3, 'coverage':...",4,"[{'iteration': 0, 'domain': 'Unknown', 'scores..."
1,flag-25-Expense Processing Dynamics Analysis .csv,Expense Management,"Expense Reporting Systems, Employee Reimbursem...",The dataset provides a comprehensive view of e...,Given the current trends in expense management...,"In the context of expense management, the data...","{'correctness': 4, 'relevance': 3, 'coverage':...",4,"[{'iteration': 0, 'domain': 'Unknown', 'scores..."
2,stimhartnow-detailed-customer-data-9.csv,Customer Relationship Management (CRM),"Customer Segmentation, Customer Lifetime Value...",The dataset provides a comprehensive view of c...,"Leveraging the dataset's attributes, predictiv...","In the CRM domain, understanding customer dyna...","{'correctness': 4, 'relevance': 3, 'coverage':...",4,"[{'iteration': 0, 'domain': 'Unknown', 'scores..."
