In [508]:
import pandas as pd
import os
import numpy as np

In [509]:
def calculate_average_cycle_time(file_path):
    """
    Reads a simulation log and calculates the average cycle time per case.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Warning: File not found at {file_path}")
        return None

    if df.empty:
        print(f"Warning: File is empty at {file_path}")
        return None

    # Ensure timestamp columns are in datetime format
    df['start_timestamp'] = pd.to_datetime(df['start_timestamp'], format='ISO8601', errors='coerce')
    df['end_timestamp'] = pd.to_datetime(df['end_timestamp'], format='ISO8601', errors='coerce')

    # Group by case_id to find the start of the first activity and end of the last
    case_times = df.groupby('case_id').agg(
        case_start=('start_timestamp', 'min'),
        case_end=('end_timestamp', 'max')
    )

    # Calculate cycle time for each case
    case_times['cycle_time'] = case_times['case_end'] - case_times['case_start']
    
    # Return the average cycle time for the entire log
    return case_times['cycle_time'].mean()

def calculate_case_cycle_times(df):
    """Returns a dictionary of {case_id: cycle_time} from the log."""
    df['start_timestamp'] = pd.to_datetime(df['start_timestamp'], format='ISO8601', errors='coerce')
    df['end_timestamp'] = pd.to_datetime(df['end_timestamp'], format='ISO8601', errors='coerce')

    case_times = {}
    for case_id, group in df.groupby('case_id'):
        start = group['start_timestamp'].min()
        end = group['end_timestamp'].max()
        if pd.notna(start) and pd.notna(end):
            case_times[case_id] = (end - start).total_seconds()
    return case_times



In [510]:
def comparison_across_logs():
    """
    Compares cycle times between baseline and optimized simulation logs.
    Returns overall improvements and a dict of case-level improvements.
    """
    log_name = 'LoanApp.csv' 
    
    base_path = os.path.join('..', '..', 'simulated_data', log_name)
    baseline_dir = os.path.join(base_path, 'baseline')
    optimized_dir = os.path.join(base_path, 'autonomous')

    if not os.path.exists(baseline_dir) or not os.path.exists(optimized_dir):
        print(f"Error: Make sure both '{baseline_dir}' and '{optimized_dir}' exist.")
        return

    try:
        num_simulations = len([f for f in os.listdir(baseline_dir) if f.startswith('simulated_log_')])
    except FileNotFoundError:
        print(f"Error: Baseline directory not found at {baseline_dir}")
        return
        
    print(f"Comparing {num_simulations} simulation logs from '{log_name}'...\n")

    all_improvements = []
    case_improvements = {}

    for i in range(num_simulations):
        baseline_file = os.path.join(baseline_dir, f'simulated_log_{i}.csv')
        optimized_file = os.path.join(optimized_dir, f'simulated_log_{i}.csv')

        if not os.path.exists(baseline_file) or not os.path.exists(optimized_file):
            print(f"--- Log {i} ---\n  Could not compare due to missing file(s).\n")
            continue

        try:
            df_base = pd.read_csv(baseline_file)
            df_opt = pd.read_csv(optimized_file)
        except Exception as e:
            print(f"--- Log {i} ---\n  Error reading files: {e}\n")
            continue

        baseline_cases = calculate_case_cycle_times(df_base)
        optimized_cases = calculate_case_cycle_times(df_opt)

        per_log_improvements = []
        for case_id in baseline_cases:
            if case_id in optimized_cases and baseline_cases[case_id] > 0:
                improvement = ((baseline_cases[case_id] - optimized_cases[case_id]) / baseline_cases[case_id]) * 100
                per_log_improvements.append(improvement)
                case_improvements[(i, case_id)] = improvement  # (log_number, case_id) => improvement

        if per_log_improvements:
            avg_baseline_time = np.mean(list(baseline_cases.values()))
            avg_optimized_time = np.mean(list(optimized_cases.values()))
            improvement_percent = ((avg_baseline_time - avg_optimized_time) / avg_baseline_time) * 100
            all_improvements.append(improvement_percent)

            print(f"--- Log {i} ---")
            print(f"  Baseline Avg. Cycle Time:  {avg_baseline_time}")
            print(f"  Optimized Avg. Cycle Time: {avg_optimized_time}")
            print(f"  Improvement: {improvement_percent:.2f}%\n")
        else:
            print(f"--- Log {i} ---\n  No valid case matches found.\n")

    if all_improvements:
        overall_avg_improvement = np.mean(all_improvements)
        print("="*40)
        print(f"Overall Average Improvement: {overall_avg_improvement:.2f}%")
        print("="*40)

    return all_improvements, case_improvements


In [511]:
def compararison_across_cases():
    """
    Aggregates and compares mean and median cycle times per case_id across all logs.
    Prints per-case statistics and overall average/median improvement.
    """
    log_name = 'LoanApp.csv'

    base_path = os.path.join('..', '..', 'simulated_data', log_name)
    baseline_dir = os.path.join(base_path, 'baseline')
    optimized_dir = os.path.join(base_path, 'autonomous')

    if not os.path.exists(baseline_dir) or not os.path.exists(optimized_dir):
        print(f"Error: Make sure both '{baseline_dir}' and '{optimized_dir}' exist.")
        return

    try:
        num_simulations = len([f for f in os.listdir(baseline_dir) if f.startswith('simulated_log_')])
    except FileNotFoundError:
        print(f"Error: Baseline directory not found at {baseline_dir}")
        return

    # Collect case times across all logs
    all_baseline = {}
    all_optimized = {}

    for i in range(num_simulations):
        baseline_file = os.path.join(baseline_dir, f'simulated_log_{i}.csv')
        optimized_file = os.path.join(optimized_dir, f'simulated_log_{i}.csv')

        if not os.path.exists(baseline_file) or not os.path.exists(optimized_file):
            continue

        try:
            df_base = pd.read_csv(baseline_file)
            df_opt = pd.read_csv(optimized_file)
        except:
            continue

        baseline_cases = calculate_case_cycle_times(df_base)
        optimized_cases = calculate_case_cycle_times(df_opt)

        for case_id, time in baseline_cases.items():
            all_baseline.setdefault(case_id, []).append(time)

        for case_id, time in optimized_cases.items():
            all_optimized.setdefault(case_id, []).append(time)

    # Now compare
    common_cases = set(all_baseline.keys()) & set(all_optimized.keys())

    print("Case-by-case aggregated stats:\n")

    mean_improvements = []
    median_improvements = []

    for case_id in sorted(common_cases):
        baseline_times = all_baseline[case_id]
        optimized_times = all_optimized[case_id]

        if not baseline_times or not optimized_times:
            continue

        mean_base = np.mean(baseline_times)
        mean_opt = np.mean(optimized_times)
        median_base = np.median(baseline_times)
        median_opt = np.median(optimized_times)

        if mean_base > 0:
            mean_improvement = ((mean_base - mean_opt) / mean_base) * 100
            mean_improvements.append(mean_improvement)
        else:
            mean_improvement = 0

        if median_base > 0:
            median_improvement = ((median_base - median_opt) / median_base) * 100
            median_improvements.append(median_improvement)
        else:
            median_improvement = 0

        print(f"Case ID: {case_id}")
        print(f"  Mean Baseline:  {mean_base:.2f}")
        print(f"  Mean Optimized: {mean_opt:.2f}")
        print(f"  Mean Improvement: {mean_improvement:.2f}%")
        print(f"  Median Baseline:  {median_base:.2f}")
        print(f"  Median Optimized: {median_opt:.2f}")
        print(f"  Median Improvement: {median_improvement:.2f}%\n")

    # Print overall stats
    if mean_improvements:
        overall_mean_improvement = np.mean(mean_improvements)
        overall_median_improvement = np.median(median_improvements)

        print("="*40)
        print(f"Overall Mean Improvement:   {overall_mean_improvement:.2f}%")
        print(f"Overall Median Improvement: {overall_median_improvement:.2f}%")
        print("="*40)


In [512]:
# --- CONFIGURATION ---
# Define the cost per hour for each resource.
# The script will assign a cost of 0 to any resource not listed here.
AGENT_COSTS = {
    "Clerk-000006": 90,
    "Clerk-000001": 30,
    "Applicant-000001": 0,
    "Clerk-000007": 30,
    "Clerk-000004": 90,
    "Clerk-000003": 60,
    "Clerk-000008": 30,
    "Senior Officer-000002": 150,
    "Appraiser-000002": 90,
    "AML Investigator-000002": 110,
    "Appraiser-000001": 90,
    "Loan Officer-000002": 95,
    "AML Investigator-000001": 110,
    "Loan Officer-000001": 95,
    "Loan Officer-000004": 105,
    "Clerk-000002": 30,
    "Loan Officer-000003": 105,
    "Senior Officer-000001": 150,
    "Clerk-000005": 90
    }


def calculate_case_costs(df: pd.DataFrame, agent_costs: dict) -> dict:
    """
    Reads a simulation log DataFrame and calculates the total agent cost for each case.
    
    Args:
        df: The log DataFrame.
        agent_costs: A dictionary mapping resource names to their cost per hour.

    Returns:
        A dictionary mapping each case_id to its total cost.
    """
    if df.empty:
        return {}
    
    # Ensure timestamp columns are in datetime format
    df['start_timestamp'] = pd.to_datetime(df['start_timestamp'], format='ISO8601', errors='coerce')
    df['end_timestamp'] = pd.to_datetime(df['end_timestamp'], format='ISO8601', errors='coerce')

    # Calculate the duration of each task in seconds
    df['duration_seconds'] = (df['end_timestamp'] - df['start_timestamp']).dt.total_seconds()

    # Map the cost per hour to each event, defaulting to 0 if not found
    df['cost_per_hour'] = df['resource'].map(agent_costs).fillna(0)

    # Calculate the cost for each individual task
    df['task_cost'] = (df['duration_seconds'] / 3600) * df['cost_per_hour']

    # Group by case_id and sum the costs to get the total cost per case
    case_total_costs = df.groupby('case_id')['task_cost'].sum()

    return case_total_costs.to_dict()


def compare_costs_across_logs():
    """
    Compares total agent costs between baseline and optimized simulation logs on a per-log basis.
    """
    log_name = 'LoanApp.csv' 
    
    # Adjust this path to match your project structure
    base_path = os.path.join('..', '..', 'simulated_data', log_name)
    baseline_dir = os.path.join(base_path, 'baseline')
    optimized_dir = os.path.join(base_path, 'autonomous') # Updated to point to your optimized logs

    if not os.path.exists(baseline_dir) or not os.path.exists(optimized_dir):
        print(f"Error: Make sure both '{baseline_dir}' and '{optimized_dir}' exist.")
        return

    try:
        num_simulations = len([f for f in os.listdir(baseline_dir) if f.startswith('simulated_log_')])
    except FileNotFoundError:
        print(f"Error: Baseline directory not found at {baseline_dir}")
        return
        
    print(f"Comparing costs for {num_simulations} simulation logs from '{log_name}'...\n")

    all_improvements = []

    for i in range(num_simulations):
        baseline_file = os.path.join(baseline_dir, f'simulated_log_{i}.csv')
        optimized_file = os.path.join(optimized_dir, f'simulated_log_{i}.csv')

        if not os.path.exists(baseline_file) or not os.path.exists(optimized_file):
            print(f"--- Log {i} ---\n  Could not compare due to missing file(s).\n")
            continue

        try:
            df_base = pd.read_csv(baseline_file)
            df_opt = pd.read_csv(optimized_file)
        except Exception as e:
            print(f"--- Log {i} ---\n  Error reading files: {e}\n")
            continue

        baseline_costs = calculate_case_costs(df_base, AGENT_COSTS)
        optimized_costs = calculate_case_costs(df_opt, AGENT_COSTS)
        
        # Filter for cases present in both logs
        common_case_ids = set(baseline_costs.keys()) & set(optimized_costs.keys())
        
        if not common_case_ids:
            print(f"--- Log {i} ---\n  No common cases found to compare.\n")
            continue

        avg_baseline_cost = np.mean([baseline_costs[cid] for cid in common_case_ids])
        avg_optimized_cost = np.mean([optimized_costs[cid] for cid in common_case_ids])
        
        if avg_baseline_cost > 0:
            improvement_percent = ((avg_baseline_cost - avg_optimized_cost) / avg_baseline_cost) * 100
            all_improvements.append(improvement_percent)

            print(f"--- Log {i} ---")
            print(f"  Baseline Avg. Case Cost:  ${avg_baseline_cost:.2f}")
            print(f"  Optimized Avg. Case Cost: ${avg_optimized_cost:.2f}")
            print(f"  Cost Reduction: {improvement_percent:.2f}%\n")
        else:
            print(f"--- Log {i} ---\n  Baseline cost is zero, cannot calculate improvement.\n")

    if all_improvements:
        overall_avg_improvement = np.mean(all_improvements)
        print("="*40)
        print(f"Overall Average Cost Reduction: {overall_avg_improvement:.2f}%")
        print("="*40)


def compare_costs_across_cases():
    """
    Aggregates and compares mean and median costs per case_id across all logs.
    """
    log_name = 'LoanApp.csv'

    base_path = os.path.join('..', '..', 'simulated_data', log_name)
    baseline_dir = os.path.join(base_path, 'baseline')
    optimized_dir = os.path.join(base_path, 'autonomous') # Updated path

    if not os.path.exists(baseline_dir) or not os.path.exists(optimized_dir):
        print(f"Error: Make sure both '{baseline_dir}' and '{optimized_dir}' exist.")
        return

    try:
        num_simulations = len([f for f in os.listdir(baseline_dir) if f.startswith('simulated_log_')])
    except FileNotFoundError:
        print(f"Error: Baseline directory not found at {baseline_dir}")
        return

    # Collect case costs across all logs
    all_baseline_costs = {}
    all_optimized_costs = {}

    for i in range(num_simulations):
        baseline_file = os.path.join(baseline_dir, f'simulated_log_{i}.csv')
        optimized_file = os.path.join(optimized_dir, f'simulated_log_{i}.csv')

        if not os.path.exists(baseline_file) or not os.path.exists(optimized_file): continue
        try:
            df_base = pd.read_csv(baseline_file)
            df_opt = pd.read_csv(optimized_file)
        except: continue

        baseline_cases = calculate_case_costs(df_base, AGENT_COSTS)
        optimized_cases = calculate_case_costs(df_opt, AGENT_COSTS)

        for case_id, cost in baseline_cases.items(): all_baseline_costs.setdefault(case_id, []).append(cost)
        for case_id, cost in optimized_cases.items(): all_optimized_costs.setdefault(case_id, []).append(cost)

    common_cases = set(all_baseline_costs.keys()) & set(all_optimized_costs.keys())
    print("Case-by-case aggregated cost stats:\n")

    mean_improvements = []
    median_improvements = []

    for case_id in sorted(common_cases):
        baseline_costs = all_baseline_costs[case_id]
        optimized_costs = all_optimized_costs[case_id]

        if not baseline_costs or not optimized_costs: continue

        mean_base = np.mean(baseline_costs)
        mean_opt = np.mean(optimized_costs)
        median_base = np.median(baseline_costs)
        median_opt = np.median(optimized_costs)

        mean_improvement = ((mean_base - mean_opt) / mean_base) * 100 if mean_base > 0 else 0
        median_improvement = ((median_base - median_opt) / median_base) * 100 if median_base > 0 else 0
        mean_improvements.append(mean_improvement)
        median_improvements.append(median_improvement)

        print(f"Case ID: {case_id}")
        print(f"  Mean Baseline Cost:  ${mean_base:.2f}")
        print(f"  Mean Optimized Cost: ${mean_opt:.2f}")
        print(f"  Mean Reduction: {mean_improvement:.2f}%")
        print(f"  Median Baseline Cost:  ${median_base:.2f}")
        print(f"  Median Optimized Cost: ${median_opt:.2f}")
        print(f"  Median Reduction: {median_improvement:.2f}%\n")

    if mean_improvements:
        overall_mean_improvement = np.mean(mean_improvements)
        overall_median_improvement = np.median(median_improvements)

        print("="*40)
        print(f"Overall Mean Cost Reduction:   {overall_mean_improvement:.2f}%")
        print(f"Overall Median Cost Reduction: {overall_median_improvement:.2f}%")
        print("="*40)

In [513]:
def add_activity_cost_column(logs_df: pd.DataFrame, agent_costs: dict) -> pd.DataFrame:
    """
    Enriches a log DataFrame with activity-level cost information.

    This function calculates the cost of each individual activity based on its
    duration and the hourly rate of the resource who performed it.

    Args:
        logs_df (pd.DataFrame): 
            The input DataFrame of simulation logs. It must contain the columns:
            'start_timestamp', 'end_timestamp', and 'resource'.
        
        agent_costs (dict): 
            A dictionary mapping resource names (str) to their cost per hour (float/int).

    Returns:
        pd.DataFrame: 
            A new DataFrame containing all original columns plus a new 
            'activity_cost' column. Returns an empty DataFrame if the input is empty.
    """
    if logs_df.empty:
        return logs_df.copy()

    # Create a copy to avoid modifying the original DataFrame in place
    df = logs_df.copy()

    # 1. Ensure timestamp columns are proper datetime objects
    df['start_timestamp'] = pd.to_datetime(df['start_timestamp'], format='ISO8601')
    df['end_timestamp'] = pd.to_datetime(df['end_timestamp'], format='ISO8601')

    # 2. Calculate the duration of each task in seconds
    df['duration_seconds'] = (df['end_timestamp'] - df['start_timestamp']).dt.total_seconds()

    # 3. Map the agent's cost per hour to each event row.
    #    .map() is a highly efficient way to do this.
    #    .fillna(0) ensures that any resource not in our agent_costs dict gets a cost of 0.
    df['cost_per_hour'] = df['resource'].map(agent_costs).fillna(0)

    # 4. Calculate the final cost for each activity
    #    (Duration in seconds / 3600 seconds per hour) * cost per hour
    df['activity_cost'] = (df['duration_seconds'] / 3600) * df['cost_per_hour']
    
    # Optional: You can drop the intermediate columns if you want a cleaner output
    # df = df.drop(columns=['duration_seconds', 'cost_per_hour'])

    return df

In [514]:
log_improvements, case_improvements = comparison_across_logs()

Comparing 10 simulation logs from 'LoanApp.csv'...

--- Log 0 ---
  Baseline Avg. Cycle Time:  50275.163768165
  Optimized Avg. Cycle Time: 43427.0523623
  Improvement: 13.62%

--- Log 1 ---
  Baseline Avg. Cycle Time:  56348.631561815
  Optimized Avg. Cycle Time: 49905.426406850005
  Improvement: 11.43%

--- Log 2 ---
  Baseline Avg. Cycle Time:  54611.17538516499
  Optimized Avg. Cycle Time: 52692.46970421501
  Improvement: 3.51%

--- Log 3 ---
  Baseline Avg. Cycle Time:  52161.806842785
  Optimized Avg. Cycle Time: 42229.66258994
  Improvement: 19.04%

--- Log 4 ---
  Baseline Avg. Cycle Time:  47820.89159405001
  Optimized Avg. Cycle Time: 40377.819716100006
  Improvement: 15.56%

--- Log 5 ---
  Baseline Avg. Cycle Time:  47887.91980021
  Optimized Avg. Cycle Time: 44609.299664130005
  Improvement: 6.85%

--- Log 6 ---
  Baseline Avg. Cycle Time:  52402.908601805
  Optimized Avg. Cycle Time: 40825.293545205
  Improvement: 22.09%

--- Log 7 ---
  Baseline Avg. Cycle Time:  55972.7

In [515]:
compararison_across_cases()

Case-by-case aggregated stats:

Case ID: 0
  Mean Baseline:  9998.62
  Mean Optimized: 8483.26
  Mean Improvement: 15.16%
  Median Baseline:  10295.61
  Median Optimized: 7013.87
  Median Improvement: 31.88%

Case ID: 1
  Mean Baseline:  10671.63
  Mean Optimized: 8095.43
  Mean Improvement: 24.14%
  Median Baseline:  10751.04
  Median Optimized: 8767.87
  Median Improvement: 18.45%

Case ID: 2
  Mean Baseline:  16381.52
  Mean Optimized: 9270.70
  Mean Improvement: 43.41%
  Median Baseline:  9643.76
  Median Optimized: 9625.00
  Median Improvement: 0.19%

Case ID: 3
  Mean Baseline:  7699.71
  Mean Optimized: 11331.30
  Mean Improvement: -47.17%
  Median Baseline:  7950.31
  Median Optimized: 10946.50
  Median Improvement: -37.69%

Case ID: 4
  Mean Baseline:  16440.36
  Mean Optimized: 10652.72
  Mean Improvement: 35.20%
  Median Baseline:  9095.94
  Median Optimized: 9059.36
  Median Improvement: 0.40%

Case ID: 5
  Mean Baseline:  17603.60
  Mean Optimized: 9578.14
  Mean Improveme

In [516]:
c = 0
tc = 0
for k,v in case_improvements.items():
    if v < 0:
        print(k,v)
        c=c+1
    tc = tc + 1
print(f"Total problematic cases {c}")
print(f"Total cases {tc}")


(0, 2) -73.65859282472478
(0, 3) -62.06380556591306
(0, 4) -142.5238444640616
(0, 5) -15.329876726745914
(0, 6) -7.67842260803399
(0, 7) -72.86502450509045
(0, 8) -1361.7046349748225
(0, 9) -928.8152319692738
(0, 10) -828.8260879481094
(0, 14) -1.0727054522686137
(0, 16) -39.48146639294299
(0, 18) -27.032818005075
(0, 25) -3883.459463336321
(0, 27) -2.3447385475862816
(0, 28) -5.348311957905844
(0, 30) -1.2028895094878982
(0, 31) -45.50651214112711
(0, 35) -32.48100682026206
(0, 40) -17.269655896066396
(0, 41) -0.13627231416419627
(0, 43) -1.2410814784860942
(0, 44) -1.626849973281382
(0, 45) -1.1380447285674773
(0, 49) -84.82406179363919
(0, 52) -37.99676633659143
(0, 54) -1154.1733505557484
(0, 56) -0.8779382459129762
(0, 58) -1.5567402795402223
(0, 59) -10.597193839604161
(0, 60) -4.594865102244207
(0, 61) -1.564688468829277
(0, 64) -18.78575087700633
(0, 66) -47.47788741849631
(0, 74) -3.7873173560858215
(0, 75) -1288.7697844881159
(0, 78) -0.7640885540243437
(0, 80) -4.23862250065

In [517]:
log_name = 'LoanApp.csv'

base_path = os.path.join('..', '..', 'simulated_data', log_name)
baseline_dir_path = os.path.join(base_path, 'baseline')
optimized_dir_path = os.path.join(base_path, 'autonomous') # Updated path

# Check if directories exist before running
if not os.path.exists(baseline_dir_path) or not os.path.exists(optimized_dir_path):
    print(f"FATAL ERROR: Could not find directories.")
    print(f"Checked for: '{baseline_dir_path}'")
    print(f"Checked for: '{optimized_dir_path}'")
    print("Please ensure the CONFIG section is correct and you have run the simulations.")
else:
    print("--- Analysis 1: Comparing Costs on a Per-Log Basis ---")
    compare_costs_across_logs()
        
    print("\n\n--- Analysis 2: Comparing Aggregated Costs on a Per-Case Basis ---")
    compare_costs_across_cases()

--- Analysis 1: Comparing Costs on a Per-Log Basis ---
Comparing costs for 10 simulation logs from 'LoanApp.csv'...

--- Log 0 ---
  Baseline Avg. Case Cost:  $1012.94
  Optimized Avg. Case Cost: $905.42
  Cost Reduction: 10.61%

--- Log 1 ---
  Baseline Avg. Case Cost:  $1081.75
  Optimized Avg. Case Cost: $1125.23
  Cost Reduction: -4.02%

--- Log 2 ---
  Baseline Avg. Case Cost:  $968.88
  Optimized Avg. Case Cost: $1205.72
  Cost Reduction: -24.44%

--- Log 3 ---
  Baseline Avg. Case Cost:  $1199.75
  Optimized Avg. Case Cost: $1072.75
  Cost Reduction: 10.59%

--- Log 4 ---
  Baseline Avg. Case Cost:  $923.37
  Optimized Avg. Case Cost: $958.81
  Cost Reduction: -3.84%

--- Log 5 ---
  Baseline Avg. Case Cost:  $1142.44
  Optimized Avg. Case Cost: $1060.85
  Cost Reduction: 7.14%

--- Log 6 ---
  Baseline Avg. Case Cost:  $1151.10
  Optimized Avg. Case Cost: $985.40
  Cost Reduction: 14.39%

--- Log 7 ---
  Baseline Avg. Case Cost:  $1049.25
  Optimized Avg. Case Cost: $1093.84
  

In [518]:
def calculate_total_wait_time(df: pd.DataFrame) -> float:
    """
    Calculates the total waiting time for all cases in a log file.

    Wait time is defined as the time between the end of the previous activity
    and the start of the current activity within the same case.

    Args:
        df (pd.DataFrame): The log DataFrame.

    Returns:
        float: The total waiting time in seconds for the entire log.
    """
    if df.empty:
        return 0.0

    # Ensure timestamp columns are in datetime format
    try:
        df['start_timestamp'] = pd.to_datetime(df['start_timestamp'], format='ISO8601')
        df['end_timestamp'] = pd.to_datetime(df['end_timestamp'], format='ISO8601')
    except Exception as e:
        print(f"  Warning: Could not parse timestamps. Error: {e}")
        return 0.0

    # Sort events to ensure correct chronological order within each case
    df = df.sort_values(by=['case_id', 'start_timestamp'])

    # Get the end time of the *previous* activity within the same case
    df['previous_end_timestamp'] = df.groupby('case_id')['end_timestamp'].shift(1)

    # The first activity in a case has no previous end time, so fill with its own start time
    df['previous_end_timestamp'].fillna(df['start_timestamp'], inplace=True)

    # Calculate wait time for each activity
    df['wait_time_seconds'] = (df['start_timestamp'] - df['previous_end_timestamp']).dt.total_seconds()
    
    # Ensure wait times are not negative (can happen with parallel activities)
    df['wait_time_seconds'] = df['wait_time_seconds'].clip(lower=0)
    
    # Return the sum of all wait times
    return df['wait_time_seconds'].sum()


# ==============================================================================
# --- NEW BENCHMARKING FUNCTION ---
# ==============================================================================

def compare_wait_times_across_logs(baseline_dir, optimized_dir, LOG_NAME):
    """
    Compares total wait times between baseline and optimized simulation logs.
    """
    print(f"Comparing total wait times for {LOG_NAME}...\n")
    all_improvements = []

    try:
        baseline_files = [f for f in os.listdir(baseline_dir) if f.startswith('simulated_log_')]
        num_simulations = len(baseline_files)
    except FileNotFoundError:
        print(f"Error: Baseline directory not found at {baseline_dir}")
        return

    for i in range(num_simulations):
        baseline_file = os.path.join(baseline_dir, f'simulated_log_{i}.csv')
        optimized_file = os.path.join(optimized_dir, f'simulated_log_{i}.csv')

        if not os.path.exists(baseline_file) or not os.path.exists(optimized_file):
            print(f"--- Log {i} ---\n  Could not compare due to missing file(s).\n")
            continue

        try:
            df_base = pd.read_csv(baseline_file)
            df_opt = pd.read_csv(optimized_file)
        except Exception as e:
            print(f"--- Log {i} ---\n  Error reading files: {e}\n")
            continue

        total_wait_base = calculate_total_wait_time(df_base)
        total_wait_opt = calculate_total_wait_time(df_opt)

        if total_wait_base > 0:
            # Improvement is a REDUCTION in wait time
            improvement_percent = ((total_wait_base - total_wait_opt) / total_wait_base) * 100
            all_improvements.append(improvement_percent)

            print(f"--- Log {i} ---")
            print(f"  Baseline Total Wait Time:  {total_wait_base / 3600:.2f} hours")
            print(f"  Optimized Total Wait Time: {total_wait_opt / 3600:.2f} hours")
            print(f"  Wait Time Reduction: {improvement_percent:.2f}%\n")
        else:
            print(f"--- Log {i} ---\n  Baseline wait time is zero, cannot calculate improvement.\n")

    if all_improvements:
        overall_avg_improvement = np.mean(all_improvements)
        print("="*40)
        print(f"Overall Average Wait Time Reduction: {overall_avg_improvement:.2f}%")
        print("="*40)

# ==============================================================================
# --- MAIN EXECUTION BLOCK ---
# ==============================================================================

In [519]:
log_name = 'LoanApp.csv' 
base_path = os.path.join('..', '..', 'simulated_data', log_name)
baseline_dir_path = os.path.join(base_path, 'baseline')
optimized_dir_path = os.path.join(base_path, 'autonomous')


    
base_path = os.path.join('..', '..', 'simulated_data', log_name)
optimized_dir = os.path.join(base_path, 'autonomous')

    # Check if directories exist before running
if not os.path.exists(baseline_dir_path) or not os.path.exists(optimized_dir_path):
    print(f"FATAL ERROR: Could not find directories.")
    print(f"Checked for: '{baseline_dir_path}'")
    print(f"Checked for: '{optimized_dir_path}'")
    print("Please ensure the CONFIG section is correct and you have run the simulations.")
else:
    compare_wait_times_across_logs(baseline_dir_path, optimized_dir_path, 'LoanApp.csv')

Comparing total wait times for LoanApp.csv...

--- Log 0 ---
  Baseline Total Wait Time:  717.40 hours
  Optimized Total Wait Time: 563.02 hours
  Wait Time Reduction: 21.52%

--- Log 1 ---
  Baseline Total Wait Time:  818.01 hours
  Optimized Total Wait Time: 579.25 hours
  Wait Time Reduction: 29.19%

--- Log 2 ---
  Baseline Total Wait Time:  1012.35 hours
  Optimized Total Wait Time: 536.74 hours
  Wait Time Reduction: 46.98%

--- Log 3 ---
  Baseline Total Wait Time:  470.36 hours
  Optimized Total Wait Time: 212.50 hours
  Wait Time Reduction: 54.82%

--- Log 4 ---
  Baseline Total Wait Time:  601.06 hours
  Optimized Total Wait Time: 355.53 hours
  Wait Time Reduction: 40.85%

--- Log 5 ---
  Baseline Total Wait Time:  295.17 hours
  Optimized Total Wait Time: 288.36 hours
  Wait Time Reduction: 2.31%

--- Log 6 ---
  Baseline Total Wait Time:  494.56 hours
  Optimized Total Wait Time: 308.85 hours
  Wait Time Reduction: 37.55%

--- Log 7 ---
  Baseline Total Wait Time:  899.53 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['previous_end_timestamp'].fillna(df['start_timestamp'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['previous_end_timestamp'].fillna(df['start_timestamp'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because th

In [520]:
log_name = 'LoanApp.csv' 
    
base_path = os.path.join('..', '..', 'simulated_data', log_name)
optimized_dir = os.path.join(base_path, 'autonomous')


optimized_logs= []
for i in range(10):
    optimized_file = os.path.join(optimized_dir, f'simulated_log_{i}.csv')
    optimized_df = pd.read_csv(optimized_file)
    optimized_logs.append(optimized_df)

baseline_dir = os.path.join(base_path, 'baseline')


baseline_logs= []
for i in range(10):
    baseline_file = os.path.join(baseline_dir, f'simulated_log_{i}.csv')
    baseline_df = pd.read_csv(baseline_file)
    baseline_logs.append(baseline_df)

In [521]:
optimized_logs_0_cost = add_activity_cost_column(optimized_logs[0], AGENT_COSTS)
baseline_logs_0_cost = add_activity_cost_column(baseline_logs[0], AGENT_COSTS)

In [506]:
optimized_logs_0_cost[optimized_logs_0_cost['case_id']==50]

Unnamed: 0,case_id,agent,activity_name,start_timestamp,end_timestamp,TimeStep,resource,duration_seconds,cost_per_hour,activity_cost
346,50,9,Check application form completeness,2023-04-25 09:00:00+00:00,2023-04-25 09:03:54.726464335+00:00,384,Clerk-000006,234.726464,90,5.868162
348,50,3,Appraise property,2023-04-25 09:03:54.726464335+00:00,2023-04-25 09:25:32.309296008+00:00,386,Appraiser-000001,1297.582832,90,32.439571
352,50,0,Check credit history,2023-04-25 09:25:32.309296008+00:00,2023-04-25 09:26:02.540782713+00:00,390,Clerk-000001,30.231487,30,0.251929
357,50,7,AML check,2023-04-25 09:26:02.540782713+00:00,2023-04-25 09:39:15.453701600+00:00,395,AML Investigator-000002,792.912919,110,24.227895
363,50,13,Assess loan risk,2023-04-25 09:39:15.453701600+00:00,2023-04-25 09:59:15.453701600+00:00,401,Loan Officer-000002,1200.0,95,31.666667
368,50,13,Design loan offer,2023-04-25 09:59:15.453701600+00:00,2023-04-25 10:03:54.191550507+00:00,406,Loan Officer-000002,278.737849,95,7.355582
378,50,14,Approve loan offer,2023-04-25 10:03:54.191550507+00:00,2023-04-25 10:23:54.191550507+00:00,416,Senior Officer-000001,1200.0,150,50.0
385,50,10,Cancel application,2023-04-25 10:23:54.191550507+00:00,2023-04-25 10:28:54.191550507+00:00,423,Clerk-000007,300.0,30,2.5


In [507]:
baseline_logs_0_cost[baseline_logs_0_cost['case_id']==194]

Unnamed: 0,case_id,agent,activity_name,start_timestamp,end_timestamp,TimeStep,resource,duration_seconds,cost_per_hour,activity_cost
1429,194,4,Check application form completeness,2023-05-08 09:00:00+00:00,2023-05-08 09:02:46.537850782+00:00,1512,Clerk-000003,166.537851,60,2.775631
1432,194,7,AML check,2023-05-08 09:02:46.537850782+00:00,2023-05-08 09:04:51.624568721+00:00,1515,AML Investigator-000002,125.086718,110,3.822094
1438,194,0,Check credit history,2023-05-08 09:04:51.624568721+00:00,2023-05-08 09:25:23.351810765+00:00,1521,Clerk-000001,1231.727242,30,10.264394
1443,194,6,Appraise property,2023-05-08 10:11:34.354388136+00:00,2023-05-08 11:09:57.530088552+00:00,1526,Appraiser-000002,3503.1757,90,87.579393
1453,194,11,Assess loan risk,2023-05-08 11:09:57.530088552+00:00,2023-05-08 11:29:57.530088552+00:00,1536,Loan Officer-000001,1200.0,95,31.666667
1457,194,13,Design loan offer,2023-05-08 11:29:57.530088552+00:00,2023-05-08 11:30:28.916392668+00:00,1540,Loan Officer-000002,31.386304,95,0.82825
1460,194,17,Approve loan offer,2023-05-08 11:30:28.916392668+00:00,2023-05-08 11:50:28.916392668+00:00,1544,Senior Officer-000002,1200.0,150,50.0
1465,194,1,Approve application,2023-05-08 11:50:28.916392668+00:00,2023-05-08 11:55:28.916392668+00:00,1551,Clerk-000002,300.0,30,2.5
