In [70]:
import pandas as pd
import os
import numpy as np

In [71]:
def calculate_average_cycle_time(file_path):
    """
    Reads a simulation log and calculates the average cycle time per case.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Warning: File not found at {file_path}")
        return None

    if df.empty:
        print(f"Warning: File is empty at {file_path}")
        return None

    # Ensure timestamp columns are in datetime format
    df['start_timestamp'] = pd.to_datetime(df['start_timestamp'], format='ISO8601', errors='coerce')
    df['end_timestamp'] = pd.to_datetime(df['end_timestamp'], format='ISO8601', errors='coerce')

    # Group by case_id to find the start of the first activity and end of the last
    case_times = df.groupby('case_id').agg(
        case_start=('start_timestamp', 'min'),
        case_end=('end_timestamp', 'max')
    )

    # Calculate cycle time for each case
    case_times['cycle_time'] = case_times['case_end'] - case_times['case_start']
    
    # Return the average cycle time for the entire log
    return case_times['cycle_time'].mean()

def calculate_case_cycle_times(df):
    """Returns a dictionary of {case_id: cycle_time} from the log."""
    df['start_timestamp'] = pd.to_datetime(df['start_timestamp'], format='ISO8601', errors='coerce')
    df['end_timestamp'] = pd.to_datetime(df['end_timestamp'], format='ISO8601', errors='coerce')

    case_times = {}
    for case_id, group in df.groupby('case_id'):
        start = group['start_timestamp'].min()
        end = group['end_timestamp'].max()
        if pd.notna(start) and pd.notna(end):
            case_times[case_id] = (end - start).total_seconds()
    return case_times



In [72]:
def comparison_across_logs():
    """
    Compares cycle times between baseline and optimized simulation logs.
    Returns overall improvements and a dict of case-level improvements.
    """
    log_name = 'LoanApp.csv' 
    
    base_path = os.path.join('..', '..', 'simulated_data', log_name)
    baseline_dir = os.path.join(base_path, 'baseline')
    optimized_dir = os.path.join(base_path, 'autonomous')

    if not os.path.exists(baseline_dir) or not os.path.exists(optimized_dir):
        print(f"Error: Make sure both '{baseline_dir}' and '{optimized_dir}' exist.")
        return

    try:
        num_simulations = len([f for f in os.listdir(baseline_dir) if f.startswith('simulated_log_')])
    except FileNotFoundError:
        print(f"Error: Baseline directory not found at {baseline_dir}")
        return
        
    print(f"Comparing {num_simulations} simulation logs from '{log_name}'...\n")

    all_improvements = []
    case_improvements = {}

    for i in range(num_simulations):
        baseline_file = os.path.join(baseline_dir, f'simulated_log_{i}.csv')
        optimized_file = os.path.join(optimized_dir, f'simulated_log_{i}.csv')

        if not os.path.exists(baseline_file) or not os.path.exists(optimized_file):
            print(f"--- Log {i} ---\n  Could not compare due to missing file(s).\n")
            continue

        try:
            df_base = pd.read_csv(baseline_file)
            df_opt = pd.read_csv(optimized_file)
        except Exception as e:
            print(f"--- Log {i} ---\n  Error reading files: {e}\n")
            continue

        baseline_cases = calculate_case_cycle_times(df_base)
        optimized_cases = calculate_case_cycle_times(df_opt)

        per_log_improvements = []
        for case_id in baseline_cases:
            if case_id in optimized_cases and baseline_cases[case_id] > 0:
                improvement = ((baseline_cases[case_id] - optimized_cases[case_id]) / baseline_cases[case_id]) * 100
                per_log_improvements.append(improvement)
                case_improvements[(i, case_id)] = improvement  # (log_number, case_id) => improvement

        if per_log_improvements:
            avg_baseline_time = np.mean(list(baseline_cases.values()))
            avg_optimized_time = np.mean(list(optimized_cases.values()))
            improvement_percent = ((avg_baseline_time - avg_optimized_time) / avg_baseline_time) * 100
            all_improvements.append(improvement_percent)

            print(f"--- Log {i} ---")
            print(f"  Baseline Avg. Cycle Time:  {avg_baseline_time}")
            print(f"  Optimized Avg. Cycle Time: {avg_optimized_time}")
            print(f"  Improvement: {improvement_percent:.2f}%\n")
        else:
            print(f"--- Log {i} ---\n  No valid case matches found.\n")

    if all_improvements:
        overall_avg_improvement = np.mean(all_improvements)
        print("="*40)
        print(f"Overall Average Improvement: {overall_avg_improvement:.2f}%")
        print("="*40)

    return all_improvements, case_improvements


In [73]:
def compararison_across_cases():
    """
    Aggregates and compares mean and median cycle times per case_id across all logs.
    Prints per-case statistics and overall average/median improvement.
    """
    log_name = 'LoanApp.csv'

    base_path = os.path.join('..', '..', 'simulated_data', log_name)
    baseline_dir = os.path.join(base_path, 'baseline')
    optimized_dir = os.path.join(base_path, 'autonomous')

    if not os.path.exists(baseline_dir) or not os.path.exists(optimized_dir):
        print(f"Error: Make sure both '{baseline_dir}' and '{optimized_dir}' exist.")
        return

    try:
        num_simulations = len([f for f in os.listdir(baseline_dir) if f.startswith('simulated_log_')])
    except FileNotFoundError:
        print(f"Error: Baseline directory not found at {baseline_dir}")
        return

    # Collect case times across all logs
    all_baseline = {}
    all_optimized = {}

    for i in range(num_simulations):
        baseline_file = os.path.join(baseline_dir, f'simulated_log_{i}.csv')
        optimized_file = os.path.join(optimized_dir, f'simulated_log_{i}.csv')

        if not os.path.exists(baseline_file) or not os.path.exists(optimized_file):
            continue

        try:
            df_base = pd.read_csv(baseline_file)
            df_opt = pd.read_csv(optimized_file)
        except:
            continue

        baseline_cases = calculate_case_cycle_times(df_base)
        optimized_cases = calculate_case_cycle_times(df_opt)

        for case_id, time in baseline_cases.items():
            all_baseline.setdefault(case_id, []).append(time)

        for case_id, time in optimized_cases.items():
            all_optimized.setdefault(case_id, []).append(time)

    # Now compare
    common_cases = set(all_baseline.keys()) & set(all_optimized.keys())

    print("Case-by-case aggregated stats:\n")

    mean_improvements = []
    median_improvements = []

    for case_id in sorted(common_cases):
        baseline_times = all_baseline[case_id]
        optimized_times = all_optimized[case_id]

        if not baseline_times or not optimized_times:
            continue

        mean_base = np.mean(baseline_times)
        mean_opt = np.mean(optimized_times)
        median_base = np.median(baseline_times)
        median_opt = np.median(optimized_times)

        if mean_base > 0:
            mean_improvement = ((mean_base - mean_opt) / mean_base) * 100
            mean_improvements.append(mean_improvement)
        else:
            mean_improvement = 0

        if median_base > 0:
            median_improvement = ((median_base - median_opt) / median_base) * 100
            median_improvements.append(median_improvement)
        else:
            median_improvement = 0

        print(f"Case ID: {case_id}")
        print(f"  Mean Baseline:  {mean_base:.2f}")
        print(f"  Mean Optimized: {mean_opt:.2f}")
        print(f"  Mean Improvement: {mean_improvement:.2f}%")
        print(f"  Median Baseline:  {median_base:.2f}")
        print(f"  Median Optimized: {median_opt:.2f}")
        print(f"  Median Improvement: {median_improvement:.2f}%\n")

    # Print overall stats
    if mean_improvements:
        overall_mean_improvement = np.mean(mean_improvements)
        overall_median_improvement = np.median(median_improvements)

        print("="*40)
        print(f"Overall Mean Improvement:   {overall_mean_improvement:.2f}%")
        print(f"Overall Median Improvement: {overall_median_improvement:.2f}%")
        print("="*40)


In [74]:
# --- CONFIGURATION ---
# Define the cost per hour for each resource.
# The script will assign a cost of 0 to any resource not listed here.
AGENT_COSTS = {
    "Clerk-000006": 90,
    "Clerk-000001": 30,
    "Applicant-000001": 0,
    "Clerk-000007": 30,
    "Clerk-000004": 90,
    "Clerk-000003": 60,
    "Clerk-000008": 30,
    "Senior Officer-000002": 150,
    "Appraiser-000002": 90,
    "AML Investigator-000002": 110,
    "Appraiser-000001": 90,
    "Loan Officer-000002": 95,
    "AML Investigator-000001": 110,
    "Loan Officer-000001": 95,
    "Loan Officer-000004": 105,
    "Clerk-000002": 30,
    "Loan Officer-000003": 105,
    "Senior Officer-000001": 150,
    "Clerk-000005": 90
    }


def calculate_case_costs(df: pd.DataFrame, agent_costs: dict) -> dict:
    """
    Reads a simulation log DataFrame and calculates the total agent cost for each case.
    
    Args:
        df: The log DataFrame.
        agent_costs: A dictionary mapping resource names to their cost per hour.

    Returns:
        A dictionary mapping each case_id to its total cost.
    """
    if df.empty:
        return {}
    
    # Ensure timestamp columns are in datetime format
    df['start_timestamp'] = pd.to_datetime(df['start_timestamp'], format='ISO8601', errors='coerce')
    df['end_timestamp'] = pd.to_datetime(df['end_timestamp'], format='ISO8601', errors='coerce')

    # Calculate the duration of each task in seconds
    df['duration_seconds'] = (df['end_timestamp'] - df['start_timestamp']).dt.total_seconds()

    # Map the cost per hour to each event, defaulting to 0 if not found
    df['cost_per_hour'] = df['resource'].map(agent_costs).fillna(0)

    # Calculate the cost for each individual task
    df['task_cost'] = (df['duration_seconds'] / 3600) * df['cost_per_hour']

    # Group by case_id and sum the costs to get the total cost per case
    case_total_costs = df.groupby('case_id')['task_cost'].sum()

    return case_total_costs.to_dict()


def compare_costs_across_logs():
    """
    Compares total agent costs between baseline and optimized simulation logs on a per-log basis.
    """
    log_name = 'LoanApp.csv' 
    
    # Adjust this path to match your project structure
    base_path = os.path.join('..', '..', 'simulated_data', log_name)
    baseline_dir = os.path.join(base_path, 'baseline')
    optimized_dir = os.path.join(base_path, 'autonomous') # Updated to point to your optimized logs

    if not os.path.exists(baseline_dir) or not os.path.exists(optimized_dir):
        print(f"Error: Make sure both '{baseline_dir}' and '{optimized_dir}' exist.")
        return

    try:
        num_simulations = len([f for f in os.listdir(baseline_dir) if f.startswith('simulated_log_')])
    except FileNotFoundError:
        print(f"Error: Baseline directory not found at {baseline_dir}")
        return
        
    print(f"Comparing costs for {num_simulations} simulation logs from '{log_name}'...\n")

    all_improvements = []

    for i in range(num_simulations):
        baseline_file = os.path.join(baseline_dir, f'simulated_log_{i}.csv')
        optimized_file = os.path.join(optimized_dir, f'simulated_log_{i}.csv')

        if not os.path.exists(baseline_file) or not os.path.exists(optimized_file):
            print(f"--- Log {i} ---\n  Could not compare due to missing file(s).\n")
            continue

        try:
            df_base = pd.read_csv(baseline_file)
            df_opt = pd.read_csv(optimized_file)
        except Exception as e:
            print(f"--- Log {i} ---\n  Error reading files: {e}\n")
            continue

        baseline_costs = calculate_case_costs(df_base, AGENT_COSTS)
        optimized_costs = calculate_case_costs(df_opt, AGENT_COSTS)
        
        # Filter for cases present in both logs
        common_case_ids = set(baseline_costs.keys()) & set(optimized_costs.keys())
        
        if not common_case_ids:
            print(f"--- Log {i} ---\n  No common cases found to compare.\n")
            continue

        avg_baseline_cost = np.mean([baseline_costs[cid] for cid in common_case_ids])
        avg_optimized_cost = np.mean([optimized_costs[cid] for cid in common_case_ids])
        
        if avg_baseline_cost > 0:
            improvement_percent = ((avg_baseline_cost - avg_optimized_cost) / avg_baseline_cost) * 100
            all_improvements.append(improvement_percent)

            print(f"--- Log {i} ---")
            print(f"  Baseline Avg. Case Cost:  ${avg_baseline_cost:.2f}")
            print(f"  Optimized Avg. Case Cost: ${avg_optimized_cost:.2f}")
            print(f"  Cost Reduction: {improvement_percent:.2f}%\n")
        else:
            print(f"--- Log {i} ---\n  Baseline cost is zero, cannot calculate improvement.\n")

    if all_improvements:
        overall_avg_improvement = np.mean(all_improvements)
        print("="*40)
        print(f"Overall Average Cost Reduction: {overall_avg_improvement:.2f}%")
        print("="*40)


def compare_costs_across_cases():
    """
    Aggregates and compares mean and median costs per case_id across all logs.
    """
    log_name = 'LoanApp.csv'

    base_path = os.path.join('..', '..', 'simulated_data', log_name)
    baseline_dir = os.path.join(base_path, 'baseline')
    optimized_dir = os.path.join(base_path, 'autonomous') # Updated path

    if not os.path.exists(baseline_dir) or not os.path.exists(optimized_dir):
        print(f"Error: Make sure both '{baseline_dir}' and '{optimized_dir}' exist.")
        return

    try:
        num_simulations = len([f for f in os.listdir(baseline_dir) if f.startswith('simulated_log_')])
    except FileNotFoundError:
        print(f"Error: Baseline directory not found at {baseline_dir}")
        return

    # Collect case costs across all logs
    all_baseline_costs = {}
    all_optimized_costs = {}

    for i in range(num_simulations):
        baseline_file = os.path.join(baseline_dir, f'simulated_log_{i}.csv')
        optimized_file = os.path.join(optimized_dir, f'simulated_log_{i}.csv')

        if not os.path.exists(baseline_file) or not os.path.exists(optimized_file): continue
        try:
            df_base = pd.read_csv(baseline_file)
            df_opt = pd.read_csv(optimized_file)
        except: continue

        baseline_cases = calculate_case_costs(df_base, AGENT_COSTS)
        optimized_cases = calculate_case_costs(df_opt, AGENT_COSTS)

        for case_id, cost in baseline_cases.items(): all_baseline_costs.setdefault(case_id, []).append(cost)
        for case_id, cost in optimized_cases.items(): all_optimized_costs.setdefault(case_id, []).append(cost)

    common_cases = set(all_baseline_costs.keys()) & set(all_optimized_costs.keys())
    print("Case-by-case aggregated cost stats:\n")

    mean_improvements = []
    median_improvements = []

    for case_id in sorted(common_cases):
        baseline_costs = all_baseline_costs[case_id]
        optimized_costs = all_optimized_costs[case_id]

        if not baseline_costs or not optimized_costs: continue

        mean_base = np.mean(baseline_costs)
        mean_opt = np.mean(optimized_costs)
        median_base = np.median(baseline_costs)
        median_opt = np.median(optimized_costs)

        mean_improvement = ((mean_base - mean_opt) / mean_base) * 100 if mean_base > 0 else 0
        median_improvement = ((median_base - median_opt) / median_base) * 100 if median_base > 0 else 0
        mean_improvements.append(mean_improvement)
        median_improvements.append(median_improvement)

        print(f"Case ID: {case_id}")
        print(f"  Mean Baseline Cost:  ${mean_base:.2f}")
        print(f"  Mean Optimized Cost: ${mean_opt:.2f}")
        print(f"  Mean Reduction: {mean_improvement:.2f}%")
        print(f"  Median Baseline Cost:  ${median_base:.2f}")
        print(f"  Median Optimized Cost: ${median_opt:.2f}")
        print(f"  Median Reduction: {median_improvement:.2f}%\n")

    if mean_improvements:
        overall_mean_improvement = np.mean(mean_improvements)
        overall_median_improvement = np.median(median_improvements)

        print("="*40)
        print(f"Overall Mean Cost Reduction:   {overall_mean_improvement:.2f}%")
        print(f"Overall Median Cost Reduction: {overall_median_improvement:.2f}%")
        print("="*40)

In [75]:
def add_activity_cost_column(logs_df: pd.DataFrame, agent_costs: dict) -> pd.DataFrame:
    """
    Enriches a log DataFrame with activity-level cost information.

    This function calculates the cost of each individual activity based on its
    duration and the hourly rate of the resource who performed it.

    Args:
        logs_df (pd.DataFrame): 
            The input DataFrame of simulation logs. It must contain the columns:
            'start_timestamp', 'end_timestamp', and 'resource'.
        
        agent_costs (dict): 
            A dictionary mapping resource names (str) to their cost per hour (float/int).

    Returns:
        pd.DataFrame: 
            A new DataFrame containing all original columns plus a new 
            'activity_cost' column. Returns an empty DataFrame if the input is empty.
    """
    if logs_df.empty:
        return logs_df.copy()

    # Create a copy to avoid modifying the original DataFrame in place
    df = logs_df.copy()

    # 1. Ensure timestamp columns are proper datetime objects
    df['start_timestamp'] = pd.to_datetime(df['start_timestamp'], format='ISO8601')
    df['end_timestamp'] = pd.to_datetime(df['end_timestamp'], format='ISO8601')

    # 2. Calculate the duration of each task in seconds
    df['duration_seconds'] = (df['end_timestamp'] - df['start_timestamp']).dt.total_seconds()

    # 3. Map the agent's cost per hour to each event row.
    #    .map() is a highly efficient way to do this.
    #    .fillna(0) ensures that any resource not in our agent_costs dict gets a cost of 0.
    df['cost_per_hour'] = df['resource'].map(agent_costs).fillna(0)

    # 4. Calculate the final cost for each activity
    #    (Duration in seconds / 3600 seconds per hour) * cost per hour
    df['activity_cost'] = (df['duration_seconds'] / 3600) * df['cost_per_hour']
    
    # Optional: You can drop the intermediate columns if you want a cleaner output
    # df = df.drop(columns=['duration_seconds', 'cost_per_hour'])

    return df

In [76]:
log_improvements, case_improvements = comparison_across_logs()

Comparing 10 simulation logs from 'LoanApp.csv'...

--- Log 0 ---
  Baseline Avg. Cycle Time:  50275.163768165
  Optimized Avg. Cycle Time: 43461.497169015
  Improvement: 13.55%

--- Log 1 ---
  Baseline Avg. Cycle Time:  56348.631561815
  Optimized Avg. Cycle Time: 45204.58302051001
  Improvement: 19.78%

--- Log 2 ---
  Baseline Avg. Cycle Time:  54611.17538516499
  Optimized Avg. Cycle Time: 52252.299407725
  Improvement: 4.32%

--- Log 3 ---
  Baseline Avg. Cycle Time:  52161.806842785
  Optimized Avg. Cycle Time: 47716.723373835
  Improvement: 8.52%

--- Log 4 ---
  Baseline Avg. Cycle Time:  47820.89159405001
  Optimized Avg. Cycle Time: 49535.708788134994
  Improvement: -3.59%

--- Log 5 ---
  Baseline Avg. Cycle Time:  47887.91980021
  Optimized Avg. Cycle Time: 59425.925722969994
  Improvement: -24.09%

--- Log 6 ---
  Baseline Avg. Cycle Time:  52402.908601805
  Optimized Avg. Cycle Time: 45991.507329604996
  Improvement: 12.23%

--- Log 7 ---
  Baseline Avg. Cycle Time:  559

In [77]:
compararison_across_cases()

Case-by-case aggregated stats:

Case ID: 0
  Mean Baseline:  9998.62
  Mean Optimized: 10201.42
  Mean Improvement: -2.03%
  Median Baseline:  10295.61
  Median Optimized: 9025.61
  Median Improvement: 12.34%

Case ID: 1
  Mean Baseline:  10671.63
  Mean Optimized: 8128.81
  Mean Improvement: 23.83%
  Median Baseline:  10751.04
  Median Optimized: 7358.01
  Median Improvement: 31.56%

Case ID: 2
  Mean Baseline:  16381.52
  Mean Optimized: 7739.25
  Mean Improvement: 52.76%
  Median Baseline:  9643.76
  Median Optimized: 7330.25
  Median Improvement: 23.99%

Case ID: 3
  Mean Baseline:  7699.71
  Mean Optimized: 11752.98
  Mean Improvement: -52.64%
  Median Baseline:  7950.31
  Median Optimized: 10415.51
  Median Improvement: -31.01%

Case ID: 4
  Mean Baseline:  16440.36
  Mean Optimized: 10292.38
  Mean Improvement: 37.40%
  Median Baseline:  9095.94
  Median Optimized: 9168.23
  Median Improvement: -0.79%

Case ID: 5
  Mean Baseline:  17603.60
  Mean Optimized: 23614.99
  Mean Impro

In [78]:
c = 0
tc = 0
for k,v in case_improvements.items():
    if v < 0:
        print(k,v)
        c=c+1
    tc = tc + 1
print(f"Total problematic cases {c}")
print(f"Total cases {tc}")


(0, 2) -45.59035516573602
(0, 4) -39.4519405967626
(0, 7) -21.214556984761202
(0, 10) -823.4129607199534
(0, 13) -3.579961270101706
(0, 14) -1.292399111331249
(0, 15) -46.49591406230968
(0, 18) -5.321857518263155
(0, 25) -19.042201045538672
(0, 28) -1.9740006841485285
(0, 30) -0.8585997661139799
(0, 33) -48.97791608379952
(0, 35) -141.65820169175936
(0, 36) -9.506355507970284
(0, 37) -2.581617775836416
(0, 38) -0.9138539372606906
(0, 39) -8.214423508856633
(0, 40) -51.198203111659666
(0, 43) -9.435459719185408
(0, 44) -6.051202712290529
(0, 45) -5.813482279519691
(0, 50) -38.084005184821216
(0, 52) -5.932977627352541
(0, 54) -53.15390593631609
(0, 55) -2.6846648381810145
(0, 58) -3.8456074871735644
(0, 60) -3.6000309102029755
(0, 61) -9.925303454148308
(0, 62) -3.187495034244526
(0, 66) -33.797682076976734
(0, 67) -8.636727400924489
(0, 74) -5.416529449848915
(0, 75) -1322.8644599465872
(0, 77) -14.151870450383738
(0, 78) -2.5133138539500357
(0, 80) -46.82906553858561
(0, 81) -6.233999

In [79]:
log_name = 'LoanApp.csv'

base_path = os.path.join('..', '..', 'simulated_data', log_name)
baseline_dir_path = os.path.join(base_path, 'baseline')
optimized_dir_path = os.path.join(base_path, 'autonomous') # Updated path

# Check if directories exist before running
if not os.path.exists(baseline_dir_path) or not os.path.exists(optimized_dir_path):
    print(f"FATAL ERROR: Could not find directories.")
    print(f"Checked for: '{baseline_dir_path}'")
    print(f"Checked for: '{optimized_dir_path}'")
    print("Please ensure the CONFIG section is correct and you have run the simulations.")
else:
    print("--- Analysis 1: Comparing Costs on a Per-Log Basis ---")
    compare_costs_across_logs()
        
    print("\n\n--- Analysis 2: Comparing Aggregated Costs on a Per-Case Basis ---")
    compare_costs_across_cases()

--- Analysis 1: Comparing Costs on a Per-Log Basis ---
Comparing costs for 10 simulation logs from 'LoanApp.csv'...

--- Log 0 ---
  Baseline Avg. Case Cost:  $1012.94
  Optimized Avg. Case Cost: $1058.71
  Cost Reduction: -4.52%

--- Log 1 ---
  Baseline Avg. Case Cost:  $1081.75
  Optimized Avg. Case Cost: $1126.79
  Cost Reduction: -4.16%

--- Log 2 ---
  Baseline Avg. Case Cost:  $968.88
  Optimized Avg. Case Cost: $1058.36
  Cost Reduction: -9.24%

--- Log 3 ---
  Baseline Avg. Case Cost:  $1199.75
  Optimized Avg. Case Cost: $1105.98
  Cost Reduction: 7.82%

--- Log 4 ---
  Baseline Avg. Case Cost:  $923.37
  Optimized Avg. Case Cost: $1160.38
  Cost Reduction: -25.67%

--- Log 5 ---
  Baseline Avg. Case Cost:  $1142.44
  Optimized Avg. Case Cost: $1125.17
  Cost Reduction: 1.51%

--- Log 6 ---
  Baseline Avg. Case Cost:  $1151.10
  Optimized Avg. Case Cost: $989.97
  Cost Reduction: 14.00%

--- Log 7 ---
  Baseline Avg. Case Cost:  $1049.25
  Optimized Avg. Case Cost: $1105.21
 

In [80]:
def calculate_total_wait_time(df: pd.DataFrame) -> float:
    """
    Calculates the total waiting time for all cases in a log file.

    Wait time is defined as the time between the end of the previous activity
    and the start of the current activity within the same case.

    Args:
        df (pd.DataFrame): The log DataFrame.

    Returns:
        float: The total waiting time in seconds for the entire log.
    """
    if df.empty:
        return 0.0

    # Ensure timestamp columns are in datetime format
    try:
        df['start_timestamp'] = pd.to_datetime(df['start_timestamp'], format='ISO8601')
        df['end_timestamp'] = pd.to_datetime(df['end_timestamp'], format='ISO8601')
    except Exception as e:
        print(f"  Warning: Could not parse timestamps. Error: {e}")
        return 0.0

    # Sort events to ensure correct chronological order within each case
    df = df.sort_values(by=['case_id', 'start_timestamp'])

    # Get the end time of the *previous* activity within the same case
    df['previous_end_timestamp'] = df.groupby('case_id')['end_timestamp'].shift(1)

    # The first activity in a case has no previous end time, so fill with its own start time
    df['previous_end_timestamp'].fillna(df['start_timestamp'], inplace=True)

    # Calculate wait time for each activity
    df['wait_time_seconds'] = (df['start_timestamp'] - df['previous_end_timestamp']).dt.total_seconds()
    
    # Ensure wait times are not negative (can happen with parallel activities)
    df['wait_time_seconds'] = df['wait_time_seconds'].clip(lower=0)
    
    # Return the sum of all wait times
    return df['wait_time_seconds'].sum()


# ==============================================================================
# --- NEW BENCHMARKING FUNCTION ---
# ==============================================================================

def compare_wait_times_across_logs(baseline_dir, optimized_dir, LOG_NAME):
    """
    Compares total wait times between baseline and optimized simulation logs.
    """
    print(f"Comparing total wait times for {LOG_NAME}...\n")
    all_improvements = []

    try:
        baseline_files = [f for f in os.listdir(baseline_dir) if f.startswith('simulated_log_')]
        num_simulations = len(baseline_files)
    except FileNotFoundError:
        print(f"Error: Baseline directory not found at {baseline_dir}")
        return

    for i in range(num_simulations):
        baseline_file = os.path.join(baseline_dir, f'simulated_log_{i}.csv')
        optimized_file = os.path.join(optimized_dir, f'simulated_log_{i}.csv')

        if not os.path.exists(baseline_file) or not os.path.exists(optimized_file):
            print(f"--- Log {i} ---\n  Could not compare due to missing file(s).\n")
            continue

        try:
            df_base = pd.read_csv(baseline_file)
            df_opt = pd.read_csv(optimized_file)
        except Exception as e:
            print(f"--- Log {i} ---\n  Error reading files: {e}\n")
            continue

        total_wait_base = calculate_total_wait_time(df_base)
        total_wait_opt = calculate_total_wait_time(df_opt)

        if total_wait_base > 0:
            # Improvement is a REDUCTION in wait time
            improvement_percent = ((total_wait_base - total_wait_opt) / total_wait_base) * 100
            all_improvements.append(improvement_percent)

            print(f"--- Log {i} ---")
            print(f"  Baseline Total Wait Time:  {total_wait_base / 3600:.2f} hours")
            print(f"  Optimized Total Wait Time: {total_wait_opt / 3600:.2f} hours")
            print(f"  Wait Time Reduction: {improvement_percent:.2f}%\n")
        else:
            print(f"--- Log {i} ---\n  Baseline wait time is zero, cannot calculate improvement.\n")

    if all_improvements:
        overall_avg_improvement = np.mean(all_improvements)
        print("="*40)
        print(f"Overall Average Wait Time Reduction: {overall_avg_improvement:.2f}%")
        print("="*40)

# ==============================================================================
# --- MAIN EXECUTION BLOCK ---
# ==============================================================================

In [81]:


# ==============================================================================
# --- HELPER FUNCTION ---
# ==============================================================================

def calculate_gini_coefficient(df: pd.DataFrame) -> float:
    """
    Calculates the Gini coefficient for resource workload in a log.
    A lower Gini coefficient indicates better load balancing.

    Args:
        df (pd.DataFrame): The log DataFrame.

    Returns:
        float: The Gini coefficient (between 0 and 1).
    """
    if df.empty or 'resource' not in df.columns:
        return 0.0

    # Calculate the total work duration for each resource
    try:
        df['start_timestamp'] = pd.to_datetime(df['start_timestamp'])
        df['end_timestamp'] = pd.to_datetime(df['end_timestamp'])
    except Exception:
        return 0.0 # Return neutral value if timestamps are invalid
        
    df['work_duration'] = (df['end_timestamp'] - df['start_timestamp']).dt.total_seconds()
    
    # Get total work per resource
    workload = df.groupby('resource')['work_duration'].sum()

    if workload.empty:
        return 0.0

    # Gini coefficient calculation
    # Based on the formula: G = (Σ (2i - n - 1) * x_i) / (n * Σ x_i)
    # where x is the sorted list of values, n is the count, and i is the rank.
    values = workload.sort_values().to_numpy()
    n = len(values)
    if n < 2:
        return 0.0 # Gini is 0 if there's only one resource
        
    i = np.arange(1, n + 1)
    numerator = np.sum((2 * i - n - 1) * values)
    denominator = n * np.sum(values)

    if denominator == 0:
        return 0.0

    return numerator / denominator


# ==============================================================================
# --- BENCHMARKING FUNCTIONS ---
# ==============================================================================

def compare_load_balancing_across_logs(baseline_dir, optimized_dir):
    """
    Compares the Gini coefficient for load balancing on a per-log basis.
    A lower Gini coefficient is better.
    """
    print(f"Comparing Load Balancing (Gini Coefficient) for {LOG_NAME}...\n")
    all_improvements = []

    try:
        baseline_files = [f for f in os.listdir(baseline_dir) if f.startswith('simulated_log_')]
        num_simulations = len(baseline_files)
    except FileNotFoundError:
        print(f"Error: Baseline directory not found at {baseline_dir}")
        return

    for i in range(num_simulations):
        baseline_file = os.path.join(baseline_dir, f'simulated_log_{i}.csv')
        optimized_file = os.path.join(optimized_dir, f'simulated_log_{i}.csv')

        if not os.path.exists(baseline_file) or not os.path.exists(optimized_file):
            print(f"--- Log {i} ---\n  Could not compare due to missing file(s).\n")
            continue

        try:
            df_base = pd.read_csv(baseline_file)
            df_opt = pd.read_csv(optimized_file)
        except Exception as e:
            print(f"--- Log {i} ---\n  Error reading files: {e}\n")
            continue

        gini_base = calculate_gini_coefficient(df_base)
        gini_opt = calculate_gini_coefficient(df_opt)

        if gini_base > 0:
            # Improvement is a REDUCTION in the Gini coefficient
            improvement_percent = ((gini_base - gini_opt) / gini_base) * 100
            all_improvements.append(improvement_percent)

            print(f"--- Log {i} ---")
            print(f"  Baseline Gini Coefficient:  {gini_base:.4f} (More unequal)")
            print(f"  Optimized Gini Coefficient: {gini_opt:.4f} (More equal)")
            print(f"  Load Balancing Improvement: {improvement_percent:.2f}%\n")
        else:
            print(f"--- Log {i} ---\n  Baseline Gini is zero, cannot calculate improvement.\n")

    if all_improvements:
        overall_avg_improvement = np.mean(all_improvements)
        print("="*50)
        print(f"Overall Average Improvement in Load Balancing: {overall_avg_improvement:.2f}%")
        print("="*50)


def compare_load_balancing_across_all_data(baseline_dir, optimized_dir):
    """
    Aggregates all logs and compares the overall Gini coefficient.
    This gives a single, stable measure of load balancing across the entire simulation set.
    """
    print(f"Comparing Overall Load Balancing by Aggregating All Logs...\n")
    
    # Load and concatenate all logs from each directory
    all_baseline_dfs = [pd.read_csv(os.path.join(baseline_dir, f)) for f in os.listdir(baseline_dir) if f.startswith('simulated_log_')]
    all_optimized_dfs = [pd.read_csv(os.path.join(optimized_dir, f)) for f in os.listdir(optimized_dir) if f.startswith('simulated_log_')]

    if not all_baseline_dfs or not all_optimized_dfs:
        print("Error: Could not load one or both sets of logs.")
        return

    # Create two massive DataFrames, one for baseline and one for optimized
    df_base_total = pd.concat(all_baseline_dfs, ignore_index=True)
    df_opt_total = pd.concat(all_optimized_dfs, ignore_index=True)

    # Calculate the Gini coefficient for the entire aggregated dataset
    gini_base_total = calculate_gini_coefficient(df_base_total)
    gini_opt_total = calculate_gini_coefficient(df_opt_total)

    if gini_base_total > 0:
        improvement_percent = ((gini_base_total - gini_opt_total) / gini_base_total) * 100
        
        print("--- Overall Aggregated Results ---")
        print(f"  Baseline Gini Coefficient (All Logs):  {gini_base_total:.4f}")
        print(f"  Optimized Gini Coefficient (All Logs): {gini_opt_total:.4f}")
        print(f"  Total Load Balancing Improvement: {improvement_percent:.2f}%\n")
    else:
        print("Could not calculate overall improvement as baseline Gini is zero.")


try:
    log_name = 'LoanApp.csv' 
    base_path = os.path.join('..', '..', 'simulated_data', log_name)
    baseline_dir_path = os.path.join(base_path, 'baseline')
    optimized_dir_path = os.path.join(base_path, 'autonomous')

    if not os.path.exists(baseline_dir_path) or not os.path.exists(optimized_dir_path):
        raise FileNotFoundError

    print("### Analysis 1: Per-Log Load Balancing Comparison ###\n")
    compare_load_balancing_across_logs(baseline_dir_path, optimized_dir_path)
        
    print("\n\n### Analysis 2: Overall Aggregated Load Balancing ###\n")
    compare_load_balancing_across_all_data(baseline_dir_path, optimized_dir_path)
except FileNotFoundError:
    print(f"FATAL ERROR: Could not find directories.")
    print(f"Checked for: '{baseline_dir_path}'")
    print(f"Checked for: '{optimized_dir_path}'")
    print("Please ensure the CONFIG section is correct and you have run the simulations.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

### Analysis 1: Per-Log Load Balancing Comparison ###

Comparing Load Balancing (Gini Coefficient) for LoanApp.csv...

--- Log 0 ---
  Baseline Gini is zero, cannot calculate improvement.

--- Log 1 ---
  Baseline Gini is zero, cannot calculate improvement.

--- Log 2 ---
  Baseline Gini is zero, cannot calculate improvement.

--- Log 3 ---
  Baseline Gini is zero, cannot calculate improvement.

--- Log 4 ---
  Baseline Gini is zero, cannot calculate improvement.

--- Log 5 ---
  Baseline Gini is zero, cannot calculate improvement.

--- Log 6 ---
  Baseline Gini is zero, cannot calculate improvement.

--- Log 7 ---
  Baseline Gini is zero, cannot calculate improvement.

--- Log 8 ---
  Baseline Gini is zero, cannot calculate improvement.

--- Log 9 ---
  Baseline Gini is zero, cannot calculate improvement.



### Analysis 2: Overall Aggregated Load Balancing ###

Comparing Overall Load Balancing by Aggregating All Logs...

Could not calculate overall improvement as baseline Gini is zer

In [82]:
log_name = 'LoanApp.csv' 
base_path = os.path.join('..', '..', 'simulated_data', log_name)
baseline_dir_path = os.path.join(base_path, 'baseline')
optimized_dir_path = os.path.join(base_path, 'autonomous')

all_baseline_dfs = [pd.read_csv(os.path.join(baseline_dir_path, f)) for f in os.listdir(baseline_dir_path) if f.startswith('simulated_log_')]
all_optimized_dfs = [pd.read_csv(os.path.join(optimized_dir_path, f)) for f in os.listdir(optimized_dir_path) if f.startswith('simulated_log_')] 

for i in range(0,10):
    print(f"Log {i}")
    print(f"Baseline agents used: {all_baseline_dfs[i]['agent'].nunique()}")
    print(f"Optimized agents used: {all_optimized_dfs[i]['agent'].nunique()}")
    print()


Log 0
Baseline agents used: 19
Optimized agents used: 18

Log 1
Baseline agents used: 19
Optimized agents used: 18

Log 2
Baseline agents used: 19
Optimized agents used: 18

Log 3
Baseline agents used: 19
Optimized agents used: 18

Log 4
Baseline agents used: 19
Optimized agents used: 18

Log 5
Baseline agents used: 19
Optimized agents used: 18

Log 6
Baseline agents used: 19
Optimized agents used: 18

Log 7
Baseline agents used: 19
Optimized agents used: 18

Log 8
Baseline agents used: 19
Optimized agents used: 18

Log 9
Baseline agents used: 19
Optimized agents used: 18



In [83]:
all_baseline_dfs[0]

Unnamed: 0,case_id,agent,activity_name,start_timestamp,end_timestamp,TimeStep,resource
0,0,0,Check application form completeness,2023-04-20 08:00:00+00:00,2023-04-20 08:31:15.767146327+00:00,0,Clerk-000001
1,1,1,Check application form completeness,2023-04-20 08:30:00+00:00,2023-04-20 08:39:04.998861004+00:00,1,Clerk-000002
2,0,2,AML check,2023-04-20 08:31:15.767146327+00:00,2023-04-20 09:21:27.617434504+00:00,2,AML Investigator-000001
3,1,5,Return application back to applicant,2023-04-20 08:39:04.998861004+00:00,2023-04-20 08:44:04.998861004+00:00,3,Clerk-000004
4,2,4,Check application form completeness,2023-04-20 09:00:00+00:00,2023-04-20 09:15:36.685940850+00:00,4,Clerk-000003
...,...,...,...,...,...,...,...
1491,199,13,Assess loan risk,2023-05-09 08:18:07.242683294+00:00,2023-05-09 08:38:07.242683294+00:00,1606,Loan Officer-000002
1492,198,0,Reject application,2023-05-08 14:15:20.135228054+00:00,2023-05-08 14:25:20.135228054+00:00,1607,Clerk-000001
1493,199,16,Design loan offer,2023-05-09 08:38:07.242683294+00:00,2023-05-09 08:50:11.867087651+00:00,1608,Loan Officer-000004
1494,199,17,Approve loan offer,2023-05-09 08:50:11.867087651+00:00,2023-05-09 09:10:11.867087651+00:00,1609,Senior Officer-000002


In [84]:
log_name = 'LoanApp.csv' 
base_path = os.path.join('..', '..', 'simulated_data', log_name)
baseline_dir_path = os.path.join(base_path, 'baseline')
optimized_dir_path = os.path.join(base_path, 'autonomous')


    
base_path = os.path.join('..', '..', 'simulated_data', log_name)
optimized_dir = os.path.join(base_path, 'autonomous')

    # Check if directories exist before running
if not os.path.exists(baseline_dir_path) or not os.path.exists(optimized_dir_path):
    print(f"FATAL ERROR: Could not find directories.")
    print(f"Checked for: '{baseline_dir_path}'")
    print(f"Checked for: '{optimized_dir_path}'")
    print("Please ensure the CONFIG section is correct and you have run the simulations.")
else:
    compare_wait_times_across_logs(baseline_dir_path, optimized_dir_path, 'LoanApp.csv')

Comparing total wait times for LoanApp.csv...

--- Log 0 ---
  Baseline Total Wait Time:  717.40 hours
  Optimized Total Wait Time: 298.40 hours
  Wait Time Reduction: 58.41%

--- Log 1 ---
  Baseline Total Wait Time:  818.01 hours
  Optimized Total Wait Time: 362.38 hours
  Wait Time Reduction: 55.70%

--- Log 2 ---
  Baseline Total Wait Time:  1012.35 hours
  Optimized Total Wait Time: 794.54 hours
  Wait Time Reduction: 21.52%

--- Log 3 ---
  Baseline Total Wait Time:  470.36 hours
  Optimized Total Wait Time: 443.66 hours
  Wait Time Reduction: 5.68%

--- Log 4 ---
  Baseline Total Wait Time:  601.06 hours
  Optimized Total Wait Time: 507.20 hours
  Wait Time Reduction: 15.62%

--- Log 5 ---
  Baseline Total Wait Time:  295.17 hours
  Optimized Total Wait Time: 1112.60 hours
  Wait Time Reduction: -276.93%

--- Log 6 ---
  Baseline Total Wait Time:  494.56 hours
  Optimized Total Wait Time: 491.37 hours
  Wait Time Reduction: 0.65%

--- Log 7 ---
  Baseline Total Wait Time:  899.5

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['previous_end_timestamp'].fillna(df['start_timestamp'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['previous_end_timestamp'].fillna(df['start_timestamp'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because th

In [61]:
log_name = 'LoanApp.csv' 
    
base_path = os.path.join('..', '..', 'simulated_data', log_name)
optimized_dir = os.path.join(base_path, 'autonomous')


optimized_logs= []
for i in range(10):
    optimized_file = os.path.join(optimized_dir, f'simulated_log_{i}.csv')
    optimized_df = pd.read_csv(optimized_file)
    optimized_logs.append(optimized_df)

baseline_dir = os.path.join(base_path, 'baseline')


baseline_logs= []
for i in range(10):
    baseline_file = os.path.join(baseline_dir, f'simulated_log_{i}.csv')
    baseline_df = pd.read_csv(baseline_file)
    baseline_logs.append(baseline_df)

In [62]:
optimized_logs_0_cost = add_activity_cost_column(optimized_logs[0], AGENT_COSTS)
baseline_logs_0_cost = add_activity_cost_column(baseline_logs[0], AGENT_COSTS)

In [506]:
optimized_logs_0_cost[optimized_logs_0_cost['case_id']==50]

Unnamed: 0,case_id,agent,activity_name,start_timestamp,end_timestamp,TimeStep,resource,duration_seconds,cost_per_hour,activity_cost
346,50,9,Check application form completeness,2023-04-25 09:00:00+00:00,2023-04-25 09:03:54.726464335+00:00,384,Clerk-000006,234.726464,90,5.868162
348,50,3,Appraise property,2023-04-25 09:03:54.726464335+00:00,2023-04-25 09:25:32.309296008+00:00,386,Appraiser-000001,1297.582832,90,32.439571
352,50,0,Check credit history,2023-04-25 09:25:32.309296008+00:00,2023-04-25 09:26:02.540782713+00:00,390,Clerk-000001,30.231487,30,0.251929
357,50,7,AML check,2023-04-25 09:26:02.540782713+00:00,2023-04-25 09:39:15.453701600+00:00,395,AML Investigator-000002,792.912919,110,24.227895
363,50,13,Assess loan risk,2023-04-25 09:39:15.453701600+00:00,2023-04-25 09:59:15.453701600+00:00,401,Loan Officer-000002,1200.0,95,31.666667
368,50,13,Design loan offer,2023-04-25 09:59:15.453701600+00:00,2023-04-25 10:03:54.191550507+00:00,406,Loan Officer-000002,278.737849,95,7.355582
378,50,14,Approve loan offer,2023-04-25 10:03:54.191550507+00:00,2023-04-25 10:23:54.191550507+00:00,416,Senior Officer-000001,1200.0,150,50.0
385,50,10,Cancel application,2023-04-25 10:23:54.191550507+00:00,2023-04-25 10:28:54.191550507+00:00,423,Clerk-000007,300.0,30,2.5


In [507]:
baseline_logs_0_cost[baseline_logs_0_cost['case_id']==194]

Unnamed: 0,case_id,agent,activity_name,start_timestamp,end_timestamp,TimeStep,resource,duration_seconds,cost_per_hour,activity_cost
1429,194,4,Check application form completeness,2023-05-08 09:00:00+00:00,2023-05-08 09:02:46.537850782+00:00,1512,Clerk-000003,166.537851,60,2.775631
1432,194,7,AML check,2023-05-08 09:02:46.537850782+00:00,2023-05-08 09:04:51.624568721+00:00,1515,AML Investigator-000002,125.086718,110,3.822094
1438,194,0,Check credit history,2023-05-08 09:04:51.624568721+00:00,2023-05-08 09:25:23.351810765+00:00,1521,Clerk-000001,1231.727242,30,10.264394
1443,194,6,Appraise property,2023-05-08 10:11:34.354388136+00:00,2023-05-08 11:09:57.530088552+00:00,1526,Appraiser-000002,3503.1757,90,87.579393
1453,194,11,Assess loan risk,2023-05-08 11:09:57.530088552+00:00,2023-05-08 11:29:57.530088552+00:00,1536,Loan Officer-000001,1200.0,95,31.666667
1457,194,13,Design loan offer,2023-05-08 11:29:57.530088552+00:00,2023-05-08 11:30:28.916392668+00:00,1540,Loan Officer-000002,31.386304,95,0.82825
1460,194,17,Approve loan offer,2023-05-08 11:30:28.916392668+00:00,2023-05-08 11:50:28.916392668+00:00,1544,Senior Officer-000002,1200.0,150,50.0
1465,194,1,Approve application,2023-05-08 11:50:28.916392668+00:00,2023-05-08 11:55:28.916392668+00:00,1551,Clerk-000002,300.0,30,2.5


In [45]:
baseline_logs_0_cost[baseline_logs_0_cost['resource']=='Clerk-000001']

Unnamed: 0,case_id,agent,activity_name,start_timestamp,end_timestamp,TimeStep,resource,duration_seconds,cost_per_hour,activity_cost
0,0,0,Check application form completeness,2023-04-20 08:00:00+00:00,2023-04-20 08:43:37.614829726+00:00,0,Clerk-000001,2617.614830,30,21.813457
18,4,0,Check credit history,2023-04-20 10:02:09.387286291+00:00,2023-04-20 10:08:41.646032284+00:00,18,Clerk-000001,392.258746,30,3.268823
24,5,0,Check credit history,2023-04-20 10:55:01.152356712+00:00,2023-04-20 11:16:40.415409532+00:00,24,Clerk-000001,1299.263053,30,10.827192
42,7,0,Check credit history,2023-04-20 12:24:53.147598732+00:00,2023-04-20 12:41:42.314751443+00:00,42,Clerk-000001,1009.167153,30,8.409726
44,4,0,Reject application,2023-04-20 11:09:12.020939566+00:00,2023-04-20 11:19:12.020939566+00:00,44,Clerk-000001,600.000000,30,5.000000
...,...,...,...,...,...,...,...,...,...,...
1423,193,0,Check application form completeness,2023-05-08 08:30:00+00:00,2023-05-08 08:33:58.925055555+00:00,1506,Clerk-000001,238.925056,30,1.991042
1438,194,0,Check credit history,2023-05-08 09:04:51.624568721+00:00,2023-05-08 09:25:23.351810765+00:00,1521,Clerk-000001,1231.727242,30,10.264394
1441,195,0,Check credit history,2023-05-08 10:04:03.163355210+00:00,2023-05-08 10:51:55.303721034+00:00,1524,Clerk-000001,2872.140366,30,23.934503
1455,192,0,Reject application,2023-05-08 12:21:48.645351867+00:00,2023-05-08 12:31:48.645351867+00:00,1538,Clerk-000001,600.000000,30,5.000000


In [46]:
baseline_logs_0_cost[baseline_logs_0_cost['resource']=='Clerk-000002']

Unnamed: 0,case_id,agent,activity_name,start_timestamp,end_timestamp,TimeStep,resource,duration_seconds,cost_per_hour,activity_cost
1,1,1,Check application form completeness,2023-04-20 08:30:00+00:00,2023-04-20 08:55:49.298830341+00:00,1,Clerk-000002,1549.298830,30,12.910824
12,2,1,Check credit history,2023-04-20 09:33:32.006788136+00:00,2023-04-20 09:44:39.466867740+00:00,12,Clerk-000002,667.460080,30,5.562167
32,7,1,Check application form completeness,2023-04-20 11:30:00+00:00,2023-04-20 11:42:44.909816420+00:00,32,Clerk-000002,764.909816,30,6.374248
66,12,1,Check application form completeness,2023-04-20 14:00:00+00:00,2023-04-20 14:20:28.951480734+00:00,66,Clerk-000002,1228.951481,30,10.241262
85,16,1,Check application form completeness,2023-04-21 08:00:00+00:00,2023-04-21 08:08:50.980187505+00:00,85,Clerk-000002,530.980188,30,4.424835
...,...,...,...,...,...,...,...,...,...,...
1407,191,1,Check credit history,2023-05-08 07:45:22.644238317+00:00,2023-05-08 07:58:54.831447354+00:00,1489,Clerk-000002,812.187209,30,6.768227
1421,188,1,Reject application,2023-05-08 08:14:58.688161850+00:00,2023-05-08 08:24:58.688161850+00:00,1504,Clerk-000002,600.000000,30,5.000000
1440,196,1,Check application form completeness,2023-05-08 10:00:00+00:00,2023-05-08 10:05:48.257170446+00:00,1523,Clerk-000002,348.257170,30,2.902143
1449,193,1,Approve application,2023-05-08 09:58:14.365056257+00:00,2023-05-08 10:03:14.365056257+00:00,1532,Clerk-000002,300.000000,30,2.500000


In [47]:
baseline_logs_0_cost[baseline_logs_0_cost['resource']=='Clerk-000003']

Unnamed: 0,case_id,agent,activity_name,start_timestamp,end_timestamp,TimeStep,resource,duration_seconds,cost_per_hour,activity_cost
4,0,4,Check credit history,2023-04-20 08:59:03.365612694+00:00,2023-04-20 09:35:30.082000453+00:00,4,Clerk-000003,2186.716388,60,36.445273
13,3,4,Check credit history,2023-04-20 09:33:38.514312029+00:00,2023-04-20 10:26:15.079206372+00:00,13,Clerk-000003,3156.564894,60,52.609415
31,6,4,Check credit history,2023-04-20 11:17:30.059813396+00:00,2023-04-20 11:30:42.615563646+00:00,31,Clerk-000003,792.555750,60,13.209263
56,10,4,Check credit history,2023-04-20 13:02:51.964511128+00:00,2023-04-20 13:11:48.704326890+00:00,56,Clerk-000003,536.739816,60,8.945664
62,8,4,Reject application,2023-04-20 13:12:48.746063072+00:00,2023-04-20 13:22:48.746063072+00:00,62,Clerk-000003,600.000000,60,10.000000
...,...,...,...,...,...,...,...,...,...,...
1365,184,4,Check credit history,2023-05-05 12:46:56.162499494+00:00,2023-05-05 12:50:31.211560972+00:00,1443,Clerk-000003,215.049061,60,3.584151
1398,191,4,Check application form completeness,2023-05-08 07:30:00+00:00,2023-05-08 07:39:25.752899560+00:00,1478,Clerk-000003,565.752900,60,9.429215
1403,184,4,Approve application,2023-05-05 13:36:38.559937620+00:00,2023-05-05 13:41:38.559937620+00:00,1484,Clerk-000003,300.000000,60,5.000000
1429,194,4,Check application form completeness,2023-05-08 09:00:00+00:00,2023-05-08 09:02:46.537850782+00:00,1512,Clerk-000003,166.537851,60,2.775631
