In [62]:
import pandas as pd

# Replace with your actual file path
df = pd.read_csv("LoanApp.csv.gz", compression='gzip')

# Print column names
print(df.columns)

Index(['case_id', 'resource', 'activity', 'start_time', 'end_time'], dtype='object')


In [93]:
list_of_all_end_activities = []
for case_id in range(0,200):
    # print(case_id, simulated[simulated['case_id'] == case_id].tail(1)['activity_name'].values[0])
    list_of_all_end_activities.append(df[df['case_id'] == case_id].tail(1)['activity'].values[0])
list_of_all_end_activities = list(set(list_of_all_end_activities))

In [67]:
list_of_all_end_activities

['Cancel application', 'Reject application', 'Approve application']

In [92]:
for i in range(0,1000):
    case_rows = df[df['case_id'] == i]
    if len(case_rows)==16:
        print(case_rows)
        print("----")

Get Strict Pre-requisites 

In [204]:
from collections import defaultdict

In [205]:
def discover_strict_prerequisites_from_log(df, activity_col='activity', case_col='case_id', order_by='end_timestamp'):
    # Step 1: Collect all activities that appear before each activity in each case
    activity_to_preceding_sets = defaultdict(list)

    for case_id, group in df.groupby(case_col):
        sorted_activities = group.sort_values(by=order_by)[activity_col].tolist()
        seen = set()
        for i, act in enumerate(sorted_activities):
            activity_to_preceding_sets[act].append(seen.copy())
            seen.add(act)

    # Step 2: Intersect the "seen-before" sets across all cases
    raw_prerequisites = {}
    for act, preceding_sets in activity_to_preceding_sets.items():
        if preceding_sets:
            raw_prerequisites[act] = set.intersection(*preceding_sets)
        else:
            raw_prerequisites[act] = set()

    # Step 3: Remove transitive dependencies
    # If A → B and B → C, remove A from prerequisites of C
    def remove_transitive(prereq_dict):
        cleaned = {}
        for act in prereq_dict:
            direct_prereqs = prereq_dict[act].copy()
            # Remove any indirect dependencies
            for p in direct_prereqs.copy():
                indirects = prereq_dict.get(p, set())
                direct_prereqs -= indirects
            cleaned[act] = list(direct_prereqs)
        return cleaned

    strict_prerequisites = remove_transitive(raw_prerequisites)
    return strict_prerequisites


In [None]:
# load logs


strict_prereqs = discover_strict_prerequisites_from_log(df)
for act, pre in strict_prereqs.items():
    print(f"{act}: {pre}")

Test: get pre-requisites and paths from raw logs

In [95]:
df['activity'].unique()

array(['Check application form completeness', 'Check credit history',
       'Appraise property', 'AML check', 'Assess loan risk',
       'Design loan offer', 'Approve loan offer', 'Cancel application',
       'Approve application', 'Reject application',
       'Return application back to applicant', 'Applicant completes form'],
      dtype=object)

In [110]:
list_of_all_end_activities = []
first_activities = []
for case_id in range(0,200):
    first_activity = df[df['case_id'] == case_id].iloc[0]['activity']
    first_activities.append(first_activity)
first_activities = list(set(first_activities))

In [112]:
first_activities 
# no pre-requisites for 'Check application form completeness'

['Check application form completeness']

In [126]:
list_of_all_end_activities = []
second_activities = []
for case_id in range(0,200):
    second_activity = df[df['case_id'] == case_id].iloc[1]['activity']
    second_activities.append(second_activity)
second_activities = list(set(second_activities))

In [127]:
second_activities #  pre-requisite only 'Check application form completeness'

['Check credit history',
 'Appraise property',
 'Return application back to applicant',
 'AML check']

In [None]:
list_of_all_end_activities = []
third_activities = []
potential_prerequisites = []
for case_id in range(0,200):
    third_activity = df[df['case_id'] == case_id].iloc[2]['activity']
    if third_activity == 'Applicant completes form':
        potential_prerequisites.append(df[df['case_id'] == case_id].iloc[1:2]['activity'].values[0])
    third_activities.append(third_activity)
third_activities = list(set(third_activities))
print(third_activities)
potential_prerequisites = list(set(potential_prerequisites))
print(potential_prerequisites)  # only pre-requisite for 'Applicant completes form' is 'Return application back to applicant'


['AML check', 'Appraise property', 'Applicant completes form', 'Check credit history']
['Return application back to applicant']


In [None]:
list_of_all_end_activities = []
fifth_activities = []
potential_prerequisites = []
for case_id in range(0,200):
    fifth_activity = df[df['case_id'] == case_id].iloc[4]['activity']
    fifth_activities.append(fifth_activity)
    if fifth_activity == 'Assess loan risk':
        # print(df[df['case_id'] == case_id].iloc[1:3]['activity'].values)
        potential_prerequisites.append(df[df['case_id'] == case_id].iloc[1:4]['activity'].values.tolist())
fifth_activities = list(set(fifth_activities))
print(fifth_activities)
# potential_prerequisites = list(set(potential_prerequisites))
potential_prerequisites = {tuple(sorted(pair)) for pair in potential_prerequisites}

print(potential_prerequisites)
# prerequisites of 'Assess loan risk': ('AML check', 'Appraise property', 'Check credit history')

['AML check', 'Assess loan risk', 'Check credit history', 'Appraise property', 'Return application back to applicant']
{('AML check', 'Appraise property', 'Check credit history')}


In [200]:
list_of_all_end_activities = []
sixth_activities = []
potential_prerequisites_design = []
potential_prerequisites_reject = []
for case_id in range(0,200):
    sixth_activity = df[df['case_id'] == case_id].iloc[5]['activity']
    sixth_activities.append(sixth_activity)
    if sixth_activity == 'Design loan offer':
        potential_prerequisites_design.append(df[df['case_id'] == case_id].iloc[1:5]['activity'].values.tolist())
    if sixth_activity == 'Reject application':
        potential_prerequisites_reject.append(df[df['case_id'] == case_id].iloc[1:5]['activity'].values.tolist())
sixth_activities = list(set(sixth_activities))
print(sixth_activities)

potential_prerequisites_design = {tuple(sorted(triple)) for triple in potential_prerequisites_design}
potential_prerequisites_design = [list(triple) for triple in potential_prerequisites_design]
potential_prerequisites_reject = {tuple(sorted(triple)) for triple in potential_prerequisites_reject}
potential_prerequisites_reject = [list(triple) for triple in potential_prerequisites_reject]
print(potential_prerequisites_design)
# prerequisites of 'Design loan offer' and 'Reject application': Assess loan risk


['AML check', 'Design loan offer', 'Reject application', 'Check credit history', 'Appraise property', 'Applicant completes form']
[['AML check', 'Appraise property', 'Assess loan risk', 'Check credit history']]


In [196]:
df[df['case_id'] == 0]


Unnamed: 0,case_id,resource,activity,start_time,end_time
0,0,Clerk-000001,Check application form completeness,2023-02-09T08:00:00.000,2023-02-09T08:28:27.808
2,0,Clerk-000002,Check credit history,2023-02-09T08:28:27.808,2023-02-09T09:00:01.030
7,0,AML Investigator-000001,AML check,2023-02-09T08:28:27.808,2023-02-09T09:25:12.258
8,0,Appraiser-000001,Appraise property,2023-02-09T08:28:27.808,2023-02-09T09:35:08.311
12,0,Loan Officer-000001,Assess loan risk,2023-02-09T09:35:08.311,2023-02-09T09:55:08.311
13,0,Loan Officer-000002,Design loan offer,2023-02-09T09:55:08.311,2023-02-09T10:01:45.244
15,0,Senior Officer-000001,Approve loan offer,2023-02-09T10:01:45.244,2023-02-09T10:21:45.244
16,0,Clerk-000003,Cancel application,2023-02-09T10:21:45.244,2023-02-09T10:26:45.244


In [None]:
list_of_all_end_activities = []
seventh_activities = []
potential_prerequisites = []
for case_id in range(0,200):
    if len(df[df['case_id'] == case_id])>6:
        seventh_activity = df[df['case_id'] == case_id].iloc[6]['activity']
        seventh_activities.append(seventh_activity)
        if seventh_activity == 'Approve loan offer':
            potential_prerequisites.append(df[df['case_id'] == case_id].iloc[1:6]['activity'].values.tolist())
            # if df[df['case_id'] == case_id].iloc[1:5]['activity'].values.tolist()[-1] != 'Assess loan risk':
            #     print(df[df['case_id'] == case_id].iloc[1:5]['activity'].values.tolist()[-1])
       
seventh_activities = list(set(seventh_activities))
# print(seventh_activities)
potential_prerequisites = {tuple(sorted(triple)) for triple in potential_prerequisites}
potential_prerequisites = [list(triple) for triple in potential_prerequisites]
print(potential_prerequisites)

# prerequisites of 'Approve loan offer': Design loan offer

[['AML check', 'Appraise property', 'Assess loan risk', 'Check credit history', 'Design loan offer']]


In [None]:
list_of_all_end_activities = []
eight_activities = []
potential_prerequisites = []
for case_id in range(0,200):
    if len(df[df['case_id'] == case_id])>7:
        eight_activity = df[df['case_id'] == case_id].iloc[7]['activity']
        eight_activities.append(eight_activity)
        if eight_activity == 'Cancel application':
            potential_prerequisites.append(df[df['case_id'] == case_id].iloc[1:7]['activity'].values.tolist())
            # if df[df['case_id'] == case_id].iloc[1:5]['activity'].values.tolist()[-1] != 'Assess loan risk':
            #     print(df[df['case_id'] == case_id].iloc[1:5]['activity'].values.tolist()[-1])
       
eight_activities = list(set(eight_activities))
print(eight_activities)
potential_prerequisites = {tuple(sorted(triple)) for triple in potential_prerequisites}
potential_prerequisites = [list(triple) for triple in potential_prerequisites]
print(potential_prerequisites)
# prerequisites of 'Cancel application': 'Approve loan offer'

['Assess loan risk', 'Check credit history', 'Cancel application', 'Appraise property', 'Approve application']
[['AML check', 'Appraise property', 'Approve loan offer', 'Assess loan risk', 'Check credit history', 'Design loan offer']]


In [None]:

potential_prerequisites = []
for case_id in range(0,200):
    if not df.loc[(df['case_id'] == case_id) & (df['activity'] == 'Return application back to applicant')].empty:
        print(df.loc[(df['case_id'] == case_id)])

# prerequisites of 'Return application back to applicant': ['Check application form completeness']

     case_id                 resource                              activity  \
103       13             Clerk-000007   Check application form completeness   
104       13             Clerk-000005  Return application back to applicant   
116       13         Applicant-000001              Applicant completes form   
118       13             Clerk-000007   Check application form completeness   
120       13             Clerk-000004                  Check credit history   
126       13  AML Investigator-000002                             AML check   
128       13         Appraiser-000002                     Appraise property   
135       13      Loan Officer-000004                      Assess loan risk   
136       13      Loan Officer-000001                     Design loan offer   
142       13    Senior Officer-000002                    Approve loan offer   
144       13             Clerk-000003                   Approve application   

                  start_time                 end_ti

Generated log checking

In [21]:
simulated = pd.read_csv("../simulated_data/LoanApp.csv/autonomous/simulated_log_0.csv")


In [59]:
list_of_all_end_activities = []
for case_id in range(0,200):
    # print(case_id, simulated[simulated['case_id'] == case_id].tail(1)['activity_name'].values[0])
    list_of_all_end_activities.append(simulated[simulated['case_id'] == case_id].tail(1)['activity_name'].values[0])
list_of_all_end_activities = list(set(list_of_all_end_activities))

In [61]:
list_of_all_end_activities

['AML check',
 'Check application form completeness',
 'Approve loan offer',
 'Reject application',
 'Design loan offer',
 'Assess loan risk',
 'Check credit history',
 'Cancel application',
 'Appraise property',
 'Return application back to applicant',
 'Approve application',
 'Applicant completes form']

In [46]:
l

[874    Cancel application
 Name: activity_name, dtype: object,
 874    Cancel application
 Name: activity_name, dtype: object,
 array(['Cancel application'], dtype=object),
 'Cancel application']

In [32]:
simulated[simulated['case_id'] == 0]

Unnamed: 0,case_id,agent,activity_name,start_timestamp,end_timestamp,TimeStep,resource
0,0,0,Check application form completeness,2023-04-20 08:00:00+00:00,2023-04-20 08:16:26.305513912+00:00,0,Clerk-000001
1,0,9,Return application back to applicant,2023-04-20 08:16:26.305513912+00:00,2023-04-20 08:21:26.305513912+00:00,1,Clerk-000006
2,0,1,Reject application,2023-04-20 08:21:26.305513912+00:00,2023-04-20 08:31:26.305513912+00:00,2,Clerk-000002
4,0,5,Return application back to applicant,2023-04-20 08:31:26.305513912+00:00,2023-04-20 08:36:26.305513912+00:00,4,Clerk-000004
5,0,3,Appraise property,2023-04-20 08:36:26.305513912+00:00,2023-04-20 08:50:30.639385526+00:00,5,Appraiser-000001
7,0,10,Return application back to applicant,2023-04-20 08:50:30.639385526+00:00,2023-04-20 08:55:30.639385526+00:00,7,Clerk-000007
10,0,14,Approve loan offer,2023-04-20 08:55:30.639385526+00:00,2023-04-20 09:15:30.639385526+00:00,10,Senior Officer-000001
22,0,4,Approve application,2023-04-20 10:44:47.228181439+00:00,2023-04-20 10:49:47.228181439+00:00,23,Clerk-000003
28,0,11,Assess loan risk,2023-04-20 10:49:47.228181439+00:00,2023-04-20 11:09:47.228181439+00:00,29,Loan Officer-000001
36,0,9,Approve application,2023-04-20 11:09:47.228181439+00:00,2023-04-20 11:14:47.228181439+00:00,37,Clerk-000006
