In [28]:
import pandas as pd
# import pm4py
from tqdm import tqdm
import os
import numpy as np
import math

# Importing data

In [31]:
# declined_train = pd.read_csv('../data/declined_train.csv')
declined_test = pd.read_csv('../data/declined_test.csv', index_col=0).reset_index(drop=True)
print(len(declined_test['case:concept:name'].unique()))

declined_train = pd.read_csv('../data/declined_train.csv', index_col=0).reset_index(drop=True)
print(len(declined_train['case:concept:name'].unique()))

712
3001


In [32]:
from random import sample
approved_train = pd.read_csv('../data/approved_train.csv', index_col=0).reset_index(drop=True)
approved_train = approved_train[approved_train['case:concept:name'].isin(sample(list(approved_train['case:concept:name'].unique()), 3000))]

approved_test = pd.read_csv('../data/approved_test.csv', index_col=0).reset_index(drop=True)
approved_test = approved_test[approved_test['case:concept:name'].isin(sample(list(approved_test['case:concept:name'].unique()), 700))]



cancelled_train = pd.read_csv('../data/cancelled_train.csv', index_col=0).reset_index(drop=True)
cancelled_train = cancelled_train[cancelled_train['case:concept:name'].isin(sample(list(cancelled_train['case:concept:name'].unique()), 3000))]

cancelled_test = pd.read_csv('../data/cancelled_test.csv', index_col=0).reset_index(drop=True)
cancelled_test = cancelled_test[cancelled_test['case:concept:name'].isin(sample(list(cancelled_test['case:concept:name'].unique()), 700))]



In [6]:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html#reshaping-dummies to create the the categorical encoding
# https://stackoverflow.com/questions/49161120/pandas-python-set-value-of-one-column-based-on-value-in-another-column to assign values of the result DF
# https://stackoverflow.com/questions/71426679/cumulative-sum-of-time-from-timestamps-in-pandas for cumulative time for boris

# Define preprocessing

In [35]:
train = pd.concat([approved_train, declined_train, cancelled_train])
train['event_w_lifecycle'] = train['concept:name'] + ' ' + train['lifecycle:transition']


org_resource_cols = list(train['org:resource'].unique())
event_cols = list(train['event_w_lifecycle'].unique())
rest_cols = ['FirstWithdrawalAmount',
       'NumberOfTerms', 'Accepted', 'MonthlyCost', 'Selected', 'CreditScore',
       'OfferedAmount', 'case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount']
target_cols = ['first_timestamp', 'trace_duration', 'case_outcome', 'case_progression']

res_cols = org_resource_cols + event_cols + rest_cols + target_cols


In [40]:
def aggregate_df(df, res_cols):
    """ 
    Aggregate the df of current events in the case

    Output: 
        result -> could be a pandas series
    """
    res_dict = dict.fromkeys(res_cols)
    
    for row in df.to_dict('records'):

        # user variable assign
        user = row['org:resource']
        # event variable assign
        event = row['concept:name'] + ' ' + row['lifecycle:transition']

        try:
            # first mention, turn it numeric
            if not res_dict[user]: res_dict[user] = 1
            else: res_dict[user] += 1

            if not res_dict[event]: res_dict[event] = 1
            else: res_dict[event] += 1
        except:
            # when test set has a unique resource (user) or event not in train set
            pass

        # case level cols
        rest_cols_1 = ['FirstWithdrawalAmount',
       'NumberOfTerms', 'Accepted', 'MonthlyCost', 'Selected', 'CreditScore',
       'OfferedAmount'] 

        for col in rest_cols_1: # always get the newest record
            if not math.isnan(row[col]) and row[col] != res_dict[col]:
                res_dict[col] = row[col]


        
        # time stuff
        #keeping the first timestamp of the case for calculation purposes
        if not res_dict['first_timestamp']: res_dict['first_timestamp'] = row['time:timestamp']


    rest_cols_2 = ['case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount']
    for col in rest_cols_2:
          res_dict[col] = row[col]  
    
    res_dict['case_progression'] = 0
    
    # trace duration in seconds
    res_dict['trace_duration'] = (np.datetime64(row['time:timestamp']) - np.datetime64(res_dict['first_timestamp'])).item().total_seconds()

    return res_dict



In [38]:
def add_to_aggregate(result, df_row):
    """ 
    When a new event happens, add the event info to the current aggregated result.

    Input: 
         result: the current aggregated result
         df_row: pandas df row representing the new event
    Output:
        result: the new aggregated result
    """
    last_row = result[-1].copy()
    new_event = df_row.to_dict()
    new_row = last_row

    user = new_event['org:resource']
    # event variable assign
    event = new_event['concept:name'] + ' ' + new_event['lifecycle:transition']

    try:
        # first mention, turn it numeric
        if not new_row[user]: new_row[user] = 1
        else: new_row[user] += 1

        if not new_row[event]: new_row[event] = 1
        else: new_row[event] += 1
    except:
        # when test set has a unique resource (user) or event not in train set
        pass

    # case level cols
    rest_cols_1 = ['FirstWithdrawalAmount',
    'NumberOfTerms', 'Accepted', 'MonthlyCost', 'Selected', 'CreditScore',
    'OfferedAmount'] 

    for col in rest_cols_1: # always get the newest record
        if not math.isnan(new_event[col]) and new_event[col] != new_row[col]:
            new_row[col] = new_event[col]
    
    rest_cols_2 = ['case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount']
    for col in rest_cols_2:
          new_row[col] = new_event[col]  


    # time stuff
    # trace duration in seconds
    new_row['trace_duration'] = (np.datetime64(new_event['time:timestamp']) - np.datetime64(new_row['first_timestamp'])).item().total_seconds()

    return new_row

In [39]:
def create_prefix_part2(cases_df, res_cols, end_event, start_event='A_Accepted'):
    
    app_id_list = list(cases_df['case:concept:name'].unique())

    # TODO:
    # create a return df
    return_list = []

    # extracting prefix for each application
    for app_id in app_id_list:
        
        events_app = cases_df.loc[cases_df['case:concept:name'] == app_id]
        events_app.reset_index(drop=True, inplace=True)

        cur_id = starting_row_id = events_app.loc[events_app['concept:name'] == 'A_Accepted'].index[0]
        pre_events = events_app.iloc[:starting_row_id]
        # TODO: 
        # aggregate events_app from row 0 to starting_row_id
        return_list.append(aggregate_df(pre_events, res_cols=res_cols))
        
        ending_row_id = events_app.loc[events_app['concept:name'] == end_event].index[0] -1
        cur_id += 1
        
        total_events = ending_row_id - starting_row_id - 1


        while cur_id < ending_row_id:
            new_row = events_app.iloc[cur_id]
            # TODO: 
            # add new event row info to the aggregated result
            d = add_to_aggregate(return_list, new_row)
            d['case_progression'] = float(cur_id - starting_row_id)/total_events
            return_list.append(d)

            # Update the return_df -> add new row
            # target y: end_event

            cur_id += 1

    result = pd.DataFrame.from_dict(return_list)
    result['case_outcome'] = end_event
    return result

# Run preprocessing

In [45]:
prefix_cancelled_train = create_prefix_part2(cancelled_train, res_cols, end_event='A_Cancelled')
prefix_cancelled_train.to_csv('../data/prefix_data/prefix_cancelled_train.csv')

  res_dict['trace_duration'] = (np.datetime64(row['time:timestamp']) - np.datetime64(res_dict['first_timestamp'])).item().total_seconds()
  new_row['trace_duration'] = (np.datetime64(new_event['time:timestamp']) - np.datetime64(new_row['first_timestamp'])).item().total_seconds()


In [46]:
prefix_cancelled_test = create_prefix_part2(cancelled_test, res_cols, end_event='A_Cancelled')
prefix_cancelled_train.to_csv('../data/prefix_data/prefix_cancelled_test.csv')

  res_dict['trace_duration'] = (np.datetime64(row['time:timestamp']) - np.datetime64(res_dict['first_timestamp'])).item().total_seconds()
  new_row['trace_duration'] = (np.datetime64(new_event['time:timestamp']) - np.datetime64(new_row['first_timestamp'])).item().total_seconds()


In [47]:
prefix_approved_train = create_prefix_part2(approved_train, res_cols, "A_Pending")
prefix_approved_train.to_csv('../data/prefix_data/prefix_approved_train.csv')

prefix_approved_test = create_prefix_part2(approved_test, res_cols, "A_Pending")
prefix_approved_test.to_csv('../data/prefix_data/prefix_approved_test.csv')

  res_dict['trace_duration'] = (np.datetime64(row['time:timestamp']) - np.datetime64(res_dict['first_timestamp'])).item().total_seconds()
  new_row['trace_duration'] = (np.datetime64(new_event['time:timestamp']) - np.datetime64(new_row['first_timestamp'])).item().total_seconds()
  res_dict['trace_duration'] = (np.datetime64(row['time:timestamp']) - np.datetime64(res_dict['first_timestamp'])).item().total_seconds()
  new_row['trace_duration'] = (np.datetime64(new_event['time:timestamp']) - np.datetime64(new_row['first_timestamp'])).item().total_seconds()


In [49]:
prefix_declined_train = create_prefix_part2(declined_train, res_cols, 'A_Denied')
prefix_declined_train.to_csv('../data/prefix_data/prefix_declined_train.csv')

prefix_declined_test = create_prefix_part2(declined_test, res_cols, 'A_Denied')
prefix_declined_test.to_csv('../data/prefix_data/prefix_declined_test.csv')

  res_dict['trace_duration'] = (np.datetime64(row['time:timestamp']) - np.datetime64(res_dict['first_timestamp'])).item().total_seconds()
  new_row['trace_duration'] = (np.datetime64(new_event['time:timestamp']) - np.datetime64(new_row['first_timestamp'])).item().total_seconds()
  res_dict['trace_duration'] = (np.datetime64(row['time:timestamp']) - np.datetime64(res_dict['first_timestamp'])).item().total_seconds()
  new_row['trace_duration'] = (np.datetime64(new_event['time:timestamp']) - np.datetime64(new_row['first_timestamp'])).item().total_seconds()


In [51]:
prefix_train = pd.concat([prefix_approved_train, prefix_cancelled_train, prefix_declined_train])
prefix_train.to_csv('../data/prefix_data/full_prefix_train.csv')

In [53]:
prefix_test = pd.concat([prefix_approved_test, prefix_cancelled_test, prefix_declined_test])
prefix_test.to_csv('../data/prefix_data/full_prefix_test.csv')