In [74]:
# import libraties
import pandas as pd
import numpy as np
import pickle

import pm4py
from pm4py.objects.log.util.log import project_traces
from pm4py.objects.log.util import interval_lifecycle
from pm4py.algo.transformation.log_to_features import algorithm as log_to_features
from pm4py.objects.log.obj import EventLog, Trace

import warnings
warnings.filterwarnings("ignore")

from tqdm.auto import tqdm

# For visualization
from sklearn.metrics import roc_curve, auc

from joblib import dump, load

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score
np.random.seed(42)

In [75]:
# function to project the trace
def project_nth(log, index):
    print(str(project_traces(log)[index]))

In [76]:
# trace length and saving path
t_length = 15
save_path_base = '../data/'


In [77]:
# read data in csv 
trace = pd.read_csv('../data/bpi_challenge_offer.csv', sep=',')
trace.head()

Unnamed: 0,Case ID,Activity,Resource,Start Timestamp,Complete Timestamp,Variant,Variant index,lifecycle:transition,EventOrigin,EventID,...,ApplicationType,RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID
0,Application_652823628,A_Create Application,User_1,2016-01-01 10:51:15.304,2016-01-01 10:51:15.304,Variant 16,16,complete,Application,Application_652823628,...,New credit,20000.0,,,,,,,,
1,Application_652823628,A_Submitted,User_1,2016-01-01 10:51:15.352,2016-01-01 10:51:15.352,Variant 16,16,complete,Application,ApplState_1582051990,...,New credit,20000.0,,,,,,,,
2,Application_652823628,A_Concept,User_1,2016-01-01 10:52:36.413,2016-01-01 10:52:36.413,Variant 16,16,complete,Application,ApplState_642383566,...,New credit,20000.0,,,,,,,,
3,Application_652823628,A_Accepted,User_52,2016-01-02 12:23:04.299,2016-01-02 12:23:04.299,Variant 16,16,complete,Application,ApplState_99568828,...,New credit,20000.0,,,,,,,,
4,Application_652823628,O_Create Offer,User_52,2016-01-02 12:29:03.994,2016-01-02 12:29:03.994,Variant 16,16,complete,Offer,Offer_148581083,...,New credit,20000.0,20000.0,44.0,True,498.29,True,979.0,20000.0,Offer_148581083


In [78]:
# adding time features month and year
trace['startTime'] = pd.to_datetime(trace['Start Timestamp'])
trace['completeTime'] = pd.to_datetime(trace['Complete Timestamp'])
trace['year'] = trace['startTime'].dt.year
trace['month'] = trace['startTime'].dt.month

# converting dataframe to event log
trace_log = pm4py.format_dataframe(trace, case_id='Case ID', activity_key='Activity', timestamp_key='Complete Timestamp', start_timestamp_key='Start Timestamp')
trace_log = pm4py.convert_to_event_log(trace_log)

# add other time features
trace_log = interval_lifecycle.assign_lead_cycle_time(trace_log)

In [79]:
# extract traces only till the A_Pending or O_Accepted
prefix_traces = []
for trace in trace_log:
    trace_end_flag = False
    for i,event in enumerate(trace):
        if "A_Pending" in event['concept:name']:
            trace_end_flag = True
            i+=1
            break
        if "O_Accepted" in event['concept:name']:
            trace_end_flag = True
    if trace_end_flag:
        prefix_traces.append(Trace(trace[:i], attributes = trace.attributes))
prefix_traces = EventLog(prefix_traces)

In [80]:
# convert extracted traces to dataframe
trace = pm4py.convert_to_dataframe(prefix_traces)
original_df = trace.copy()
original_df.head()


Unnamed: 0,concept:name,Resource,Start Timestamp,time:timestamp,Variant,Variant index,lifecycle:transition,EventOrigin,EventID,Action,...,year,month,@@index,start_timestamp,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,case:concept:name
0,A_Create Application,User_1,2016-06-02 12:14:26.844000+00:00,2016-06-02 12:14:26.844000+00:00,Variant 5,5,complete,Application,Application_1000158214,Created,...,2016,6,51192,2016-06-02 12:14:26.844000+00:00,0,0,0,0,1.0,Application_1000158214
1,A_Submitted,User_1,2016-06-02 12:14:26.885000+00:00,2016-06-02 12:14:26.885000+00:00,Variant 5,5,complete,Application,ApplState_277536765,statechange,...,2016,6,51193,2016-06-02 12:14:26.885000+00:00,0,0,0,0,1.0,Application_1000158214
2,A_Concept,User_1,2016-06-02 12:15:36.773000+00:00,2016-06-02 12:15:36.773000+00:00,Variant 5,5,complete,Application,ApplState_1547990892,statechange,...,2016,6,51194,2016-06-02 12:15:36.773000+00:00,0,60,60,60,0.0,Application_1000158214
3,A_Accepted,User_32,2016-06-06 10:10:07.694000+00:00,2016-06-06 10:10:07.694000+00:00,Variant 5,5,complete,Application,ApplState_226397461,statechange,...,2016,6,51196,2016-06-06 10:10:07.694000+00:00,0,64500,64500,64440,0.0,Application_1000158214
4,O_Create Offer,User_32,2016-06-06 10:12:02.402000+00:00,2016-06-06 10:12:02.402000+00:00,Variant 5,5,complete,Offer,Offer_927137349,Created,...,2016,6,51197,2016-06-06 10:12:02.402000+00:00,0,64560,64560,60,0.0,Application_1000158214


In [81]:
# get completion time and sort
completion_time_ls = list(trace.groupby(['case:concept:name'])['Start Timestamp'].max())
completion_time_ls = sorted(completion_time_ls)
# split on 70% max time 
train_split_portion = 0.70
val_split_seperation = 0.85
total_data = len(completion_time_ls)
train_len = int(train_split_portion*total_data)
val_len = int(val_split_seperation*total_data)
last_train_completion_time = completion_time_ls[train_len]
last_val_completion_time = completion_time_ls[val_len]
val_start_time = last_train_completion_time
last_train_completion_time,last_val_completion_time

(Timestamp('2016-10-07 10:03:45.525000+0000', tz='UTC'),
 Timestamp('2016-11-22 08:17:31.918000+0000', tz='UTC'))

In [82]:
# take all traces where start dates are after the last_train_completion_time
dtype_list = list(trace.dtypes) # get original types of the columns
train_df = pd.DataFrame(columns = trace.columns)
test_df = pd.DataFrame(columns = trace.columns)
val_df = pd.DataFrame(columns = trace.columns)
train_count,test_count,val_count = 0,0,0
intersecting_traces = []
for name, group in trace.groupby(['case:concept:name'],as_index=False):
    if group['Start Timestamp'].iloc[-1] <= last_train_completion_time:
        train_df = train_df.append(group)
        train_count+=1
    elif group['Start Timestamp'].iloc[-1] <= last_val_completion_time:
        val_df = val_df.append(group)
        val_count+=1        
    else:
        test_df = test_df.append(group)
        test_count+=1


# converting train and test to their original data types
for i,col in enumerate(train_df.columns):
    train_df[col] = train_df[col].astype(dtype_list[i])

for i,col in enumerate(test_df.columns):
    test_df[col] = test_df[col].astype(dtype_list[i])

for i,col in enumerate(val_df.columns):
    val_df[col] = val_df[col].astype(dtype_list[i])


print("train, val and test count")
print(train_count,val_count,test_count)

train, val and test count
3194 684 684


In [83]:
# permits is just for the variable name, as below code is using permit as dataframe name
df_type = 'train'
permits = train_df

In [84]:
# converting dataframe to event log
trace_log = pm4py.format_dataframe(permits, case_id='case:concept:name', activity_key='concept:name', timestamp_key='Start Timestamp', start_timestamp_key='Start Timestamp')
trace_log = pm4py.convert_to_event_log(trace_log)

In [85]:
# to extract target varaible, 
# if event starts with the name A_Pending it is considered as accepted
declerations = []
for trace in trace_log:
    flag = False
    for i,event in enumerate(trace):
        if "A_Pending" in event['concept:name']:
            flag = True
            break
    
    if flag:
        declerations.append(1)
    else:
        declerations.append(0)

In [86]:
# extract traces only till A_Pending
prefix_traces = []
for trace in trace_log:
    for i,event in enumerate(trace):
        if "A_Pending" in event['concept:name']:
            break
    prefix_traces.append(Trace(trace[:i], attributes = trace.attributes))
prefix_traces = EventLog(prefix_traces)

In [87]:
# generate prefixes, note that we need to add the casts to EventLog and Trace to make sure that the result is a PM4Py EventLog object
trace_prefixes = EventLog([Trace(trace[0:t_length], attributes = trace.attributes) for trace in prefix_traces])

In [88]:
# check the trace length
print([len(trace) for trace in prefix_traces][0:15])
print([len(trace) for trace in trace_prefixes][0:15])

[12, 11, 12, 11, 13, 11, 11, 11, 11, 11, 10, 13, 11, 13, 11]
[12, 11, 12, 11, 13, 11, 11, 11, 11, 11, 10, 13, 11, 13, 11]


In [89]:
# convert logs to dataframe
# final base dataframe
df = pm4py.convert_to_dataframe(trace_prefixes)
df.head(5)

Unnamed: 0,concept:name,Resource,time:timestamp,Variant,Variant index,lifecycle:transition,EventOrigin,EventID,Action,LoanGoal,...,year,month,@@index,start_timestamp,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,case:concept:name
0,A_Create Application,User_1,2016-06-02 12:14:26.844000+00:00,Variant 5,5,complete,Application,Application_1000158214,Created,Home improvement,...,2016,6,0,2016-06-02 12:14:26.844000+00:00,0,0,0,0,1.0,Application_1000158214
1,A_Submitted,User_1,2016-06-02 12:14:26.885000+00:00,Variant 5,5,complete,Application,ApplState_277536765,statechange,Home improvement,...,2016,6,1,2016-06-02 12:14:26.885000+00:00,0,0,0,0,1.0,Application_1000158214
2,A_Concept,User_1,2016-06-02 12:15:36.773000+00:00,Variant 5,5,complete,Application,ApplState_1547990892,statechange,Home improvement,...,2016,6,2,2016-06-02 12:15:36.773000+00:00,0,60,60,60,0.0,Application_1000158214
3,W_Complete application,User_32,2016-06-06 10:02:16.549000+00:00,Variant 5,5,complete,Workflow,Workitem_518835990,Deleted,Home improvement,...,2016,6,7,2016-06-06 10:02:16.549000+00:00,0,64800,64800,0,0.0,Application_1000158214
4,A_Accepted,User_32,2016-06-06 10:10:07.694000+00:00,Variant 5,5,complete,Application,ApplState_226397461,statechange,Home improvement,...,2016,6,3,2016-06-06 10:10:07.694000+00:00,0,64500,64500,64440,0.0,Application_1000158214


In [90]:
# Create dataframe for Target variable and combine with above trace dataframe
cases = list(df.groupby(['case:concept:name'],sort=False)['case:concept:name'].first().keys())
df_dict = dict()
df_dict['case:concept:name'] = cases
df_dict['acceptedOffer'] = declerations
temp_df = pd.DataFrame(df_dict)

# merge target variable with permits
df = df.merge(temp_df,on=['case:concept:name'])
df.head()

Unnamed: 0,concept:name,Resource,time:timestamp,Variant,Variant index,lifecycle:transition,EventOrigin,EventID,Action,LoanGoal,...,month,@@index,start_timestamp,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,case:concept:name,acceptedOffer
0,A_Create Application,User_1,2016-06-02 12:14:26.844000+00:00,Variant 5,5,complete,Application,Application_1000158214,Created,Home improvement,...,6,0,2016-06-02 12:14:26.844000+00:00,0,0,0,0,1.0,Application_1000158214,1
1,A_Submitted,User_1,2016-06-02 12:14:26.885000+00:00,Variant 5,5,complete,Application,ApplState_277536765,statechange,Home improvement,...,6,1,2016-06-02 12:14:26.885000+00:00,0,0,0,0,1.0,Application_1000158214,1
2,A_Concept,User_1,2016-06-02 12:15:36.773000+00:00,Variant 5,5,complete,Application,ApplState_1547990892,statechange,Home improvement,...,6,2,2016-06-02 12:15:36.773000+00:00,0,60,60,60,0.0,Application_1000158214,1
3,W_Complete application,User_32,2016-06-06 10:02:16.549000+00:00,Variant 5,5,complete,Workflow,Workitem_518835990,Deleted,Home improvement,...,6,7,2016-06-06 10:02:16.549000+00:00,0,64800,64800,0,0.0,Application_1000158214,1
4,A_Accepted,User_32,2016-06-06 10:10:07.694000+00:00,Variant 5,5,complete,Application,ApplState_226397461,statechange,Home improvement,...,6,3,2016-06-06 10:10:07.694000+00:00,0,64500,64500,64440,0.0,Application_1000158214,1


In [91]:
save_path = save_path_base + df_type +'_trace_len_'+str(t_length)+ '.csv'
df.to_csv(save_path,index=False)

In [92]:
# passed features we want to extract

# str_ev_attr	String attributes at the event level: these are hot-encoded into features that may assume value 0 or value 1.
# str_tr_attr	String attributes at the trace level: these are hot-encoded into features that may assume value 0 or value 1.
# num_ev_attr	Numeric attributes at the event level: these are encoded by including the last value of the attribute among the events of the trace.
# num_tr_attr	Numeric attributes at trace level: these are encoded by including the numerical value.
# str_evsucc_attr	Successions related to the string attributes values at the event level: for example, if we have a trace [A,B,C], it might be important to include not only the presence of the single values A, B and C as features; but also the presence of the directly-follows couples (A,B) and (B,C).
# ================================================

str_ev_attr = ['concept:name']
str_tr_attr = ['LoanGoal','month', 'Accepted','Selected']
num_ev_attr = ['@@approx_bh_partial_lead_time','@@approx_bh_this_wasted_time']
num_tr_attr = ['RequestedAmount','FirstWithdrawalAmount','NumberOfTerms','MonthlyCost','CreditScore','OfferedAmount']

In [93]:
# function to save the data
def save_data(X,y,cat_feature_names, num_feature_names, save_path):
    data_dict = {}
    data_dict['X'] = X
    data_dict['y'] = y
    data_dict['cat_feature_names'] = cat_feature_names
    data_dict['num_feature_names'] = num_feature_names

    # save pickle
    with open(save_path, 'wb') as handle:
        pickle.dump(data_dict, handle)

In [94]:
# load the data from pickle
def load_data(load_path):
    with open(load_path, 'rb') as handle:
        data = pickle.load(handle)
    return data

In [95]:
# function to get the one hot encoded vectors of values
def hotEncode(vars, df):
    hotEncodeList = {}
    for var in vars:
        var_dict = {}
        var_data = sorted(df[var].unique())
        var_len = len(var_data)
        for i,cat in enumerate(var_data):
            var_dict[cat] = [0]*var_len
            var_dict[cat][i] = 1

        hotEncodeList[var] = var_dict

    return hotEncodeList

In [96]:
# padding function for ohe encoding
def cat_padding(vec, t_length, attr_length):
    desired_length = t_length*attr_length
    vec_length = len(vec)
    if vec_length != desired_length:
        pad_vec = [0]*(desired_length-vec_length)
        vec.extend(pad_vec)
    return vec

In [97]:
# padding function for non-ohe encoding
def num_padding(vec, t_length):
    vec_length = len(vec)
    if vec_length != t_length:
        pad_vec = [0]*(t_length-vec_length)
        vec.extend(pad_vec)
    return vec

In [98]:
# create one hot encoding dict fot categorical variables
# variables which we want to be one hot encoded
categorical_vars = str_ev_attr + str_tr_attr
hotEncodeListCat = hotEncode(categorical_vars, original_df)
categorical_vars

['concept:name', 'LoanGoal', 'month', 'Accepted', 'Selected']

In [99]:
# create one hot encoding dict fot numerical variables
# variables which we want to be one hot encoded
numerical_vars = num_ev_attr + num_tr_attr
hotEncodeListNum = hotEncode(numerical_vars, original_df)
numerical_vars

['@@approx_bh_partial_lead_time',
 '@@approx_bh_this_wasted_time',
 'RequestedAmount',
 'FirstWithdrawalAmount',
 'NumberOfTerms',
 'MonthlyCost',
 'CreditScore',
 'OfferedAmount']

In [100]:
# here for each trace we extract ohe vector for activity and sum them up 
# because this encoding only provides count of how many times the activity appears

data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes
    for cat_atr in str_ev_attr[:1]:
        attr_length = len(list(hotEncodeListCat[cat_atr].values())[0])
        str_ev_vec = np.array([0]*attr_length)

        for ca in group[cat_atr]:
            str_ev_vec  = str_ev_vec + np.array(hotEncodeListCat[cat_atr][ca])

        feature_vec.extend(list(str_ev_vec))

    for num_atr in num_ev_attr:
        num_ev_vec = []
        num_ev_vec.extend(list(group[num_atr]))

        # padding
        num_ev_vec = num_padding(num_ev_vec, t_length)
        feature_vec.extend(num_ev_vec)
        
    data.append(feature_vec)


In [101]:
# save results
encode_name = 'frequency_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, hotEncodeListCat, hotEncodeListNum, save_path)

In [102]:
df_type = 'test'
permits = test_df
test_df.head()

Unnamed: 0,concept:name,Resource,Start Timestamp,time:timestamp,Variant,Variant index,lifecycle:transition,EventOrigin,EventID,Action,...,year,month,@@index,start_timestamp,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,case:concept:name
164,A_Create Application,User_67,2016-11-23 13:48:23.719000+00:00,2016-11-23 13:48:23.719000+00:00,Variant 6,6,complete,Application,Application_1004853526,Created,...,2016,11,133035,2016-11-23 13:48:23.719000+00:00,0,0,0,0,1.0,Application_1004853526
165,A_Concept,User_67,2016-11-23 13:48:23.734000+00:00,2016-11-23 13:48:23.734000+00:00,Variant 6,6,complete,Application,ApplState_587861900,statechange,...,2016,11,133037,2016-11-23 13:48:23.734000+00:00,0,0,0,0,1.0,Application_1004853526
166,A_Accepted,User_67,2016-11-23 13:51:28.368000+00:00,2016-11-23 13:51:28.368000+00:00,Variant 6,6,complete,Application,ApplState_631896455,statechange,...,2016,11,133038,2016-11-23 13:51:28.368000+00:00,0,180,180,180,0.0,Application_1004853526
167,O_Create Offer,User_67,2016-11-23 13:54:00.312000+00:00,2016-11-23 13:54:00.312000+00:00,Variant 6,6,complete,Offer,Offer_196913104,Created,...,2016,11,133039,2016-11-23 13:54:00.312000+00:00,0,300,300,120,0.0,Application_1004853526
168,O_Created,User_67,2016-11-23 13:54:00.822000+00:00,2016-11-23 13:54:00.822000+00:00,Variant 6,6,complete,Offer,OfferState_2017504635,statechange,...,2016,11,133040,2016-11-23 13:54:00.822000+00:00,0,300,300,0,0.0,Application_1004853526


In [103]:
# converting dataframe to event log
trace_log = pm4py.format_dataframe(permits, case_id='case:concept:name', activity_key='concept:name', timestamp_key='completeTime', start_timestamp_key='startTime')
trace_log = pm4py.convert_to_event_log(trace_log)

In [104]:
# to extract target varaible, 
# if event starts with the name A_Pending
acceptedOffer = []
for trace in trace_log:
    flag = False
    for i,event in enumerate(trace):
        if "A_Pending" in event['concept:name']:
            flag = True
            break
    
    if flag:
        acceptedOffer.append(1)
    else:
        acceptedOffer.append(0)

In [105]:
# extract traces only till A_Pending
prefix_traces = []
for trace in trace_log:
    for i,event in enumerate(trace):
        if "A_Pending" in event['concept:name']:
            break
    prefix_traces.append(Trace(trace[:i], attributes = trace.attributes))
prefix_traces = EventLog(prefix_traces)

In [106]:
# generate prefixes, note that we need to add the casts to EventLog and Trace to make sure that the result is a PM4Py EventLog object
trace_prefixes = EventLog([Trace(trace[0:t_length], attributes = trace.attributes) for trace in prefix_traces])
# check the trace length
print([len(trace) for trace in prefix_traces][0:15])
print([len(trace) for trace in trace_prefixes][0:15])

[11, 12, 11, 11, 14, 11, 11, 13, 11, 13, 12, 11, 12, 13, 12]
[11, 12, 11, 11, 14, 11, 11, 13, 11, 13, 12, 11, 12, 13, 12]


In [107]:
# convert logs to dataframe
# final base dataframe
df = pm4py.convert_to_dataframe(trace_prefixes)
df.head()

Unnamed: 0,concept:name,Resource,Start Timestamp,Variant,Variant index,lifecycle:transition,EventOrigin,EventID,Action,LoanGoal,...,year,month,@@index,start_timestamp,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,case:concept:name
0,A_Create Application,User_67,2016-11-23 13:48:23.719000+00:00,Variant 6,6,complete,Application,Application_1004853526,Created,Home improvement,...,2016,11,164,2016-11-23 13:48:23.719,0,0,0,0,1.0,Application_1004853526
1,A_Concept,User_67,2016-11-23 13:48:23.734000+00:00,Variant 6,6,complete,Application,ApplState_587861900,statechange,Home improvement,...,2016,11,165,2016-11-23 13:48:23.734,0,0,0,0,1.0,Application_1004853526
2,A_Accepted,User_67,2016-11-23 13:51:28.368000+00:00,Variant 6,6,complete,Application,ApplState_631896455,statechange,Home improvement,...,2016,11,166,2016-11-23 13:51:28.368,0,180,180,180,0.0,Application_1004853526
3,O_Create Offer,User_67,2016-11-23 13:54:00.312000+00:00,Variant 6,6,complete,Offer,Offer_196913104,Created,Home improvement,...,2016,11,167,2016-11-23 13:54:00.312,0,300,300,120,0.0,Application_1004853526
4,O_Created,User_67,2016-11-23 13:54:00.822000+00:00,Variant 6,6,complete,Offer,OfferState_2017504635,statechange,Home improvement,...,2016,11,168,2016-11-23 13:54:00.822,0,300,300,0,0.0,Application_1004853526


In [108]:
# Create dataframe for Target variable and combine with above trace dataframe
cases = list(df.groupby(['case:concept:name'],sort=False)['case:concept:name'].first().keys())
df_dict = dict()
df_dict['case:concept:name'] = cases
df_dict['acceptedOffer'] = acceptedOffer
temp_df = pd.DataFrame(df_dict)

# merge target variable with permits
df = df.merge(temp_df,on=['case:concept:name'])
df.head()

Unnamed: 0,concept:name,Resource,Start Timestamp,Variant,Variant index,lifecycle:transition,EventOrigin,EventID,Action,LoanGoal,...,month,@@index,start_timestamp,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,case:concept:name,acceptedOffer
0,A_Create Application,User_67,2016-11-23 13:48:23.719000+00:00,Variant 6,6,complete,Application,Application_1004853526,Created,Home improvement,...,11,164,2016-11-23 13:48:23.719,0,0,0,0,1.0,Application_1004853526,1
1,A_Concept,User_67,2016-11-23 13:48:23.734000+00:00,Variant 6,6,complete,Application,ApplState_587861900,statechange,Home improvement,...,11,165,2016-11-23 13:48:23.734,0,0,0,0,1.0,Application_1004853526,1
2,A_Accepted,User_67,2016-11-23 13:51:28.368000+00:00,Variant 6,6,complete,Application,ApplState_631896455,statechange,Home improvement,...,11,166,2016-11-23 13:51:28.368,0,180,180,180,0.0,Application_1004853526,1
3,O_Create Offer,User_67,2016-11-23 13:54:00.312000+00:00,Variant 6,6,complete,Offer,Offer_196913104,Created,Home improvement,...,11,167,2016-11-23 13:54:00.312,0,300,300,120,0.0,Application_1004853526,1
4,O_Created,User_67,2016-11-23 13:54:00.822000+00:00,Variant 6,6,complete,Offer,OfferState_2017504635,statechange,Home improvement,...,11,168,2016-11-23 13:54:00.822,0,300,300,0,0.0,Application_1004853526,1


In [109]:
save_path = save_path_base + df_type +'_trace_len_'+str(t_length)+ '.csv'
df.to_csv(save_path,index=False)

In [110]:
# passed features we want to extract

# str_ev_attr	String attributes at the event level: these are hot-encoded into features that may assume value 0 or value 1.
# str_tr_attr	String attributes at the trace level: these are hot-encoded into features that may assume value 0 or value 1.
# num_ev_attr	Numeric attributes at the event level: these are encoded by including the last value of the attribute among the events of the trace.
# num_tr_attr	Numeric attributes at trace level: these are encoded by including the numerical value.
# str_evsucc_attr	Successions related to the string attributes values at the event level: for example, if we have a trace [A,B,C], it might be important to include not only the presence of the single values A, B and C as features; but also the presence of the directly-follows couples (A,B) and (B,C).
# ================================================

str_ev_attr = ['concept:name']
str_tr_attr = ['LoanGoal','month', 'Accepted','Selected']
num_ev_attr = ['@@approx_bh_partial_lead_time','@@approx_bh_this_wasted_time']
num_tr_attr = ['RequestedAmount','FirstWithdrawalAmount','NumberOfTerms','MonthlyCost','CreditScore','OfferedAmount']

In [111]:
# create one hot encoding dict fot categorical variables
# variables which we want to be one hot encoded
categorical_vars = str_ev_attr + str_tr_attr
hotEncodeListCat = hotEncode(categorical_vars, original_df)
categorical_vars

['concept:name', 'LoanGoal', 'month', 'Accepted', 'Selected']

In [112]:
# create one hot encoding dict fot numerical variables
# variables which we want to be one hot encoded
numerical_vars = num_ev_attr + num_tr_attr
hotEncodeListNum = hotEncode(numerical_vars, original_df)
numerical_vars

['@@approx_bh_partial_lead_time',
 '@@approx_bh_this_wasted_time',
 'RequestedAmount',
 'FirstWithdrawalAmount',
 'NumberOfTerms',
 'MonthlyCost',
 'CreditScore',
 'OfferedAmount']

In [113]:
encode_name = 'frequency_encode_'
save_path = save_path_base + encode_name + 'train' +'_trace_len_'+str(t_length)+ '.pickle'
loaded_data = load_data(save_path)
cat_feature_names = loaded_data['cat_feature_names']
num_feature_names = loaded_data['num_feature_names']

In [114]:
# here for each trace we extract ohe vector for activity and sum them up 
# because this encoding only provides count of how many times the activity appears

data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes
    for cat_atr in str_ev_attr[:1]:
        attr_length = len(list(cat_feature_names[cat_atr].values())[0])
        str_ev_vec = np.array([0]*attr_length)

        for ca in group[cat_atr]:
            str_ev_vec  = str_ev_vec + np.array(cat_feature_names[cat_atr][ca])

        feature_vec.extend(list(str_ev_vec))

    for num_atr in num_ev_attr:
        num_ev_vec = []
        num_ev_vec.extend(list(group[num_atr]))

        # padding
        num_ev_vec = num_padding(num_ev_vec, t_length)
        feature_vec.extend(num_ev_vec)
        
    data.append(feature_vec)


In [115]:
# save results
encode_name = 'frequency_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations,  hotEncodeListCat, hotEncodeListNum ,save_path)

In [116]:
# function to load the data from pickle
def load_data(load_path):
    with open(load_path, 'rb') as handle:
        data = pickle.load(handle)
    return data

In [117]:
data_test = load_data("../data/frequency_encode_test_trace_len_15.pickle")
data_train = load_data("../data/frequency_encode_train_trace_len_15.pickle")
X_train = np.array(data_train['X'])
X_test = np.array(data_test['X'])
y_train = np.array(data_train['y'])
y_test = np.array(data_test['y'])
results_df = pd.DataFrame(index=['Decision Tree (Default)'],
                          columns=['F-score', 'Precision','Recall', 'Accuracy'])

In [118]:
# Create Decision Tree classifier object
dt = DecisionTreeClassifier()

# Train Decision Tree Classifier
dt_fit = dt.fit(X_train,y_train)

#Predict the response for test dataset
dt_predict = dt_fit.predict(X_test)

results_df.loc['Decision Tree (Default)',:] = [f1_score(y_test,dt_predict, average='macro'), precision_score(y_test,dt_predict), recall_score(y_test,dt_predict),accuracy_score(y_test,dt_predict)]

results_df.sort_values(by='F-score', ascending=False)

ValueError: Found input variables with inconsistent numbers of samples: [3194, 684]

In [None]:
tree.plot_tree(dt_fit)