In [219]:
import json

import numpy as np
import pandas as pd

import pm4py
# https://pm4py.fit.fraunhofer.de/documentation
from pm4py.algo.transformation.log_to_features import algorithm as log_to_features
from pm4py.objects.log.obj import EventLog, Trace
from pm4py.objects.log.util.log import project_traces

In [220]:
# helper to inspect a single trace as a plain sequence of activities
def project_nth(log, index):
    """Print the projection of the trace at position ``index`` in ``log``.

    Args:
        log: Event log handed to ``project_traces``.
        index: Position of the trace to print.

    Returns:
        None. The projected trace is written to stdout.
    """
    projected_traces = project_traces(log)
    selected_trace = projected_traces[index]
    print(f"{selected_trace}")

In [259]:
# load the filtered travel-permit event data from CSV
permits_csv_path = '../data/Travel Permits (filtered).csv'
trace = pd.read_csv(permits_csv_path)

# Alternative (disabled): read the XES export and turn it into a pm4py event log
# data_path = '../data/Travel Permits Filtered.xes'
# trace_log = pm4py.read_xes(data_path)
# trace_log = pm4py.format_dataframe(trace_log, case_id='case:id', activity_key='concept:name', timestamp_key='time:timestamp')
# trace_log = pm4py.convert_to_event_log(trace_log)
# trace_log

  trace = pd.read_csv('../data/Travel Permits (filtered).csv')


## Preparing data for the task: Declaration REJECTED vs. accepted by Administrator/Employee

- Extract only the traces that contain the activity "Declaration SUBMITTED by EMPLOYEE" (traces in which no accept/reject decision is made are not considered; to be discussed)
- Create the target variable: whether the declaration is rejected or not
- Keep the part of each trace from "Permit SUBMITTED by EMPLOYEE" up to "Declaration SUBMITTED by EMPLOYEE" (to be discussed)
- Filter traces of length n (10, 15 or 20; to be decided)
- Choose an encoding and decide which features to include, i.e. those we believe are available before the activity "Declaration SUBMITTED by EMPLOYEE"


### Filterings

1. Filtering on timeframe (from this date to that)
2. Filter on case performance (traces finished within 10 days)
3. Filter on start and end activities (give list of start and end)
4. Filter on variants (keeping only frequent trace flows like - [a,b,c,d] and [a d b c], or 0.4 threshold etc)
5. Filter on attributes values (selection and projection of traces)
6. Filter on numeric attribute values (from declared amount 500 to 1000)
7. Between filter (keep the activities between two given activities, e.g. from permit submitted to permit accepted)
8. Case size (number of activities in a case)


### Statistics

1. Throughput Time (time to complete traces)
2. Case Arrival/Dispersion Ratio (arrival time between two traces (not events))
3. Performance Spectrum (time between activities)
4. Cycle Time and Waiting Time (cycle time - time between activities; lead time - the overall, cumulative time in which the instance was worked on, from start to end)
5. Sojourn Time - time taken for an activity to complete, measured from the end of the previous activity
6. other stats if needed



### feature selection

1. manual feature selection (one hot encoding, numerical features etc.)
2. Calculating useful features (times between activities, bigrams etc)

# Feature Engineering

In [312]:
# check all the columns
trace.columns

Index(['case', 'event', 'startTime', 'completeTime', 'OrganizationalEntity',
       'TotalDeclared', 'Overspent', 'RequestedBudget', 'OverspentAmount',
       'org:resource'],
      dtype='object')

In [313]:
# Select the columns used downstream and parse both timestamp columns.
imp_cols = ['case', 'event', 'startTime', 'completeTime', 'OrganizationalEntity',
       'TotalDeclared', 'Overspent',
       'RequestedBudget', 'OverspentAmount', 'org:resource']

# Take an explicit copy: assigning columns on a bare selection of `trace`
# triggers pandas' SettingWithCopyWarning and leaves it ambiguous whether the
# source frame is modified as well.
permits = trace[imp_cols].copy()
for time_col in ('startTime', 'completeTime'):
    permits[time_col] = pd.to_datetime(permits[time_col])
permits.head()

Unnamed: 0,case,event,startTime,completeTime,OrganizationalEntity,TotalDeclared,Overspent,RequestedBudget,OverspentAmount,org:resource
0,travel permit 76455,Start trip,2016-10-05 00:00:00,2016-10-05 00:00:00,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER
1,travel permit 76455,End trip,2016-10-05 00:00:00,2016-10-05 00:00:00,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER
2,travel permit 76455,Permit SUBMITTED by EMPLOYEE,2017-04-06 13:32:10,2017-04-06 13:32:10,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER
3,travel permit 76455,Permit FINAL_APPROVED by SUPERVISOR,2017-04-06 13:32:28,2017-04-06 13:32:28,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER
4,travel permit 76455,Declaration SUBMITTED by EMPLOYEE,2017-04-07 13:38:14,2017-04-07 13:38:14,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER


In [314]:
# derive calendar year and month from each event's start timestamp
event_start = permits['startTime']
permits['year'] = event_start.dt.year
permits['month'] = event_start.dt.month

In [315]:
# duration of each activity (complete - start), expressed in seconds
activity_duration = permits['completeTime'] - permits['startTime']
permits['act_completionTime'] = activity_duration.dt.total_seconds()

In [316]:
# earliest startTime per case, i.e. the moment each trace started
temp_df = (
    permits.groupby(['case'])['startTime']
    .min()
    .rename('startTime_min')
    .reset_index()[['startTime_min', 'case']]
)

# attach the per-case value to every event row of that case
permits = permits.merge(temp_df, on=['case'])

In [317]:
# latest completeTime per case, i.e. the moment each trace ended
temp_df = (
    permits.groupby(['case'])['completeTime']
    .max()
    .rename('completeTime_max')
    .reset_index()[['completeTime_max', 'case']]
)

# attach the per-case value to every event row of that case
permits = permits.merge(temp_df, on=['case'])

In [318]:
# total duration of each trace (last completion - first start), in seconds
trace_duration = permits['completeTime_max'] - permits['startTime_min']
permits['processCompletionTime'] = trace_duration.dt.total_seconds()

In [323]:
# Time elapsed since the previous activity of the same case, in seconds.
# The column keeps its existing name `next_act_time_diff`; the value on a row is
# the gap between that row's startTime and the startTime of the preceding row
# of the same case.
#
# This is computed with an index-aligned groupby instead of a Python loop over
# `iloc`, so it no longer depends on the rows of a case being contiguous in
# `permits` and avoids the per-row lookup cost.
case_groups = permits.groupby('case', sort=False)
step_seconds = case_groups['startTime'].diff().dt.total_seconds()

# The first event of every case has no predecessor; its gap is defined as 0.
is_first_event = case_groups.cumcount() == 0
permits['next_act_time_diff'] = step_seconds.mask(is_first_event, 0.0)

In [324]:
permits.head()

Unnamed: 0,case,event,startTime,completeTime,OrganizationalEntity,TotalDeclared,Overspent,RequestedBudget,OverspentAmount,org:resource,year,month,act_completionTime,startTime_min,completeTime_max,processCompletionTime,next_act_time_diff
0,travel permit 76455,Start trip,2016-10-05 00:00:00,2016-10-05 00:00:00,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER,2016,10,0.0,2016-10-05,2017-04-13 17:30:53,16479053.0,0.0
1,travel permit 76455,End trip,2016-10-05 00:00:00,2016-10-05 00:00:00,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER,2016,10,0.0,2016-10-05,2017-04-13 17:30:53,16479053.0,0.0
2,travel permit 76455,Permit SUBMITTED by EMPLOYEE,2017-04-06 13:32:10,2017-04-06 13:32:10,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER,2017,4,0.0,2016-10-05,2017-04-13 17:30:53,16479053.0,15859930.0
3,travel permit 76455,Permit FINAL_APPROVED by SUPERVISOR,2017-04-06 13:32:28,2017-04-06 13:32:28,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER,2017,4,0.0,2016-10-05,2017-04-13 17:30:53,16479053.0,18.0
4,travel permit 76455,Declaration SUBMITTED by EMPLOYEE,2017-04-07 13:38:14,2017-04-07 13:38:14,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER,2017,4,0.0,2016-10-05,2017-04-13 17:30:53,16479053.0,86746.0


In [325]:
# tell pm4py which columns hold case id, activity and timestamps, then build the event log
permits_formatted = pm4py.format_dataframe(
    permits,
    case_id='case',
    activity_key='event',
    timestamp_key='completeTime',
    start_timestamp_key='startTime',
)
trace_log = pm4py.convert_to_event_log(permits_formatted)

  trace_log = pm4py.format_dataframe(permits, case_id='case', activity_key='event', timestamp_key='completeTime', start_timestamp_key='startTime')


In [327]:
# # printing activities in trace 
# for trace in trace_log[0]:
#     print(trace["concept:name"])

In [328]:
# Keep only the traces in which the employee submitted a declaration.
# A comprehension is used on purpose: its loop variable does not leak into the
# notebook namespace, so the `trace` DataFrame loaded from CSV is not
# overwritten by a pm4py Trace (which would break re-running earlier cells).
submitted_activity = "Declaration SUBMITTED by EMPLOYEE"
dec_sub_traces = EventLog([
    Trace(case_trace, attributes=case_trace.attributes)
    for case_trace in trace_log
    if any(event["concept:name"] == submitted_activity for event in case_trace)
])

In [329]:
# binary target per trace of `trace_log`: 1 if the trace contains the
# rejection activity, otherwise 0
rejected_activity = "Declaration REJECTED by EMPLOYEE"
declerations = [
    int(any(event["concept:name"] == rejected_activity for event in case_trace))
    for case_trace in trace_log
]
declerations

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,


In [330]:
# restrict each declaration trace to the span between permit submission and
# declaration submission
span_start_activity = "Permit SUBMITTED by EMPLOYEE"
span_end_activity = "Declaration SUBMITTED by EMPLOYEE"
between_log = pm4py.filter_between(dec_sub_traces, span_start_activity, span_end_activity)



In [331]:
project_nth(between_log, 232)

['Permit SUBMITTED by EMPLOYEE', 'Permit APPROVED by ADMINISTRATION', 'Permit APPROVED by BUDGET OWNER', 'Permit FINAL_APPROVED by SUPERVISOR', 'Start trip', 'End trip', 'Declaration SUBMITTED by EMPLOYEE']


In [332]:
# from pm4py.statistics.sojourn_time.log import get as soj_time_get
# soj_time = soj_time_get.apply(trace_log, parameters={soj_time_get.Parameters.TIMESTAMP_KEY: "time:timestamp", soj_time_get.Parameters.START_TIMESTAMP_KEY: "startTime"})
# print(soj_time)

In [333]:
# # to get all trace cases
# for i, trace in enumerate(trace_log):
#     print(trace[0]['case'])

In [334]:
# # throughput time (time to complete traces) all in seconds
# all_case_durations = pm4py.get_all_case_durations(trace_log)
# all_case_durations

In [335]:
# # arrival between cases
# case_arrival_ratio = pm4py.get_case_arrival_average(trace_log)
# case_arrival_ratio/(60*60)

In [336]:
# from pm4py.objects.log.util import interval_lifecycle
# enriched_log = interval_lifecycle.assign_lead_cycle_time(trace_log)
# enriched_log

In [337]:
# value counts of an event attribute can be fetched for any attribute key
activities, resources = (
    pm4py.get_event_attribute_values(trace_log, attribute_key)
    for attribute_key in ("concept:name", "org:resource")
)
resources

{'STAFF MEMBER': 29187, 'SYSTEM': 6714}

In [338]:
# remove the declaration-rejection events ("Declaration REJECTED by EMPLOYEE")
# train_raw = pm4py.filter_event_attribute_values(trace_log, "concept:name", "Declaration REJECTED by EMPLOYEE", level = "event", retain=False)

In [339]:
# Generate prefixes: keep only the first PREFIX_LENGTH events of every trace.
# The casts to EventLog and Trace make sure the result is a PM4Py EventLog object.
# NOTE(review): a prefix may already contain the rejection activity used as the
# target label — confirm whether that leakage is intended.
PREFIX_LENGTH = 10  # number of leading events kept per trace
trace_prefixes = EventLog([
    Trace(case_trace[:PREFIX_LENGTH], attributes=case_trace.attributes)
    for case_trace in trace_log
])

In [340]:
# compare the lengths of the first 15 traces before and after prefixing
for event_log in (trace_log, trace_prefixes):
    print([len(case_trace) for case_trace in event_log][0:15])

[10, 10, 10, 12, 10, 12, 10, 12, 10, 13, 10, 12, 10, 10, 10]
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]


## Data encoding

- preparing input data to be passed, 
- there are different encodings we can go with
    - boolean encoding (one hot encoding if the activity present or not)
    - frequency encoding (count of activity)
    - simple index encoding (n events one hot encoding)
    - latest payload encoding (with trace attributes)
    - index payload encoding (n events one hot encoding + with trace attributes)
    - complex index based encoding (static feature + nevents encoding + event features )
    - lstm encoding (m x n)

### Feature selection

In [341]:
df = pm4py.convert_to_dataframe(trace_prefixes)
df.head(5)

Unnamed: 0,case,event,startTime,completeTime,OrganizationalEntity,TotalDeclared,Overspent,RequestedBudget,OverspentAmount,org:resource,...,startTime_min,completeTime_max,processCompletionTime,next_act_time_diff,concept:name,time:timestamp,@@index,@@case_index,start_timestamp,case:concept:name
0,travel permit 10066,Permit SUBMITTED by EMPLOYEE,2018-02-21 16:28:17,2018-02-21 16:28:17,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,...,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,0.0,Permit SUBMITTED by EMPLOYEE,2018-02-21 16:28:17,0,0,2018-02-21 16:28:17,travel permit 10066
1,travel permit 10066,Permit APPROVED by ADMINISTRATION,2018-02-21 16:28:19,2018-02-21 16:28:19,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,...,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,2.0,Permit APPROVED by ADMINISTRATION,2018-02-21 16:28:19,1,0,2018-02-21 16:28:19,travel permit 10066
2,travel permit 10066,Start trip,2018-02-22 00:00:00,2018-02-22 00:00:00,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,...,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,27101.0,Start trip,2018-02-22 00:00:00,2,0,2018-02-22 00:00:00,travel permit 10066
3,travel permit 10066,End trip,2018-02-22 00:00:00,2018-02-22 00:00:00,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,...,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,0.0,End trip,2018-02-22 00:00:00,3,0,2018-02-22 00:00:00,travel permit 10066
4,travel permit 10066,Permit FINAL_APPROVED by SUPERVISOR,2018-02-22 08:27:05,2018-02-22 08:27:05,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,...,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,30425.0,Permit FINAL_APPROVED by SUPERVISOR,2018-02-22 08:27:05,4,0,2018-02-22 08:27:05,travel permit 10066


In [249]:
df.columns

Index(['org:role', 'org:resource', 'id', 'concept:name', 'time:timestamp',
       '@@index', '@@case_index', 'case:OverspentAmount', 'case:BudgetNumber',
       'case:RequestedBudget', 'case:id', 'case:DeclarationNumber_0',
       'case:travel permit number', 'case:Overspent', 'case:RequestedAmount_0',
       'case:TotalDeclared', 'case:ActivityNumber', 'case:dec_id_0',
       'case:TaskNumber', 'case:ProjectNumber', 'case:OrganizationalEntity',
       'case:Project_0', 'case:Task_0', 'case:Cost Type_0', 'case:Rfp_id_0',
       'case:OrganizationalEntity_0', 'case:RfpNumber_0', 'case:Activity_0',
       'case:RequestedAmount_1', 'case:DeclarationNumber_1', 'case:dec_id_1',
       'case:Project_1', 'case:Rfp_id_1', 'case:Task_1', 'case:Cost Type_1',
       'case:OrganizationalEntity_1', 'case:RfpNumber_1', 'case:Activity_1',
       'case:RequestedAmount_2', 'case:DeclarationNumber_2', 'case:dec_id_2',
       'case:DeclarationNumber_3', 'case:RequestedAmount_3', 'case:dec_id_3',
       '

### boolean encoding

In [236]:
from pm4py.algo.transformation.log_to_features import algorithm as log_to_features
# selecting features from log, making one hot encoding for activities
data, feature_names = log_to_features.apply(trace_prefixes, parameters={"str_ev_attr": ["concept:name"]})

In [237]:
data[0]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0]

In [238]:
len(feature_names)

27

In [239]:
np.shape(np.array(data))

(3137, 27)

In [240]:
project_nth(trace_prefixes, 0)

['Permit SUBMITTED by EMPLOYEE', 'Permit APPROVED by ADMINISTRATION', 'Permit FINAL_APPROVED by SUPERVISOR', 'Request For Payment SUBMITTED by EMPLOYEE', 'Request For Payment REJECTED by ADMINISTRATION', 'Request For Payment REJECTED by EMPLOYEE', 'Request For Payment SUBMITTED by EMPLOYEE', 'Request For Payment APPROVED by ADMINISTRATION', 'Request For Payment FINAL_APPROVED by SUPERVISOR', 'Request Payment']


In [241]:
# it will help to get back the one hot encoded vector
feature_names

['event:concept:name@Declaration APPROVED by ADMINISTRATION',
 'event:concept:name@Declaration APPROVED by BUDGET OWNER',
 'event:concept:name@Declaration APPROVED by PRE_APPROVER',
 'event:concept:name@Declaration FINAL_APPROVED by SUPERVISOR',
 'event:concept:name@Declaration REJECTED by ADMINISTRATION',
 'event:concept:name@Declaration REJECTED by EMPLOYEE',
 'event:concept:name@Declaration REJECTED by MISSING',
 'event:concept:name@Declaration REJECTED by PRE_APPROVER',
 'event:concept:name@Declaration REJECTED by SUPERVISOR',
 'event:concept:name@Declaration SUBMITTED by EMPLOYEE',
 'event:concept:name@End trip',
 'event:concept:name@Payment Handled',
 'event:concept:name@Permit APPROVED by ADMINISTRATION',
 'event:concept:name@Permit APPROVED by BUDGET OWNER',
 'event:concept:name@Permit APPROVED by PRE_APPROVER',
 'event:concept:name@Permit FINAL_APPROVED by SUPERVISOR',
 'event:concept:name@Permit SUBMITTED by EMPLOYEE',
 'event:concept:name@Request For Payment APPROVED by ADMI

In [242]:
# distinct feature vectors and how often each occurs
# (exploratory only: shows which activity combination is the most frequent)
unique_vectors, vector_counts = np.unique(data, axis=0, return_counts=True)
dist_features = (unique_vectors, vector_counts)
print(dist_features)

(array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
        0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
        1, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
        0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
        0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
        0, 1, 1, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
        0, 1, 1, 0, 1],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1],
       [0, 0, 0, 1,

In [243]:
# display the most frequent feature vector
unique_vectors, vector_counts = dist_features
most_freq_feature = unique_vectors[np.argmax(vector_counts)]
most_freq_feature

array([1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1])

In [244]:
# list the activities flagged in the most frequent vector
# (this is a set-style encoding, so the printed order carries no meaning)
for position, flag in enumerate(most_freq_feature):
    if flag == 0:
        continue
    print(feature_names[position].split("@")[1], end=' , ')

Declaration APPROVED by ADMINISTRATION , Declaration FINAL_APPROVED by SUPERVISOR , Declaration SUBMITTED by EMPLOYEE , End trip , Payment Handled , Permit APPROVED by ADMINISTRATION , Permit FINAL_APPROVED by SUPERVISOR , Permit SUBMITTED by EMPLOYEE , Request Payment , Start trip , 

In [245]:
# bundle features, labels and feature names for the boolean encoding
train_data = {
    'X': data,
    'y': declerations,
    'feature_names': feature_names,
}

In [246]:
# save the boolean-encoded training data as pretty-printed UTF-8 JSON
save_path = '../data/training_data/boolean_encode.json'
with open(save_path, 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=4)

# # load json (example for reading the file back)
# with open(save_path, 'r', encoding='utf-8') as f:
#     train_data = json.load(f)

### boolean bigram encoding

In [161]:
# switch every other feature family off and keep only the succession
# (bigram) features built on the activity name
bigram_parameters = {
    "str_ev_attr": [],
    "str_tr_attr": [],
    "num_ev_attr": [],
    "num_tr_attr": [],
    "str_evsucc_attr": ["concept:name"],
}
data_2gram, feature_names = log_to_features.apply(trace_prefixes, parameters=bigram_parameters)
feature_names

['succession:concept:name@Declaration APPROVED by ADMINISTRATION#Declaration APPROVED by ADMINISTRATION',
 'succession:concept:name@Declaration APPROVED by ADMINISTRATION#Declaration APPROVED by BUDGET OWNER',
 'succession:concept:name@Declaration APPROVED by ADMINISTRATION#Declaration APPROVED by SUPERVISOR',
 'succession:concept:name@Declaration APPROVED by ADMINISTRATION#Declaration FINAL_APPROVED by SUPERVISOR',
 'succession:concept:name@Declaration APPROVED by ADMINISTRATION#Declaration REJECTED by ADMINISTRATION',
 'succession:concept:name@Declaration APPROVED by ADMINISTRATION#Declaration REJECTED by BUDGET OWNER',
 'succession:concept:name@Declaration APPROVED by ADMINISTRATION#Declaration REJECTED by SUPERVISOR',
 'succession:concept:name@Declaration APPROVED by ADMINISTRATION#Declaration SUBMITTED by EMPLOYEE',
 'succession:concept:name@Declaration APPROVED by ADMINISTRATION#End trip',
 'succession:concept:name@Declaration APPROVED by ADMINISTRATION#Permit REJECTED by MISSING

In [162]:
# convert the bigram features to a numpy array for inspection;
# note that an ndarray is not JSON-serializable and needs .tolist() before json.dump
data_2gram = np.asarray(data_2gram)

In [163]:
project_nth(trace_prefixes, 0)

['Permit SUBMITTED by EMPLOYEE', 'Permit APPROVED by ADMINISTRATION', 'Permit APPROVED by BUDGET OWNER', 'Permit FINAL_APPROVED by SUPERVISOR', 'Start trip', 'End trip', 'Declaration SUBMITTED by EMPLOYEE', 'Declaration APPROVED by ADMINISTRATION', 'Declaration SUBMITTED by EMPLOYEE', 'Declaration APPROVED by ADMINISTRATION']


In [164]:
print(data_2gram[0])

[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [None]:
# Bundle features, labels and feature names for the bigram encoding.
# `data_2gram` is a numpy array at this point; json.dump cannot serialize
# ndarrays (TypeError), so the features are converted back to nested lists,
# as the frequency-encoding cell does with `trace_bag.tolist()`.
train_data = {
    'X': np.asarray(data_2gram).tolist(),
    'y': declerations,
    'feature_names': feature_names,
}

In [165]:
# write the bigram training data as pretty-printed UTF-8 JSON
save_path = '../data/training_data/bigram_boolean_encode.json'
with open(save_path, mode='w', encoding='utf-8') as out_file:
    json.dump(train_data, fp=out_file, ensure_ascii=False, indent=4)

### frequency encoding

In [166]:
df = pm4py.convert_to_dataframe(trace_prefixes)
df.head(10)



Unnamed: 0,case,event,startTime,completeTime,dec_id_5,dec_id_6,dec_id_3,dec_id_4,dec_id_1,dec_id_2,...,Task_6,org:resource,event_id,org:role,concept:name,time:timestamp,@@index,@@case_index,start_timestamp,case:concept:name
0,travel permit 10022,Permit SUBMITTED by EMPLOYEE,2018-02-20 13:51:27+00:00,2018-02-20 13:51:27+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,st_step 10031_0,EMPLOYEE,Permit SUBMITTED by EMPLOYEE,2018-02-20 13:51:27+00:00,0,0,2018-02-20 13:51:27+00:00,travel permit 10022
1,travel permit 10022,Permit APPROVED by ADMINISTRATION,2018-02-20 13:51:34+00:00,2018-02-20 13:51:34+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,st_step 10030_0,ADMINISTRATION,Permit APPROVED by ADMINISTRATION,2018-02-20 13:51:34+00:00,1,0,2018-02-20 13:51:34+00:00,travel permit 10022
2,travel permit 10022,Permit APPROVED by BUDGET OWNER,2018-02-20 16:27:33+00:00,2018-02-20 16:27:33+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,st_step 10029_0,BUDGET OWNER,Permit APPROVED by BUDGET OWNER,2018-02-20 16:27:33+00:00,2,0,2018-02-20 16:27:33+00:00,travel permit 10022
3,travel permit 10022,Permit FINAL_APPROVED by SUPERVISOR,2018-02-21 12:58:49+00:00,2018-02-21 12:58:49+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,st_step 10028_0,SUPERVISOR,Permit FINAL_APPROVED by SUPERVISOR,2018-02-21 12:58:49+00:00,3,0,2018-02-21 12:58:49+00:00,travel permit 10022
4,travel permit 10022,Start trip,2018-03-17 00:00:00+00:00,2018-03-17 00:00:00+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,rv_travel permit 10022_6,EMPLOYEE,Start trip,2018-03-17 00:00:00+00:00,4,0,2018-03-17 00:00:00+00:00,travel permit 10022
5,travel permit 10022,End trip,2018-03-22 00:00:00+00:00,2018-03-22 00:00:00+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,rv_travel permit 10022_7,EMPLOYEE,End trip,2018-03-22 00:00:00+00:00,5,0,2018-03-22 00:00:00+00:00,travel permit 10022
6,travel permit 10022,Declaration SUBMITTED by EMPLOYEE,2018-04-15 16:36:12+00:00,2018-04-15 16:36:12+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,st_step 10038_0,EMPLOYEE,Declaration SUBMITTED by EMPLOYEE,2018-04-15 16:36:12+00:00,6,0,2018-04-15 16:36:12+00:00,travel permit 10022
7,travel permit 10022,Declaration APPROVED by ADMINISTRATION,2018-04-16 21:05:21+00:00,2018-04-16 21:05:21+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,st_step 10037_0,ADMINISTRATION,Declaration APPROVED by ADMINISTRATION,2018-04-16 21:05:21+00:00,7,0,2018-04-16 21:05:21+00:00,travel permit 10022
8,travel permit 10022,Declaration SUBMITTED by EMPLOYEE,2018-04-16 21:08:02+00:00,2018-04-16 21:08:02+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,st_step 10034_0,EMPLOYEE,Declaration SUBMITTED by EMPLOYEE,2018-04-16 21:08:02+00:00,8,0,2018-04-16 21:08:02+00:00,travel permit 10022
9,travel permit 10022,Declaration APPROVED by ADMINISTRATION,2018-04-16 21:14:17+00:00,2018-04-16 21:14:17+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,st_step 10035_0,ADMINISTRATION,Declaration APPROVED by ADMINISTRATION,2018-04-16 21:14:17+00:00,9,0,2018-04-16 21:14:17+00:00,travel permit 10022


In [167]:
trace_case_act = df.loc[:,["case:concept:name", "concept:name"]]
trace_case_act

Unnamed: 0,case:concept:name,concept:name
0,travel permit 10022,Permit SUBMITTED by EMPLOYEE
1,travel permit 10022,Permit APPROVED by ADMINISTRATION
2,travel permit 10022,Permit APPROVED by BUDGET OWNER
3,travel permit 10022,Permit FINAL_APPROVED by SUPERVISOR
4,travel permit 10022,Start trip
...,...,...
65410,travel permit 9984,Permit APPROVED by ADMINISTRATION
65411,travel permit 9984,Permit FINAL_APPROVED by SUPERVISOR
65412,travel permit 9984,Request For Payment SUBMITTED by EMPLOYEE
65413,travel permit 9984,Request For Payment APPROVED by ADMINISTRATION


In [168]:
# Count how often each activity occurs within each case.
# sort=False asks groupby not to sort the group keys, with the aim of keeping
# the trace order stable.
# NOTE(review): later reshaping steps (e.g. unstack) may reorder rows — confirm
# that the final row order still matches the label order.
trace_act_count = trace_case_act.groupby(["case:concept:name", "concept:name"], sort=False).size()
trace_act_count

case:concept:name    concept:name                                    
travel permit 10022  Permit SUBMITTED by EMPLOYEE                        1
                     Permit APPROVED by ADMINISTRATION                   1
                     Permit APPROVED by BUDGET OWNER                     1
                     Permit FINAL_APPROVED by SUPERVISOR                 1
                     Start trip                                          1
                                                                        ..
travel permit 9984   Permit REJECTED by EMPLOYEE                         1
                     Permit FINAL_APPROVED by SUPERVISOR                 1
                     Request For Payment SUBMITTED by EMPLOYEE           1
                     Request For Payment APPROVED by ADMINISTRATION      1
                     Request For Payment FINAL_APPROVED by SUPERVISOR    1
Length: 61985, dtype: int64

In [169]:
trace_act_count.unstack()

concept:name,Permit SUBMITTED by EMPLOYEE,Permit APPROVED by ADMINISTRATION,Permit APPROVED by BUDGET OWNER,Permit FINAL_APPROVED by SUPERVISOR,Start trip,End trip,Declaration SUBMITTED by EMPLOYEE,Declaration APPROVED by ADMINISTRATION,Request For Payment SUBMITTED by EMPLOYEE,Request For Payment REJECTED by ADMINISTRATION,...,Permit FOR_APPROVAL by ADMINISTRATION,Permit APPROVED by PRE_APPROVER,Request For Payment REJECTED by PRE_APPROVER,Request For Payment APPROVED by PRE_APPROVER,Declaration REJECTED by PRE_APPROVER,Declaration APPROVED by PRE_APPROVER,Declaration REJECTED by MISSING,Permit REJECTED by PRE_APPROVER,Request For Payment REJECTED by MISSING,Declaration REJECTED by DIRECTOR
case:concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
travel permit 10022,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,,,...,,,,,,,,,,
travel permit 10040,1.0,1.0,,1.0,,,,,2.0,1.0,...,,,,,,,,,,
travel permit 10060,1.0,1.0,1.0,1.0,1.0,1.0,,,,,...,,,,,,,,,,
travel permit 10066,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,,,...,,,,,,,,,,
travel permit 10077,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
travel permit 9931,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,,,...,,,,,,,,,,
travel permit 9942,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,,,...,,,,,,,,,,
travel permit 9953,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,...,,,,,,,,,,
travel permit 9965,3.0,1.0,,,1.0,1.0,,,,,...,,,,,,,,,,


In [170]:
# Pivot to one row per case and one column per activity (absent pairs count as 0).
# The row order is then pinned explicitly to the order in which cases first
# appear in the prefix dataframe, so that row i corresponds to the i-th trace of
# the log (and therefore to label i in `declerations`) regardless of how
# unstack orders the index.
case_order = trace_case_act["case:concept:name"].unique()
trace_bag = np.asarray(trace_act_count.unstack(fill_value=0).reindex(index=case_order))
trace_bag

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [3, 1, 0, ..., 0, 0, 0],
       [2, 2, 0, ..., 0, 0, 0]])

In [171]:
trace_bag.shape

(7065, 50)

In [172]:
project_nth(trace_log, 0)
print(trace_bag[0])

['Permit SUBMITTED by EMPLOYEE', 'Permit APPROVED by ADMINISTRATION', 'Permit APPROVED by BUDGET OWNER', 'Permit FINAL_APPROVED by SUPERVISOR', 'Start trip', 'End trip', 'Declaration SUBMITTED by EMPLOYEE', 'Declaration APPROVED by ADMINISTRATION', 'Declaration SUBMITTED by EMPLOYEE', 'Declaration APPROVED by ADMINISTRATION', 'Declaration APPROVED by BUDGET OWNER', 'Declaration APPROVED by BUDGET OWNER', 'Declaration FINAL_APPROVED by SUPERVISOR', 'Request Payment', 'Declaration FINAL_APPROVED by SUPERVISOR', 'Request Payment', 'Payment Handled', 'Payment Handled']
[1 1 1 1 1 1 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [180]:
# bundle features, labels and feature names for the frequency encoding
train_data = {
    'X': trace_bag.tolist(),
    'y': declerations,
    'feature_names': list(trace_act_count.unstack().columns),
}

In [181]:
# write the frequency-encoded training data as pretty-printed UTF-8 JSON
save_path = '../data/training_data/frequency_encoding.json'
with open(save_path, mode='w', encoding='utf-8') as out_file:
    json.dump(train_data, fp=out_file, ensure_ascii=False, indent=4)

### index payload encoding (n events one hot encoding + with trace attributes)

In [208]:
temp = pm4py.convert_to_dataframe(trace_prefixes)
temp.head(12)

Unnamed: 0,org:role,org:resource,id,concept:name,time:timestamp,@@index,@@case_index,case:OverspentAmount,case:BudgetNumber,case:RequestedBudget,...,case:OrganizationalEntity_1,case:RfpNumber_1,case:Activity_1,case:RequestedAmount_2,case:DeclarationNumber_2,case:dec_id_2,case:DeclarationNumber_3,case:RequestedAmount_3,case:dec_id_3,case:concept:name
0,EMPLOYEE,STAFF MEMBER,st_step 10050_0,Permit SUBMITTED by EMPLOYEE,2018-02-20 12:53:11+00:00,0,0,-646.643989,budget 1136,2531.512736,...,,,,,,,,NaT,,travel permit 10040
1,ADMINISTRATION,STAFF MEMBER,st_step 10049_0,Permit APPROVED by ADMINISTRATION,2018-02-20 12:53:14+00:00,1,0,-646.643989,budget 1136,2531.512736,...,,,,,,,,NaT,,travel permit 10040
2,SUPERVISOR,STAFF MEMBER,st_step 10048_0,Permit FINAL_APPROVED by SUPERVISOR,2018-02-20 15:28:52+00:00,2,0,-646.643989,budget 1136,2531.512736,...,,,,,,,,NaT,,travel permit 10040
3,EMPLOYEE,STAFF MEMBER,st_step 10056_0,Request For Payment SUBMITTED by EMPLOYEE,2018-02-23 15:48:10+00:00,3,0,-646.643989,budget 1136,2531.512736,...,,,,,,,,NaT,,travel permit 10040
4,ADMINISTRATION,STAFF MEMBER,st_step 10054_0,Request For Payment REJECTED by ADMINISTRATION,2018-02-23 15:50:51+00:00,4,0,-646.643989,budget 1136,2531.512736,...,,,,,,,,NaT,,travel permit 10040
5,EMPLOYEE,STAFF MEMBER,st_step 10055_0,Request For Payment REJECTED by EMPLOYEE,2018-02-26 08:24:04+00:00,5,0,-646.643989,budget 1136,2531.512736,...,,,,,,,,NaT,,travel permit 10040
6,EMPLOYEE,STAFF MEMBER,st_step 10053_0,Request For Payment SUBMITTED by EMPLOYEE,2018-02-26 10:30:14+00:00,6,0,-646.643989,budget 1136,2531.512736,...,,,,,,,,NaT,,travel permit 10040
7,ADMINISTRATION,STAFF MEMBER,st_step 10052_0,Request For Payment APPROVED by ADMINISTRATION,2018-02-26 10:30:29+00:00,7,0,-646.643989,budget 1136,2531.512736,...,,,,,,,,NaT,,travel permit 10040
8,SUPERVISOR,STAFF MEMBER,st_step 10051_0,Request For Payment FINAL_APPROVED by SUPERVISOR,2018-02-26 14:26:30+00:00,8,0,-646.643989,budget 1136,2531.512736,...,,,,,,,,NaT,,travel permit 10040
9,UNDEFINED,SYSTEM,rp_request for payment 10043_15,Request Payment,2018-03-01 05:26:43+00:00,9,0,-646.643989,budget 1136,2531.512736,...,,,,,,,,NaT,,travel permit 10040


In [211]:
temp['concept:name'].unique()

array(['Permit SUBMITTED by EMPLOYEE',
       'Permit APPROVED by ADMINISTRATION',
       'Permit FINAL_APPROVED by SUPERVISOR',
       'Request For Payment SUBMITTED by EMPLOYEE',
       'Request For Payment REJECTED by ADMINISTRATION',
       'Request For Payment REJECTED by EMPLOYEE',
       'Request For Payment APPROVED by ADMINISTRATION',
       'Request For Payment FINAL_APPROVED by SUPERVISOR',
       'Request Payment', 'Start trip', 'End trip',
       'Declaration SUBMITTED by EMPLOYEE',
       'Declaration APPROVED by ADMINISTRATION',
       'Declaration FINAL_APPROVED by SUPERVISOR', 'Payment Handled',
       'Permit APPROVED by BUDGET OWNER',
       'Declaration APPROVED by BUDGET OWNER',
       'Declaration REJECTED by ADMINISTRATION', 'Send Reminder',
       'Declaration REJECTED by SUPERVISOR',
       'Request For Payment APPROVED by BUDGET OWNER',
       'Permit APPROVED by PRE_APPROVER',
       'Declaration APPROVED by PRE_APPROVER',
       'Declaration REJECTED by MISS

### complex index based encoding (static feature + nevents encoding + event features )

### lstm encoding (m x n)