## Hyperparameter Analysis for Event Logs

In [1]:
# set up
best_values_dt =  {}
best_values_lr =  {}
best_values_svm =  {}
best_values_nn =  {}
best_values_rf =  {}
best_values_xgb =  {}

### Road Traffic Fine Management Process

#### Data Preparation 

In [2]:
import pm4py
log_all = pm4py.read_xes('Road_Traffic_Fine_Management_Process.xes.gz')

parsing log, completed traces ::   0%|          | 0/150370 [00:00<?, ?it/s]

In [3]:
last_timestamp = max(event["time:timestamp"] for trace in log_all for event in trace) 
print("Last Timestamp:", last_timestamp)
first_timestamp = min(event["time:timestamp"] for trace in log_all for event in trace)
print("First Timestamp:", first_timestamp) 

time_filtered_log = pm4py.filter_time_range(log_all, "2012-01-01 00:00:00", "2013-06-18 00:00:00", mode='traces_contained')

Last Timestamp: 2013-06-18 00:00:00+02:00
First Timestamp: 2000-01-01 00:00:00+01:00


In [4]:
print(f'activities: {list(pm4py.get_event_attribute_values(time_filtered_log,"concept:name").keys())}')
print("Number of different trace variants: ", len(pm4py.get_variants_as_tuples(time_filtered_log)))

activities: ['Create Fine', 'Send Fine', 'Insert Fine Notification', 'Add penalty', 'Send for Credit Collection', 'Payment', 'Insert Date Appeal to Prefecture', 'Send Appeal to Prefecture', 'Receive Result Appeal from Prefecture', 'Appeal to Judge', 'Notify Result Appeal to Offender']
Number of different trace variants:  40


In [5]:
log_start = pm4py.get_start_activities(time_filtered_log)
print(log_start)

end_activities = pm4py.get_end_activities(time_filtered_log)
print(end_activities)

{'Create Fine': 5558}
{'Send Fine': 1351, 'Send for Credit Collection': 169, 'Payment': 3719, 'Receive Result Appeal from Prefecture': 10, 'Send Appeal to Prefecture': 283, 'Notify Result Appeal to Offender': 26}


In [6]:
# We discard less frequent variants in order to reduce
# the number of edge cases.
log = pm4py.filter_variants_top_k(time_filtered_log, 10)
pm4py.write_xes(log, "Road_Traffic_Fine_Management_Process_filtered.xes")
len(log) 

exporting log, completed traces ::   0%|          | 0/5485 [00:00<?, ?it/s]

5485

#### Analysis

In [7]:
import pm4py
log = pm4py.read_xes("Road_Traffic_Fine_Management_Process_filtered.xes")

parsing log, completed traces ::   0%|          | 0/5485 [00:00<?, ?it/s]

In [8]:
from exdpn.petri_net import get_petri_net
net, im, fm = get_petri_net(log, miner_type='IM')

In [9]:
event_attrs = list(pm4py.get_event_attributes(log))
trace_attrs = list(pm4py.get_trace_attributes(log))
event_attrs = [attr for attr in event_attrs if max(list(pm4py.get_event_attribute_values(log, attr).values())) != 1 and "ID" not in attr]
event_attrs.remove("time:timestamp")
event_attrs.remove("org:resource")
numeric_attributes = ["amount", "expense", "totalPaymentAmount", "points"]
trace_attrs = [attr for attr in trace_attrs if max(list(pm4py.get_trace_attribute_values(log, attr).values())) != 1 and "ID" not in attr]

#### Hyperparameter Selection

In [10]:
from sklearn.model_selection import GridSearchCV
import numpy as np 
from exdpn.petri_net import get_petri_net
from exdpn.guard_datasets import extract_all_datasets
from exdpn.data_preprocessing import basic_data_preprocessing
from exdpn.data_preprocessing.data_preprocessing import apply_ohe, apply_scaling, fit_scaling, fit_ohe
from exdpn.guards import ML_Technique

# prepare data
dp_dataset_map = extract_all_datasets(log= log, net=net, initial_marking=im, final_marking=fm, 
                                      event_level_attributes = event_attrs,
                                      case_level_attributes=trace_attrs)

# decision points 
dps = list(dp_dataset_map.keys())
total_size = sum(len(dp_dataset_map[key]) for key in dps)

##### Cross-Validation Decision Tree (with respect to explainablility):

We have observed that decision trees tend to be enormously large without any hyperparameters. We thus try to find an optimal `min_impurity_decrease` value.

In [11]:
from sklearn.tree import DecisionTreeClassifier
parameters = {'min_impurity_decrease':(0, 0.01, 0.05, 0.1, 0.15)}

best_values = {}

for dp in dps:
    X_train, y_train = [], []
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute = True, numeric_attributes = numeric_attributes)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    dt_base = DecisionTreeClassifier()
    dt_grid = GridSearchCV(dt_base, parameters, n_jobs=-1, scoring='f1_weighted')

    dt_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(dt_grid.best_score_,4)}\t {dt_grid.best_params_}")
    best_values[dp] = dt_grid.best_params_

dt_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
print(f"final value: {dt_param}")

best_values_dt["Road Traffic Fine Management Process"] = best_values

p_5:	0.8089	 {'min_impurity_decrease': 0}
p_7:	1.0	 {'min_impurity_decrease': 0}
p_14:	0.9486	 {'min_impurity_decrease': 0}
p_3:	0.5834	 {'min_impurity_decrease': 0.01}
p_4:	0.9972	 {'min_impurity_decrease': 0.01}
p_6:	0.9441	 {'min_impurity_decrease': 0}
p_9:	0.9563	 {'min_impurity_decrease': 0.01}
p_12:	0.9322	 {'min_impurity_decrease': 0.1}
p_15:	1.0	 {'min_impurity_decrease': 0}
final value: {'min_impurity_decrease': 0.020129932243921882}


##### Cross-Validation Neural Network

In [12]:
from sklearn.neural_network import MLPClassifier
parameters = {'hidden_layer_sizes': ((5, ), (5, 5), (10, 10)), 'learning_rate': ('constant', 'invscaling', 'adaptive')}

best_values = {}
max_ds_size = -1
max_ds_key = None

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    if len(dp_dataset) > max_ds_size: max_ds_key = dp
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute = True, numeric_attributes = numeric_attributes)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    nn_base = MLPClassifier()
    nn_grid = GridSearchCV(nn_base, parameters, n_jobs=-1, scoring='f1_weighted')

    nn_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(nn_grid.best_score_,4)}\t {nn_grid.best_params_}")
    best_values[dp] = nn_grid.best_params_

nn_param = {param: best_values[max_ds_key][param] for param in parameters.keys()}
print(f"final value: {nn_param}")

best_values_nn["Road Traffic Fine Management Process"] = best_values



Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


p_5:	0.8077	 {'hidden_layer_sizes': (5, 5), 'learning_rate': 'adaptive'}
p_7:	1.0	 {'hidden_layer_sizes': (10, 10), 'learning_rate': 'invscaling'}


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


p_14:	0.9486	 {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'}
p_3:	0.6274	 {'hidden_layer_sizes': (10, 10), 'learning_rate': 'adaptive'}
p_4:	0.9972	 {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'}
p_6:	0.9483	 {'hidden_layer_sizes': (10, 10), 'learning_rate': 'invscaling'}
p_9:	0.9563	 {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'}
p_12:	0.8943	 {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'}
p_15:	1.0	 {'hidden_layer_sizes': (10, 10), 'learning_rate': 'constant'}
final value: {'hidden_layer_sizes': (10, 10), 'learning_rate': 'constant'}


##### Cross-Validation Logistic Regression:

In [13]:
from sklearn.linear_model import LogisticRegression
parameters = {'C': (0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute = True, numeric_attributes = numeric_attributes)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    lr_base = LogisticRegression()
    lr_grid = GridSearchCV(lr_base, parameters, n_jobs=-1, scoring='f1_weighted')

    lr_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(lr_grid.best_score_,4)}\t {lr_grid.best_params_}")
    best_values[dp] = lr_grid.best_params_

lr_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
print(f"final value: {lr_param}")

best_values_lr["Road Traffic Fine Management Process"] = best_values

p_5:	0.4969	 {'C': 0.1, 'tol': 0.001}
p_7:	1.0	 {'C': 0.25, 'tol': 0.001}
p_14:	0.9486	 {'C': 0.1, 'tol': 0.001}
p_3:	0.5448	 {'C': 0.5, 'tol': 0.001}
p_4:	0.9972	 {'C': 0.1, 'tol': 0.001}
p_6:	0.9448	 {'C': 0.1, 'tol': 0.001}
p_9:	0.9563	 {'C': 0.1, 'tol': 0.001}
p_12:	0.8904	 {'C': 0.1, 'tol': 0.001}
p_15:	1.0	 {'C': 0.1, 'tol': 0.001}
final value: {'C': 0.19298126743722602, 'tol': 0.001}


##### Cross-Validation SVM:

In [14]:
from sklearn.svm import LinearSVC
parameters = {'C':(0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute = True, numeric_attributes = numeric_attributes)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    svm_base = LinearSVC()
    svm_grid = GridSearchCV(svm_base, parameters, n_jobs=-1, scoring='f1_weighted')

    svm_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(svm_grid.best_score_,4)}\t {svm_grid.best_params_}")
    best_values[dp] = svm_grid.best_params_

svm_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
print(f"final value: {svm_param}")

best_values_svm["Road Traffic Fine Management Process"] = best_values

p_5:	0.4969	 {'C': 0.1, 'tol': 0.001}
p_7:	1.0	 {'C': 0.1, 'tol': 0.001}
p_14:	0.9486	 {'C': 0.1, 'tol': 0.001}
p_3:	0.5448	 {'C': 0.1, 'tol': 0.001}
p_4:	0.9972	 {'C': 0.1, 'tol': 0.001}


Liblinear failed to converge, increase the number of iterations.


p_6:	0.9448	 {'C': 0.5, 'tol': 0.001}
p_9:	0.9563	 {'C': 0.1, 'tol': 0.001}
p_12:	0.8904	 {'C': 0.1, 'tol': 0.001}
p_15:	1.0	 {'C': 0.1, 'tol': 0.001}
final value: {'C': 0.18744519728975692, 'tol': 0.001}


##### Cross-Validation XGBoost:

In [15]:
from xgboost import XGBClassifier
parameters = {'max_depth': (1, 2, 3, 4, 6), 'n_estimators': (20,100,150)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute = True, numeric_attributes = numeric_attributes)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    xgb_base = XGBClassifier()
    xgb_grid = GridSearchCV(xgb_base, parameters, n_jobs=-1, scoring='f1_weighted')

    xgb_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(xgb_grid.best_score_,4)}\t {xgb_grid.best_params_}")
    best_values[dp] = xgb_grid.best_params_

xgb_param = {param: round(sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size) for param in parameters.keys()}
print(f"final value: {xgb_param}")

best_values_xgb["Road Traffic Fine Management Process"] = best_values

p_5:	0.8081	 {'max_depth': 2, 'n_estimators': 100}
p_7:	1.0	 {'max_depth': 1, 'n_estimators': 100}
p_14:	0.9486	 {'max_depth': 1, 'n_estimators': 20}
p_3:	0.5568	 {'max_depth': 1, 'n_estimators': 20}
p_4:	0.9972	 {'max_depth': 1, 'n_estimators': 20}
p_6:	0.9474	 {'max_depth': 1, 'n_estimators': 100}
p_9:	0.9563	 {'max_depth': 1, 'n_estimators': 20}
p_12:	0.8904	 {'max_depth': 1, 'n_estimators': 20}
p_15:	1.0	 {'max_depth': 1, 'n_estimators': 20}
final value: {'max_depth': 1, 'n_estimators': 48}


##### Cross-Validation Random Forest:

In [16]:
from sklearn.ensemble import RandomForestClassifier
parameters = {'max_depth': (1, 2, 3, 4, 6), 'min_impurity_decrease': (0, 0.01, 0.05, 0.1, 0.15)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute = True, numeric_attributes = numeric_attributes)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    rf_base = RandomForestClassifier()
    rf_grid = GridSearchCV(rf_base, parameters, n_jobs=-1, scoring='f1_weighted')

    rf_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(rf_grid.best_score_,4)}\t {rf_grid.best_params_}")
    best_values[dp] = rf_grid.best_params_

rf_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
rf_param['max_depth'] = round(rf_param['max_depth'])
print(f"final value: {rf_param}")

best_values_rf["Road Traffic Fine Management Process"] = best_values

p_5:	0.8089	 {'max_depth': 4, 'min_impurity_decrease': 0}
p_7:	1.0	 {'max_depth': 2, 'min_impurity_decrease': 0}
p_14:	0.9486	 {'max_depth': 1, 'min_impurity_decrease': 0}
p_3:	0.5643	 {'max_depth': 2, 'min_impurity_decrease': 0.01}
p_4:	0.9972	 {'max_depth': 1, 'min_impurity_decrease': 0}
p_6:	0.9441	 {'max_depth': 6, 'min_impurity_decrease': 0}
p_9:	0.9563	 {'max_depth': 1, 'min_impurity_decrease': 0}
p_12:	0.9322	 {'max_depth': 1, 'min_impurity_decrease': 0.05}
p_15:	1.0	 {'max_depth': 1, 'min_impurity_decrease': 0}
final value: {'max_depth': 3, 'min_impurity_decrease': 0.009880430450378638}


--------------------------------------------------------------------------------------------------------------------------------------------------------

### BPI Challenge 2012 

#### Data Preparation

In [68]:
import pm4py
log_all = pm4py.read_xes('financial_log.xes.gz')

parsing log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

In [69]:
print(f'activities: {sorted(list(pm4py.get_event_attribute_values(log_all,"concept:name").keys()))}')

activities: ['A_ACCEPTED', 'A_ACTIVATED', 'A_APPROVED', 'A_CANCELLED', 'A_DECLINED', 'A_FINALIZED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'A_REGISTERED', 'A_SUBMITTED', 'O_ACCEPTED', 'O_CANCELLED', 'O_CREATED', 'O_DECLINED', 'O_SELECTED', 'O_SENT', 'O_SENT_BACK', 'W_Afhandelen leads', 'W_Beoordelen fraude', 'W_Completeren aanvraag', 'W_Nabellen incomplete dossiers', 'W_Nabellen offertes', 'W_Valideren aanvraag', 'W_Wijzigen contractgegevens']


In [70]:
# We only look at subtraces of activities starting with an 'A'.
log = pm4py.filter_event_attribute_values(
    log_all,
    "concept:name",
    ['A_ACCEPTED', 'A_ACTIVATED', 'A_APPROVED', 'A_CANCELLED', 'A_DECLINED', 'A_FINALIZED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'A_REGISTERED', 'A_SUBMITTED'],
    level="event",
    retain=True)

In [71]:
pm4py.write_xes(log, "BPI Challenge 2012 only A.xes")

exporting log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

#### Analysis

In [72]:
import pm4py
log = pm4py.read_xes('BPI Challenge 2012 only A.xes')

parsing log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

In [73]:
from exdpn.petri_net import get_petri_net
net, im, fm = get_petri_net(log, miner_type='IM')

In [74]:
event_attrs = list(pm4py.get_event_attributes(log))
trace_attrs = list(pm4py.get_trace_attributes(log))
trace_attrs.remove("REG_DATE")
event_attrs = [attr for attr in event_attrs if max(list(pm4py.get_event_attribute_values(log, attr).values())) != 1 and "ID" not in attr]
event_attrs.remove("time:timestamp")
event_attrs.remove("concept:name")

#### Hyperparameter Selection

In [75]:
from sklearn.model_selection import GridSearchCV
import numpy as np 
from exdpn.petri_net import get_petri_net
from exdpn.guard_datasets import extract_all_datasets
from exdpn.data_preprocessing import basic_data_preprocessing
from exdpn.data_preprocessing.data_preprocessing import apply_ohe, apply_scaling, fit_scaling, fit_ohe

# prepare data
dp_dataset_map = extract_all_datasets(log= log, net=net, initial_marking=im, final_marking=fm, 
                                      event_level_attributes = event_attrs,
                                      case_level_attributes=trace_attrs)

# decision points 
dps = list(dp_dataset_map.keys())
total_size = sum(len(dp_dataset_map[key]) for key in dps)

##### Cross-Validation Decision Tree (with respect to explainablility):

We have observed that decision trees tend to be enormously large without any hyperparameters. We thus try to find an optimal `min_impurity_decrease` value.

In [76]:
from sklearn.tree import DecisionTreeClassifier
parameters = {'min_impurity_decrease': (0, 0.01, 0.05, 0.1, 0.15)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, numeric_attributes=["AMOUNT_REQ"])
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    dt_base = DecisionTreeClassifier()
    dt_grid = GridSearchCV(dt_base, parameters, n_jobs=-1, scoring='f1_weighted')

    dt_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(dt_grid.best_score_,4)}\t {dt_grid.best_params_}")
    best_values[dp] = dt_grid.best_params_

dt_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
print(f"final value: {dt_param}")

best_values_dt["BPI Challenge 2012"] = best_values

p_5:	0.8291	 {'min_impurity_decrease': 0.01}
p_7:	0.6865	 {'min_impurity_decrease': 0}
p_4:	0.6175	 {'min_impurity_decrease': 0.01}
p_6:	0.9925	 {'min_impurity_decrease': 0.01}
final value: {'min_impurity_decrease': 0.007500000000000001}


##### Cross-Validation SVM:

In [77]:
from sklearn.svm import LinearSVC
parameters = {'C':(0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, numeric_attributes=["AMOUNT_REQ"])
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    svm_base = LinearSVC()
    svm_grid = GridSearchCV(svm_base, parameters, n_jobs=-1, scoring='f1_weighted')

    svm_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(svm_grid.best_score_,4)}\t {svm_grid.best_params_}")

    best_values[dp] = svm_grid.best_params_

svm_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
print(f"final value: {svm_param}")

best_values_svm["BPI Challenge 2012"] = best_values

p_5:	0.8266	 {'C': 0.5, 'tol': 0.001}
p_7:	0.7126	 {'C': 0.5, 'tol': 0.001}
p_4:	0.6177	 {'C': 0.1, 'tol': 0.001}
p_6:	0.9925	 {'C': 0.1, 'tol': 0.001}
final value: {'C': 0.30000000000000004, 'tol': 0.001}


##### Cross-Validation Neural Network:

In [78]:
from sklearn.neural_network import MLPClassifier
parameters = {'hidden_layer_sizes': ((5, ), (5, 5), (10, 10)), 'learning_rate': ('constant', 'invscaling', 'adaptive')}

best_values = {}
max_ds_size = -1
max_ds_key = None

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    if len(dp_dataset) > max_ds_size: max_ds_key = dp
    X_train, y_train = basic_data_preprocessing(dp_dataset, numeric_attributes=["AMOUNT_REQ"])
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    nn_base = MLPClassifier()
    nn_grid = GridSearchCV(nn_base, parameters, n_jobs=-1, scoring='f1_weighted')

    nn_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(nn_grid.best_score_,4)}\t {nn_grid.best_params_}")
    best_values[dp] = nn_grid.best_params_

nn_param = {param: best_values[max_ds_key][param] for param in parameters.keys()}
print(f"final value: {nn_param}")

best_values_nn["BPI Challenge 2012"] = best_values

p_5:	0.8301	 {'hidden_layer_sizes': (5, 5), 'learning_rate': 'invscaling'}
p_7:	0.7109	 {'hidden_layer_sizes': (10, 10), 'learning_rate': 'invscaling'}
p_4:	0.6211	 {'hidden_layer_sizes': (5,), 'learning_rate': 'invscaling'}
p_6:	0.9925	 {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'}
final value: {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'}


##### Cross-Validation Logistic Regression:

In [80]:
from sklearn.linear_model import LogisticRegression
parameters = {'C': (0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, numeric_attributes=["AMOUNT_REQ"])
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    lr_base = LogisticRegression()
    lr_grid = GridSearchCV(lr_base, parameters, n_jobs=-1, scoring='f1_weighted')

    lr_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(lr_grid.best_score_,4)}\t {lr_grid.best_params_}")
    best_values[dp] = lr_grid.best_params_

lr_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
print(f"final value: {lr_param}")

best_values_lr["BPI Challenge 2012"] = best_values

p_5:	0.8268	 {'C': 0.1, 'tol': 0.001}


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


p_7:	0.711	 {'C': 0.25, 'tol': 0.001}
p_4:	0.6207	 {'C': 0.1, 'tol': 0.001}
p_6:	0.9925	 {'C': 0.1, 'tol': 0.001}
final value: {'C': 0.13749999999999998, 'tol': 0.001}


##### Cross-Validation XGBoost:

In [81]:
from xgboost import XGBClassifier
parameters = {'max_depth': (1, 2, 3, 4, 6), 'n_estimators': (20,100,150)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, numeric_attributes=["AMOUNT_REQ"])
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    xgb_base = XGBClassifier()
    xgb_grid = GridSearchCV(xgb_base, parameters, n_jobs=-1, scoring='f1_weighted')

    xgb_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(xgb_grid.best_score_,4)}\t {xgb_grid.best_params_}")
    best_values[dp] = xgb_grid.best_params_

xgb_param = {param: round(sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size) for param in parameters.keys()}
print(f"final value: {xgb_param}")

best_values_xgb["BPI Challenge 2012"] = best_values

p_5:	0.8378	 {'max_depth': 2, 'n_estimators': 20}
p_7:	0.7105	 {'max_depth': 2, 'n_estimators': 20}
p_4:	0.6182	 {'max_depth': 4, 'n_estimators': 100}
p_6:	0.9925	 {'max_depth': 1, 'n_estimators': 20}
final value: {'max_depth': 2, 'n_estimators': 40}


##### Cross-Validation Random Forest:

In [82]:
from sklearn.ensemble import RandomForestClassifier
parameters = {'max_depth': (1, 2, 3, 4, 6), 'min_impurity_decrease': (0, 0.01, 0.05, 0.1, 0.15)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, numeric_attributes=["AMOUNT_REQ"])
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    rf_base = RandomForestClassifier()
    rf_grid = GridSearchCV(rf_base, parameters, n_jobs=-1, scoring='f1_weighted')

    rf_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(rf_grid.best_score_,4)}\t {rf_grid.best_params_}")
    best_values[dp] = rf_grid.best_params_

rf_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
rf_param['max_depth'] = round(rf_param['max_depth'])
print(f"final value: {rf_param}")

best_values_rf["BPI Challenge 2012"] = best_values

p_5:	0.834	 {'max_depth': 6, 'min_impurity_decrease': 0}
p_7:	0.7089	 {'max_depth': 6, 'min_impurity_decrease': 0}
p_4:	0.6179	 {'max_depth': 6, 'min_impurity_decrease': 0}
p_6:	0.9925	 {'max_depth': 1, 'min_impurity_decrease': 0}
final value: {'max_depth': 5, 'min_impurity_decrease': 0.0}


----------------------------------------------------------------------------------

### BPI Challenge 2019

#### Data Preparation

In [26]:
import pm4py
log_all = pm4py.read_xes('BPI_Challenge_2019.xes')

parsing log, completed traces ::   0%|          | 0/251734 [00:00<?, ?it/s]

In [27]:
last_timestamp = max(event["time:timestamp"] for trace in log_all for event in trace)
print("Last Timestamp:", last_timestamp)
first_timestamp = min(event["time:timestamp"] for trace in log_all for event in trace)
print("First Timestamp:", first_timestamp)

time_filtered_log = pm4py.filter_time_range(log_all, "2018-09-01 00:00:00", "2018-12-01 00:00:00", mode='traces_contained')

Last Timestamp: 2020-04-09 21:59:00+00:00
First Timestamp: 1948-01-26 22:59:00+00:00


In [28]:
print(f'activities: {list(pm4py.get_event_attribute_values(time_filtered_log,"concept:name").keys())}')
print("Number of different trace variants: ", len(pm4py.get_variants_as_tuples(time_filtered_log)))

activities: ['Create Purchase Order Item', 'Delete Purchase Order Item', 'Vendor creates invoice', 'Record Goods Receipt', 'Record Invoice Receipt', 'Clear Invoice', 'Remove Payment Block', 'Cancel Goods Receipt', 'Change Quantity', 'Vendor creates debit memo', 'Cancel Invoice Receipt', 'Change Price', 'Receive Order Confirmation', 'Change Storage Location', 'Change Delivery Indicator', 'Block Purchase Order Item', 'Create Purchase Requisition Item', 'Reactivate Purchase Order Item', 'Record Service Entry Sheet', 'SRM: Created', 'SRM: Complete', 'SRM: Awaiting Approval', 'SRM: Document Completed', 'SRM: Ordered', 'SRM: In Transfer to Execution Syst.', 'SRM: Change was Transmitted', 'SRM: Deleted', 'SRM: Transaction Completed', 'Cancel Subsequent Invoice', 'Change Approval for Purchase Order', 'Release Purchase Order', 'Update Order Confirmation', 'Record Subsequent Invoice', 'Change payment term', 'Change Final Invoice Indicator', 'Set Payment Block']
Number of different trace variants

In [29]:
from exdpn.petri_net import get_petri_net
log = pm4py.filter_event_attribute_values(
    time_filtered_log,
    "concept:name",
    ['Create Purchase Order Item', 'Vendor creates invoice', 'Record Goods Receipt', 'Record Invoice Receipt', 
    'Clear Invoice', 'Record Service Entry Sheet', 'Cancel Goods Receipt', 
    'Vendor creates debit memo', 'Cancel Invoice Receipt', 'Change Delivery Indicator', 'Remove Payment Block', 
    'Change Price', 'Delete Purchase Order Item', 'Change Quantity', 
    'Change Final Invoice Indicator', 'Receive Order Confirmation', 'Cancel Subsequent Invoice', 
    'Reactivate Purchase Order Item', 'Update Order Confirmation', 'Block Purchase Order Item', 
    'Change Approval for Purchase Order', 'Release Purchase Order', 'Record Subsequent Invoice', 'Set Payment Block', 
    'Create Purchase Requisition Item', 'Change Storage Location', 'Change Currency', 'Change payment term', 
    'Change Rejection Indicator', 'Release Purchase Requisition'],
    level="event",
    retain=True)

print("Number of different trace variants - subtraces: ", len(pm4py.get_variants_as_tuples(log)))



Number of different trace variants - subtraces:  1535


In [30]:
log_start = pm4py.get_start_activities(log)
print(log_start)
filtered_log = pm4py.filter_start_activities(log, 'Create Purchase Order Item')
print("Number of different trace variants - subtraces: ", len(pm4py.get_variants_as_tuples(filtered_log)))

end_activities = pm4py.get_end_activities(log)
print(end_activities)
filtered_log = pm4py.filter_end_activities(log, ["Clear Invoice"])

print("Number of different trace variants - subtraces: ", len(pm4py.get_variants_as_tuples(filtered_log)))

{'Create Purchase Order Item': 11176, 'Vendor creates invoice': 347, 'Vendor creates debit memo': 3, 'Create Purchase Requisition Item': 14665, 'Release Purchase Order': 1, 'Change Approval for Purchase Order': 13}
Number of different trace variants - subtraces:  959
{'Delete Purchase Order Item': 1850, 'Clear Invoice': 10521, 'Create Purchase Order Item': 1521, 'Cancel Goods Receipt': 175, 'Record Goods Receipt': 4576, 'Record Invoice Receipt': 5316, 'Receive Order Confirmation': 123, 'Change Delivery Indicator': 104, 'Block Purchase Order Item': 17, 'Record Service Entry Sheet': 401, 'Change Approval for Purchase Order': 211, 'Change Quantity': 107, 'Change Storage Location': 15, 'Change Price': 48, 'Remove Payment Block': 1187, 'Release Purchase Order': 1, 'Change payment term': 1, 'Cancel Subsequent Invoice': 4, 'Cancel Invoice Receipt': 1, 'Record Subsequent Invoice': 1, 'Vendor creates invoice': 23, 'Update Order Confirmation': 1, 'Set Payment Block': 1}
Number of different trace

In [31]:
# We discard less frequent variants in order to reduce
# the number of edge cases.
log = pm4py.filter_variants_top_k(filtered_log, 10)

pm4py.write_xes(log, "BPI_Challenge_2019_filtered_top_k.xes")
len(log)

exporting log, completed traces ::   0%|          | 0/8345 [00:00<?, ?it/s]

8345

#### Analysis

In [32]:
import pm4py
log = pm4py.read_xes("BPI_Challenge_2019_filtered_top_k.xes")

parsing log, completed traces ::   0%|          | 0/8345 [00:00<?, ?it/s]

In [33]:
from exdpn.petri_net import get_petri_net
net, im, fm = get_petri_net(log, miner_type='IM')

In [34]:
event_attrs = list(pm4py.get_event_attributes(log))
trace_attrs = list(pm4py.get_trace_attributes(log))
event_attrs = [attr for attr in event_attrs if max(list(pm4py.get_event_attribute_values(log, attr).values())) != 1 and "ID" not in attr]
event_attrs.remove("time:timestamp")
event_attrs.remove("org:resource")
event_attrs.remove("User")
trace_attrs = [attr for attr in trace_attrs if max(list(pm4py.get_trace_attribute_values(log, attr).values())) != 1 and "ID" not in attr]
trace_attrs.remove("Name")
trace_attrs.remove("Item")
trace_attrs.remove("Purchasing Document")

#### Hyperparameter Selection

In [35]:
from sklearn.model_selection import GridSearchCV
import numpy as np 
from exdpn.petri_net import get_petri_net
from exdpn.guard_datasets import extract_all_datasets
from exdpn.data_preprocessing import basic_data_preprocessing
from exdpn.data_preprocessing.data_preprocessing import apply_ohe, apply_scaling, fit_scaling, fit_ohe
from exdpn.guards import ML_Technique

# prepare data
dp_dataset_map = extract_all_datasets(log= log, net=net, initial_marking=im, final_marking=fm, 
                                      event_level_attributes = event_attrs,
                                      case_level_attributes=trace_attrs)

# decision points 
dps = list(dp_dataset_map.keys())
total_size = sum(len(dp_dataset_map[key]) for key in dps)

##### Cross-Validation Decision Tree (with respect to explainablility):

We have observed that decision trees tend to be enormously large without any hyperparameters. We thus try to find an optimal `min_impurity_decrease` value.

In [36]:
from sklearn.tree import DecisionTreeClassifier
parameters = {'min_impurity_decrease':(0, 0.01, 0.05, 0.1, 0.15)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    dt_base = DecisionTreeClassifier()
    dt_grid = GridSearchCV(dt_base, parameters, n_jobs=-1, scoring='f1_weighted')

    dt_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(dt_grid.best_score_,4)}\t {dt_grid.best_params_}")
    best_values[dp] = dt_grid.best_params_

dt_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
print(f"final value: {dt_param}")

best_values_dt["BPI Challenge 2019"] = best_values

source:	0.8621	 {'min_impurity_decrease': 0}
p_4:	0.8753	 {'min_impurity_decrease': 0.01}
p_9:	0.9952	 {'min_impurity_decrease': 0.01}
final value: {'min_impurity_decrease': 0.006666666666666667}


##### Cross-Validaion Neural Network:

In [37]:
from sklearn.neural_network import MLPClassifier
parameters = {'hidden_layer_sizes': ((5, ), (5, 5), (10, 10)), 'learning_rate': ('constant', 'invscaling', 'adaptive')}

best_values = {}
max_ds_size = -1
max_ds_key = None

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    if len(dp_dataset) > max_ds_size: max_ds_key = dp
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    nn_base = MLPClassifier()
    nn_grid = GridSearchCV(nn_base, parameters, n_jobs=-1, scoring='f1_weighted')

    nn_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(nn_grid.best_score_,4)}\t {nn_grid.best_params_}")
    best_values[dp] = nn_grid.best_params_

nn_param = {param: best_values[max_ds_key][param] for param in parameters.keys()}
print(f"final value: {nn_param}")

best_values_nn["BPI Challenge 2019"] = best_values

source:	0.8669	 {'hidden_layer_sizes': (10, 10), 'learning_rate': 'constant'}
p_4:	0.8389	 {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'}
p_9:	0.9988	 {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'}
final value: {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'}


##### Cross-Validation Logistic Regression:

In [38]:
from sklearn.linear_model import LogisticRegression
parameters = {'C': (0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    lr_base = LogisticRegression()
    lr_grid = GridSearchCV(lr_base, parameters, n_jobs=-1, scoring='f1_weighted')

    lr_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(lr_grid.best_score_,4)}\t {lr_grid.best_params_}")
    best_values[dp] = lr_grid.best_params_

lr_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
print(f"final value: {lr_param}")

best_values_lr["BPI Challenge 2019"] = best_values

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


source:	0.8208	 {'C': 0.5, 'tol': 0.001}
p_4:	0.8525	 {'C': 0.1, 'tol': 0.001}
p_9:	0.998	 {'C': 0.1, 'tol': 0.001}
final value: {'C': 0.23333333333333334, 'tol': 0.0010000000000000002}


##### Cross-Validation SVM:

In [39]:
from sklearn.svm import LinearSVC
parameters = {'C':(0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    svm_base = LinearSVC()
    svm_grid = GridSearchCV(svm_base, parameters, n_jobs=-1, scoring='f1_weighted')

    svm_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(svm_grid.best_score_,4)}\t {svm_grid.best_params_}")
    best_values[dp] = svm_grid.best_params_

svm_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
print(f"final value: {svm_param}")

best_values_svm["BPI Challenge 2019"] = best_values

source:	0.8662	 {'C': 0.5, 'tol': 0.001}
p_4:	0.8364	 {'C': 0.1, 'tol': 0.001}
p_9:	0.9983	 {'C': 0.25, 'tol': 0.001}
final value: {'C': 0.2833333333333333, 'tol': 0.0010000000000000002}


##### Cross-Validation XGBoost:

In [40]:
import re
from xgboost import XGBClassifier

parameters = {'max_depth': (1, 2, 3, 4, 6), 'n_estimators': (20,100,150)}

# data sets contain special characters which xgb boost cannot handel
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

best_values = {}
for dp in dps: 
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # remove special characters
    # solution from: https://stackoverflow.com/questions/48645846/pythons-xgoost-valueerrorfeature-names-may-not-contain-or
    X_train.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_train.columns.values]
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    xgb_base = XGBClassifier()
    xgb_grid = GridSearchCV(xgb_base, parameters, n_jobs=-1, scoring='f1_weighted')

    xgb_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(xgb_grid.best_score_,4)}\t {xgb_grid.best_params_}")
    best_values[dp] = xgb_grid.best_params_

xgb_param = {param: round(sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size) for param in parameters.keys()}
print(f"final value: {xgb_param}")

best_values_xgb["BPI Challenge 2019"] = best_values

source:	0.8197	 {'max_depth': 6, 'n_estimators': 150}
p_4:	0.8614	 {'max_depth': 1, 'n_estimators': 20}
p_9:	0.996	 {'max_depth': 2, 'n_estimators': 100}
final value: {'max_depth': 3, 'n_estimators': 90}


##### Cross-Validation Random Forest:

In [41]:
from sklearn.ensemble import RandomForestClassifier
parameters = {'max_depth': (1, 2, 3, 4, 6), 'min_impurity_decrease': (0, 0.01, 0.05, 0.1, 0.15)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    rf_base = RandomForestClassifier()
    rf_grid = GridSearchCV(rf_base, parameters, n_jobs=-1, scoring='f1_weighted')

    rf_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(rf_grid.best_score_,4)}\t {rf_grid.best_params_}")
    best_values[dp] = rf_grid.best_params_

rf_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
rf_param['max_depth'] = round(rf_param['max_depth'])
print(f"final value: {rf_param}")

best_values_rf["BPI Challenge 2019"] = best_values

source:	0.6672	 {'max_depth': 6, 'min_impurity_decrease': 0}
p_4:	0.836	 {'max_depth': 3, 'min_impurity_decrease': 0}
p_9:	0.9962	 {'max_depth': 6, 'min_impurity_decrease': 0}
final value: {'max_depth': 5, 'min_impurity_decrease': 0.0}


------------------------------------------------------------------------------

### BPI Challenge 2017

#### Data Preparation

In [83]:
import pm4py
log_all = pm4py.read_xes('BPI Challenge 2017.xes.gz')

parsing log, completed traces ::   0%|          | 0/31509 [00:00<?, ?it/s]

In [84]:
print(f'activities: {list(pm4py.get_event_attribute_values(log_all,"concept:name").keys())}')

activities: ['A_Create Application', 'A_Submitted', 'W_Handle leads', 'W_Complete application', 'A_Concept', 'A_Accepted', 'O_Create Offer', 'O_Created', 'O_Sent (mail and online)', 'W_Call after offers', 'A_Complete', 'W_Validate application', 'A_Validating', 'O_Returned', 'W_Call incomplete files', 'A_Incomplete', 'O_Accepted', 'A_Pending', 'A_Denied', 'O_Refused', 'O_Cancelled', 'A_Cancelled', 'O_Sent (online only)', 'W_Assess potential fraud', 'W_Personal Loan collection', 'W_Shortened completion ']


In [85]:
# We only look at subtraces of activities starting with an 'O'.
# Semantically, this means we look at the events corresponding to
# the offer of a trace.
log = pm4py.filter_event_attribute_values(
    log_all,
    "concept:name",
    ['O_Create Offer', 'O_Created', 'O_Sent (mail and online)', 'O_Returned',
     'O_Accepted', 'O_Cancelled', 'O_Refused', 'O_Sent (online only)'],
    level="event",
    retain=True)

print("Number of different trace variants - subtraces: ", len(pm4py.get_variants_as_tuples(log)))

filtered_log = pm4py.filter_variants(log, [
    ('O_Create Offer', 'O_Created',
     'O_Sent (mail and online)', 'O_Returned', 'O_Accepted'),
    ('O_Create Offer', 'O_Created', 'O_Sent (mail and online)', 'O_Refused'),
    ('O_Create Offer', 'O_Created',
     'O_Sent (mail and online)', 'O_Returned', 'O_Cancelled'),
    ('O_Create Offer', 'O_Created', 'O_Sent (mail and online)', 'O_Cancelled'),
    ('O_Create Offer', 'O_Created',
     'O_Sent (mail and online)', 'O_Returned', 'O_Refused'),
    ('O_Create Offer', 'O_Created', 'O_Sent (online only)', 'O_Returned', 'O_Accepted'),
    ('O_Create Offer', 'O_Created', 'O_Sent (online only)', 'O_Refused'),
    ('O_Create Offer', 'O_Created',
     'O_Sent (online only)', 'O_Returned', 'O_Cancelled'),
    ('O_Create Offer', 'O_Created', 'O_Sent (online only)', 'O_Cancelled'),
    ('O_Create Offer', 'O_Created', 'O_Sent (online only)', 'O_Returned', 'O_Refused')])

print("Number of different trace variants - filtered subtraces: ", len(pm4py.get_variants_as_tuples(filtered_log)))

pm4py.write_xes(filtered_log, "BPIChallenge2017_filtered.xes")

Number of different trace variants - subtraces:  877
Number of different trace variants - filtered subtraces:  10


exporting log, completed traces ::   0%|          | 0/22771 [00:00<?, ?it/s]

#### Analysis

In [86]:
import pm4py
log = pm4py.read_xes('BPIChallenge2017_filtered.xes')

parsing log, completed traces ::   0%|          | 0/22771 [00:00<?, ?it/s]

In [87]:
from exdpn.petri_net import get_petri_net
net, im, fm = get_petri_net(log, miner_type='IM')

In [88]:
trace_attrs = list(pm4py.get_trace_attributes(log))
event_attrs = list(pm4py.get_event_attributes(log))
event_attrs = [attr for attr in event_attrs if max(list(pm4py.get_event_attribute_values(log, attr).values())) != 1 and "ID" not in attr]
event_attrs.remove("time:timestamp")
event_attrs.remove("org:resource") 

#### Hyperparameter Selection

In [89]:
from sklearn.model_selection import GridSearchCV
import numpy as np 
from exdpn.petri_net import get_petri_net
from exdpn.guard_datasets import extract_all_datasets
from exdpn.data_preprocessing import basic_data_preprocessing
from exdpn.data_preprocessing.data_preprocessing import apply_ohe, apply_scaling, fit_scaling, fit_ohe

# prepare data
dp_dataset_map = extract_all_datasets(log= log, net=net, initial_marking=im, final_marking=fm, 
                                      event_level_attributes = event_attrs,
                                      case_level_attributes=trace_attrs)

# decision points 
dps = list(dp_dataset_map.keys())
total_size = sum(len(dp_dataset_map[key]) for key in dps)

##### Cross-Validation Decision Tree:

We have observed that decision trees tend to be enormously large without any hyperparameters. We thus try to find an optimal `min_impurity_decrease` value.

In [90]:
from sklearn.tree import DecisionTreeClassifier
parameters = {'min_impurity_decrease':(0, 0.01, 0.05, 0.1, 0.15)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    dt_base = DecisionTreeClassifier()
    dt_grid = GridSearchCV(dt_base, parameters, n_jobs=-1, scoring='f1_weighted')

    dt_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(dt_grid.best_score_,4)}\t {dt_grid.best_params_}")
    best_values[dp] = dt_grid.best_params_

dt_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
print(f"final value: {dt_param}")

best_values_dt["BPI Challenge 2017"] = best_values

p_5:	0.5703	 {'min_impurity_decrease': 0}
p_4:	0.9819	 {'min_impurity_decrease': 0.01}
p_6:	0.7946	 {'min_impurity_decrease': 0.01}
final value: {'min_impurity_decrease': 0.006666666666666667}


##### Cross-Validation SVM:

In [91]:
from sklearn.svm import LinearSVC
parameters = {'C':(0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    svm_base = LinearSVC()
    svm_grid = GridSearchCV(svm_base, parameters, n_jobs=-1, scoring='f1_weighted')

    svm_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(svm_grid.best_score_,4)}\t {svm_grid.best_params_}")
    best_values[dp] = svm_grid.best_params_

svm_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
print(f"final value: {svm_param}")

best_values_svm["BPI Challenge 2017"] = best_values

Liblinear failed to converge, increase the number of iterations.


p_5:	0.5502	 {'C': 0.25, 'tol': 0.001}
p_4:	0.9819	 {'C': 0.1, 'tol': 0.001}
p_6:	0.7946	 {'C': 0.1, 'tol': 0.001}
final value: {'C': 0.15000000000000002, 'tol': 0.001}


##### Cross-Validation Neural Network:

In [93]:
from sklearn.neural_network import MLPClassifier
parameters = {'hidden_layer_sizes': ((5, ), (5, 5), (10, 10)), 'learning_rate': ('constant', 'invscaling', 'adaptive')}

best_values = {}
max_ds_size = -1
max_ds_key = None

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    if len(dp_dataset) > max_ds_size: max_ds_key = dp
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    nn_base = MLPClassifier()
    nn_grid = GridSearchCV(nn_base, parameters, n_jobs=-1, scoring='f1_weighted')

    nn_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(nn_grid.best_score_,4)}\t {nn_grid.best_params_}")
    best_values[dp] = nn_grid.best_params_

nn_param = {param: best_values[max_ds_key][param] for param in parameters.keys()}
print(f"final value: {nn_param}")

best_values_nn["BPI Challenge 2017"] = best_values

p_5:	0.5532	 {'hidden_layer_sizes': (5, 5), 'learning_rate': 'invscaling'}
p_4:	0.9819	 {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'}
p_6:	0.7946	 {'hidden_layer_sizes': (5,), 'learning_rate': 'adaptive'}
final value: {'hidden_layer_sizes': (5,), 'learning_rate': 'adaptive'}


##### Cross-Validation Logistic Regression:

In [94]:
from sklearn.linear_model import LogisticRegression
parameters = {'C': (0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    lr_base = LogisticRegression()
    lr_grid = GridSearchCV(lr_base, parameters, n_jobs=-1, scoring='f1_weighted')

    lr_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(lr_grid.best_score_,4)}\t {lr_grid.best_params_}")
    best_values[dp] = lr_grid.best_params_

lr_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
print(f"final value: {lr_param}")

best_values_lr["BPI Challenge 2017"] = best_values

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


p_5:	0.5501	 {'C': 0.1, 'tol': 0.001}
p_4:	0.9819	 {'C': 0.1, 'tol': 0.001}
p_6:	0.7946	 {'C': 0.1, 'tol': 0.001}
final value: {'C': 0.09999999999999999, 'tol': 0.001}


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


##### Cross-Validation XGBoost:

In [95]:
from xgboost import XGBClassifier
parameters = {'max_depth': (1, 2, 3, 4, 6), 'n_estimators': (20,100,150)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    xgb_base = XGBClassifier()
    xgb_grid = GridSearchCV(xgb_base, parameters, n_jobs=-1, scoring='f1_weighted')

    xgb_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(xgb_grid.best_score_,4)}\t {xgb_grid.best_params_}")
    best_values[dp] = xgb_grid.best_params_

xgb_param = {param: round(sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size) for param in parameters.keys()}
print(f"final value: {xgb_param}")

best_values_xgb["BPI Challenge 2017"] = best_values

p_5:	0.5623	 {'max_depth': 6, 'n_estimators': 150}
p_4:	0.9819	 {'max_depth': 1, 'n_estimators': 20}
p_6:	0.7946	 {'max_depth': 6, 'n_estimators': 150}
final value: {'max_depth': 4, 'n_estimators': 107}


##### Cross-Validation Random Forest:

In [96]:
from sklearn.ensemble import RandomForestClassifier
parameters = {'max_depth': (1, 2, 3, 4, 6), 'min_impurity_decrease': (0, 0.01, 0.05, 0.1, 0.15)}

best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    rf_base = RandomForestClassifier()
    rf_grid = GridSearchCV(rf_base, parameters, n_jobs=-1, scoring='f1_weighted')

    rf_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(rf_grid.best_score_,4)}\t {rf_grid.best_params_}")
    best_values[dp] = rf_grid.best_params_

rf_param = {param: sum(val[param]*len(dp_dataset_map[key]) for key,val in best_values.items())/total_size for param in parameters.keys()}
rf_param['max_depth'] = round(rf_param['max_depth'])
print(f"final value: {rf_param}")

best_values_rf["BPI Challenge 2017"] = best_values

p_5:	0.5501	 {'max_depth': 1, 'min_impurity_decrease': 0}
p_4:	0.9819	 {'max_depth': 1, 'min_impurity_decrease': 0}
p_6:	0.7946	 {'max_depth': 1, 'min_impurity_decrease': 0}
final value: {'max_depth': 1, 'min_impurity_decrease': 0.0}


----------------------------------------------------------------------------------

### Summary

#### Decision Tree:

In [97]:
best_values_dt

{'Road Traffic Fine Management Process': {p_5: {'min_impurity_decrease': 0},
  p_7: {'min_impurity_decrease': 0},
  p_14: {'min_impurity_decrease': 0},
  p_3: {'min_impurity_decrease': 0.01},
  p_4: {'min_impurity_decrease': 0.01},
  p_6: {'min_impurity_decrease': 0},
  p_9: {'min_impurity_decrease': 0.01},
  p_12: {'min_impurity_decrease': 0.1},
  p_15: {'min_impurity_decrease': 0}},
 'BPI Challenge 2019': {source: {'min_impurity_decrease': 0},
  p_4: {'min_impurity_decrease': 0.01},
  p_9: {'min_impurity_decrease': 0.01}},
 'BPI Challenge 2012': {p_5: {'min_impurity_decrease': 0.01},
  p_7: {'min_impurity_decrease': 0},
  p_4: {'min_impurity_decrease': 0.01},
  p_6: {'min_impurity_decrease': 0.01}},
 'BPI Challenge 2017': {p_5: {'min_impurity_decrease': 0},
  p_4: {'min_impurity_decrease': 0.01},
  p_6: {'min_impurity_decrease': 0.01}}}

#### Logistic Regression:

In [98]:
best_values_lr

{'Road Traffic Fine Management Process': {p_5: {'C': 0.1, 'tol': 0.001},
  p_7: {'C': 0.25, 'tol': 0.001},
  p_14: {'C': 0.1, 'tol': 0.001},
  p_3: {'C': 0.5, 'tol': 0.001},
  p_4: {'C': 0.1, 'tol': 0.001},
  p_6: {'C': 0.1, 'tol': 0.001},
  p_9: {'C': 0.1, 'tol': 0.001},
  p_12: {'C': 0.1, 'tol': 0.001},
  p_15: {'C': 0.1, 'tol': 0.001}},
 'BPI Challenge 2019': {source: {'C': 0.5, 'tol': 0.001},
  p_4: {'C': 0.1, 'tol': 0.001},
  p_9: {'C': 0.1, 'tol': 0.001}},
 'BPI Challenge 2012': {p_5: {'C': 0.1, 'tol': 0.001},
  p_7: {'C': 0.25, 'tol': 0.001},
  p_4: {'C': 0.1, 'tol': 0.001},
  p_6: {'C': 0.1, 'tol': 0.001}},
 'BPI Challenge 2017': {p_5: {'C': 0.1, 'tol': 0.001},
  p_4: {'C': 0.1, 'tol': 0.001},
  p_6: {'C': 0.1, 'tol': 0.001}}}

#### SVM:

In [99]:
best_values_svm

{'Road Traffic Fine Management Process': {p_5: {'C': 0.1, 'tol': 0.001},
  p_7: {'C': 0.1, 'tol': 0.001},
  p_14: {'C': 0.1, 'tol': 0.001},
  p_3: {'C': 0.1, 'tol': 0.001},
  p_4: {'C': 0.1, 'tol': 0.001},
  p_6: {'C': 0.5, 'tol': 0.001},
  p_9: {'C': 0.1, 'tol': 0.001},
  p_12: {'C': 0.1, 'tol': 0.001},
  p_15: {'C': 0.1, 'tol': 0.001}},
 'BPI Challenge 2019': {source: {'C': 0.5, 'tol': 0.001},
  p_4: {'C': 0.1, 'tol': 0.001},
  p_9: {'C': 0.25, 'tol': 0.001}},
 'BPI Challenge 2012': {p_5: {'C': 0.5, 'tol': 0.001},
  p_7: {'C': 0.5, 'tol': 0.001},
  p_4: {'C': 0.1, 'tol': 0.001},
  p_6: {'C': 0.1, 'tol': 0.001}},
 'BPI Challenge 2017': {p_5: {'C': 0.25, 'tol': 0.001},
  p_4: {'C': 0.1, 'tol': 0.001},
  p_6: {'C': 0.1, 'tol': 0.001}}}

#### Neural Network:

In [100]:
best_values_nn

{'Road Traffic Fine Management Process': {p_5: {'hidden_layer_sizes': (5, 5),
   'learning_rate': 'adaptive'},
  p_7: {'hidden_layer_sizes': (10, 10), 'learning_rate': 'invscaling'},
  p_14: {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'},
  p_3: {'hidden_layer_sizes': (10, 10), 'learning_rate': 'adaptive'},
  p_4: {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'},
  p_6: {'hidden_layer_sizes': (10, 10), 'learning_rate': 'invscaling'},
  p_9: {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'},
  p_12: {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'},
  p_15: {'hidden_layer_sizes': (10, 10), 'learning_rate': 'constant'}},
 'BPI Challenge 2019': {source: {'hidden_layer_sizes': (10, 10),
   'learning_rate': 'constant'},
  p_4: {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'},
  p_9: {'hidden_layer_sizes': (5,), 'learning_rate': 'constant'}},
 'BPI Challenge 2012': {p_5: {'hidden_layer_sizes': (5, 5),
   'learning_rate': 'invscaling'},
  p_7: {'hi

#### Random Forest:

In [101]:
best_values_rf

{'Road Traffic Fine Management Process': {p_5: {'max_depth': 4,
   'min_impurity_decrease': 0},
  p_7: {'max_depth': 2, 'min_impurity_decrease': 0},
  p_14: {'max_depth': 1, 'min_impurity_decrease': 0},
  p_3: {'max_depth': 2, 'min_impurity_decrease': 0.01},
  p_4: {'max_depth': 1, 'min_impurity_decrease': 0},
  p_6: {'max_depth': 6, 'min_impurity_decrease': 0},
  p_9: {'max_depth': 1, 'min_impurity_decrease': 0},
  p_12: {'max_depth': 1, 'min_impurity_decrease': 0.05},
  p_15: {'max_depth': 1, 'min_impurity_decrease': 0}},
 'BPI Challenge 2019': {source: {'max_depth': 6, 'min_impurity_decrease': 0},
  p_4: {'max_depth': 3, 'min_impurity_decrease': 0},
  p_9: {'max_depth': 6, 'min_impurity_decrease': 0}},
 'BPI Challenge 2012': {p_5: {'max_depth': 6, 'min_impurity_decrease': 0},
  p_7: {'max_depth': 6, 'min_impurity_decrease': 0},
  p_4: {'max_depth': 6, 'min_impurity_decrease': 0},
  p_6: {'max_depth': 1, 'min_impurity_decrease': 0}},
 'BPI Challenge 2017': {p_5: {'max_depth': 1, 'min

#### XGBoost:

In [102]:
best_values_xgb

{'Road Traffic Fine Management Process': {p_5: {'max_depth': 2,
   'n_estimators': 100},
  p_7: {'max_depth': 1, 'n_estimators': 100},
  p_14: {'max_depth': 1, 'n_estimators': 20},
  p_3: {'max_depth': 1, 'n_estimators': 20},
  p_4: {'max_depth': 1, 'n_estimators': 20},
  p_6: {'max_depth': 1, 'n_estimators': 100},
  p_9: {'max_depth': 1, 'n_estimators': 20},
  p_12: {'max_depth': 1, 'n_estimators': 20},
  p_15: {'max_depth': 1, 'n_estimators': 20}},
 'BPI Challenge 2019': {source: {'max_depth': 6, 'n_estimators': 150},
  p_4: {'max_depth': 1, 'n_estimators': 20},
  p_9: {'max_depth': 2, 'n_estimators': 100}},
 'BPI Challenge 2012': {p_5: {'max_depth': 2, 'n_estimators': 20},
  p_7: {'max_depth': 2, 'n_estimators': 20},
  p_4: {'max_depth': 4, 'n_estimators': 100},
  p_6: {'max_depth': 1, 'n_estimators': 20}},
 'BPI Challenge 2017': {p_5: {'max_depth': 6, 'n_estimators': 150},
  p_4: {'max_depth': 1, 'n_estimators': 20},
  p_6: {'max_depth': 6, 'n_estimators': 150}}}