## Hyperparameter Analysis for Event Logs

In [1]:
# set up
best_values_dt =  {}
best_values_lr =  {}
best_values_svm =  {}
best_values_nn =  {}
best_values_rf =  {}
best_values_xgb =  {}

### Road Traffic Fine Management Process

#### Data Preparation 

In [2]:
import pm4py
# source log: https://data.4tu.nl/articles/dataset/Road_Traffic_Fine_Management_Process/12683249/1
log_all_traffic = pm4py.read_xes('Road_Traffic_Fine_Management_Process.xes.gz')

parsing log, completed traces ::   0%|          | 0/150370 [00:00<?, ?it/s]

In [3]:
last_timestamp_traffic = max(event["time:timestamp"] for trace in log_all_traffic for event in trace) 
print("Last Timestamp:", last_timestamp_traffic)
first_timestamp_traffic = min(event["time:timestamp"] for trace in log_all_traffic for event in trace)
print("First Timestamp:", first_timestamp_traffic) 

time_filtered_log_traffic = pm4py.filter_time_range(log_all_traffic, "2012-01-01 00:00:00", "2013-06-18 00:00:00", mode='traces_contained')

Last Timestamp: 2013-06-18 00:00:00+02:00
First Timestamp: 2000-01-01 00:00:00+01:00


In [4]:
print(f'activities: {list(pm4py.get_event_attribute_values(time_filtered_log_traffic,"concept:name").keys())}')
print("Number of different trace variants: ", len(pm4py.get_variants_as_tuples(time_filtered_log_traffic)))

activities: ['Create Fine', 'Send Fine', 'Insert Fine Notification', 'Add penalty', 'Send for Credit Collection', 'Payment', 'Insert Date Appeal to Prefecture', 'Send Appeal to Prefecture', 'Receive Result Appeal from Prefecture', 'Appeal to Judge', 'Notify Result Appeal to Offender']
Number of different trace variants:  40


In [5]:
log_start_traffic = pm4py.get_start_activities(time_filtered_log_traffic)
print(log_start_traffic)

end_activities_traffic = pm4py.get_end_activities(time_filtered_log_traffic)
print(end_activities_traffic)

{'Create Fine': 5558}
{'Send Fine': 1351, 'Send for Credit Collection': 169, 'Payment': 3719, 'Receive Result Appeal from Prefecture': 10, 'Send Appeal to Prefecture': 283, 'Notify Result Appeal to Offender': 26}


In [6]:
# We discard less frequent variants in order to reduce
# the number of edge cases.
log_top_k_traffic = pm4py.filter_variants_top_k(time_filtered_log_traffic, 10)
pm4py.write_xes(log_top_k_traffic, "Road_Traffic_Fine_Management_Process_filtered.xes")
len(log_top_k_traffic) 

exporting log, completed traces ::   0%|          | 0/5485 [00:00<?, ?it/s]

5485

#### Analysis

In [7]:
import pm4py
log_traffic = pm4py.read_xes("Road_Traffic_Fine_Management_Process_filtered.xes")

parsing log, completed traces ::   0%|          | 0/5485 [00:00<?, ?it/s]

In [14]:
numeric_attributes_traffic = ["amount", "expense", "totalPaymentAmount", "points"]
for attr in numeric_attributes_traffic:
    for trace in log_traffic:
        for event in trace:
            try:
                event[f'{attr}_num'] = float(event[attr])
            except KeyError:
                pass

In [16]:
from exdpn.petri_net import get_petri_net
net_traffic, im_traffic, fm_traffic = get_petri_net(log_traffic, miner_type='IM')

In [17]:
event_attrs_traffic = list(pm4py.get_event_attributes(log_traffic))
trace_attrs_traffic = list(pm4py.get_trace_attributes(log_traffic))
event_attrs_traffic = [attr for attr in event_attrs_traffic if max(list(pm4py.get_event_attribute_values(log_traffic, attr).values())) != 1 and "ID" not in attr]
event_attrs_traffic.remove("time:timestamp")
event_attrs_traffic.remove("org:resource")
for attr in numeric_attributes_traffic:
    event_attrs_traffic.remove(attr)
trace_attrs_traffic = [attr for attr in trace_attrs_traffic if max(list(pm4py.get_trace_attribute_values(log_traffic, attr).values())) != 1 and "ID" not in attr]

#### Hyperparameter Selection

In [18]:
from sklearn.model_selection import GridSearchCV
import numpy as np 
from exdpn.petri_net import get_petri_net
from exdpn.guard_datasets import extract_all_datasets
from exdpn.data_preprocessing import basic_data_preprocessing
from exdpn.data_preprocessing.data_preprocessing import apply_ohe, apply_scaling, fit_scaling, fit_ohe
from exdpn.guards import ML_Technique

# prepare data
dp_dataset_map_traffic = extract_all_datasets(log= log_traffic, net=net_traffic, initial_marking=im_traffic, final_marking=fm_traffic, 
                                      event_level_attributes = event_attrs_traffic,
                                      case_level_attributes=trace_attrs_traffic)

# decision points 
dps_traffic = list(dp_dataset_map_traffic.keys())
total_size_traffic = sum(len(dp_dataset_map_traffic[key]) for key in dps_traffic)

##### Cross-Validation Decision Tree (with respect to explainablility):

We have observed that decision trees tend to be enormously large without any hyperparameters. We thus try to find an optimal `min_impurity_decrease` value.

In [28]:
from sklearn.tree import DecisionTreeClassifier
parameters = {'min_impurity_decrease':(0, 0.01, 0.05, 0.1, 0.15)}

best_values_dt_traffic = {}

for dp in dps_traffic:
    X_train, y_train = [], []
    dp_dataset = dp_dataset_map_traffic[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute = True)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    dt_base = DecisionTreeClassifier()
    dt_grid = GridSearchCV(dt_base, parameters, n_jobs=-1, scoring='f1_weighted')

    dt_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(dt_grid.best_score_,4)}\t {dt_grid.best_params_}")
    best_values_dt_traffic[dp] = dt_grid.best_params_

dt_param = {param: sum(val[param]*len(dp_dataset_map_traffic[key]) for key,val in best_values_dt_traffic.items())/total_size_traffic for param in parameters.keys()}
print(f"final value: {dt_param}")

best_values_dt["Road Traffic Fine Management Process"] = best_values_dt_traffic

p_5:	0.8089	 {'min_impurity_decrease': 0}
p_3:	0.5834	 {'min_impurity_decrease': 0.01}
p_12:	0.9322	 {'min_impurity_decrease': 0.1}
p_14:	0.9486	 {'min_impurity_decrease': 0}
p_15:	1.0	 {'min_impurity_decrease': 0}
p_4:	0.9972	 {'min_impurity_decrease': 0.01}
p_6:	0.9408	 {'min_impurity_decrease': 0.01}
p_7:	1.0	 {'min_impurity_decrease': 0}
p_9:	0.9563	 {'min_impurity_decrease': 0.01}
final value: {'min_impurity_decrease': 0.022316062176165805}


##### Cross-Validation Neural Network

In [30]:
from sklearn.neural_network import MLPClassifier
parameters = {'hidden_layer_sizes': ((5, ), (5, 5), (10, 10))}

best_values_nn_traffic = {}
max_ds_size_traffic = -1
max_ds_key_traffic = None

for dp in dps_traffic:
    dp_dataset = dp_dataset_map_traffic[dp]
    if len(dp_dataset) > max_ds_size_traffic: max_ds_key_traffic = dp
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute = True)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    nn_base = MLPClassifier()
    nn_grid = GridSearchCV(nn_base, parameters, n_jobs=-1, scoring='f1_weighted')

    nn_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(nn_grid.best_score_,4)}\t {nn_grid.best_params_}")
    best_values_nn_traffic[dp] = nn_grid.best_params_

nn_param = {param: best_values_nn_traffic[max_ds_key_traffic][param] for param in parameters.keys()}
print(f"final value: {nn_param}")

best_values_nn["Road Traffic Fine Management Process"] = best_values_nn_traffic

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


p_5:	0.8077	 {'hidden_layer_sizes': (5, 5)}


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


p_3:	0.6091	 {'hidden_layer_sizes': (10, 10)}
p_12:	0.8914	 {'hidden_layer_sizes': (5,)}


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


p_14:	0.9486	 {'hidden_layer_sizes': (5,)}


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


p_15:	1.0	 {'hidden_layer_sizes': (5, 5)}
p_4:	0.9972	 {'hidden_layer_sizes': (5,)}
p_6:	0.9484	 {'hidden_layer_sizes': (10, 10)}
p_7:	1.0	 {'hidden_layer_sizes': (10, 10)}
p_9:	0.9532	 {'hidden_layer_sizes': (10, 10)}
final value: {'hidden_layer_sizes': (10, 10)}


##### Cross-Validation Logistic Regression:

In [32]:
from sklearn.linear_model import LogisticRegression
parameters = {'C': (0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values_lr_traffic = {}

for dp in dps_traffic:
    dp_dataset = dp_dataset_map_traffic[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute = True)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    lr_base = LogisticRegression()
    lr_grid = GridSearchCV(lr_base, parameters, n_jobs=-1, scoring='f1_weighted')

    lr_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(lr_grid.best_score_,4)}\t {lr_grid.best_params_}")
    best_values_lr_traffic[dp] = lr_grid.best_params_

lr_param = {param: sum(val[param]*len(dp_dataset_map_traffic[key]) for key,val in best_values_lr_traffic.items())/total_size_traffic for param in parameters.keys()}
print(f"final value: {lr_param}")

best_values_lr["Road Traffic Fine Management Process"] = best_values_lr_traffic

p_5:	0.4969	 {'C': 0.1, 'tol': 0.001}
p_3:	0.5448	 {'C': 0.5, 'tol': 0.001}
p_12:	0.8904	 {'C': 0.1, 'tol': 0.001}
p_14:	0.9486	 {'C': 0.1, 'tol': 0.001}
p_15:	1.0	 {'C': 0.1, 'tol': 0.001}
p_4:	0.9972	 {'C': 0.1, 'tol': 0.001}
p_6:	0.9448	 {'C': 0.1, 'tol': 0.001}
p_7:	1.0	 {'C': 0.25, 'tol': 0.001}
p_9:	0.9563	 {'C': 0.1, 'tol': 0.001}
final value: {'C': 0.19298126743722596, 'tol': 0.0009999999999999998}


##### Cross-Validation SVM:

In [33]:
from sklearn.svm import LinearSVC
parameters = {'C':(0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values_svm_traffic = {}

for dp in dps_traffic:
    dp_dataset = dp_dataset_map_traffic[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute = True)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    svm_base = LinearSVC()
    svm_grid = GridSearchCV(svm_base, parameters, n_jobs=-1, scoring='f1_weighted')

    svm_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(svm_grid.best_score_,4)}\t {svm_grid.best_params_}")
    best_values_svm_traffic[dp] = svm_grid.best_params_

svm_param = {param: sum(val[param]*len(dp_dataset_map_traffic[key]) for key,val in best_values_svm_traffic.items())/total_size_traffic for param in parameters.keys()}
print(f"final value: {svm_param}")

best_values_svm["Road Traffic Fine Management Process"] = best_values_svm_traffic

p_5:	0.4969	 {'C': 0.1, 'tol': 0.001}
p_3:	0.5448	 {'C': 0.1, 'tol': 0.001}
p_12:	0.8904	 {'C': 0.1, 'tol': 0.001}
p_14:	0.9486	 {'C': 0.1, 'tol': 0.001}
p_15:	1.0	 {'C': 0.1, 'tol': 0.001}
p_4:	0.9972	 {'C': 0.1, 'tol': 0.001}


Liblinear failed to converge, increase the number of iterations.


p_6:	0.9469	 {'C': 0.5, 'tol': 0.0005}
p_7:	1.0	 {'C': 0.1, 'tol': 0.001}
p_9:	0.9563	 {'C': 0.1, 'tol': 0.001}
final value: {'C': 0.18744519728975692, 'tol': 0.0008906935033878038}


##### Cross-Validation XGBoost:

In [35]:
from xgboost import XGBClassifier
parameters = {'max_depth': (1, 2, 3, 4, 6), 'n_estimators': (20,100,150)}

best_values_xgb_traffic = {}

for dp in dps_traffic:
    dp_dataset = dp_dataset_map_traffic[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute = True)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    xgb_base = XGBClassifier()
    xgb_grid = GridSearchCV(xgb_base, parameters, n_jobs=-1, scoring='f1_weighted')

    xgb_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(xgb_grid.best_score_,4)}\t {xgb_grid.best_params_}")
    best_values_xgb_traffic[dp] = xgb_grid.best_params_

xgb_param = {param: round(sum(val[param]*len(dp_dataset_map_traffic[key]) for key,val in best_values_xgb_traffic.items())/total_size_traffic) for param in parameters.keys()}
print(f"final value: {xgb_param}")

best_values_xgb["Road Traffic Fine Management Process"] = best_values_xgb_traffic

p_5:	0.8081	 {'max_depth': 2, 'n_estimators': 100}
p_3:	0.5568	 {'max_depth': 1, 'n_estimators': 20}
p_12:	0.8904	 {'max_depth': 1, 'n_estimators': 20}
p_14:	0.9486	 {'max_depth': 1, 'n_estimators': 20}
p_15:	1.0	 {'max_depth': 1, 'n_estimators': 20}
p_4:	0.9972	 {'max_depth': 1, 'n_estimators': 20}
p_6:	0.9474	 {'max_depth': 1, 'n_estimators': 100}
p_7:	1.0	 {'max_depth': 1, 'n_estimators': 100}
p_9:	0.9563	 {'max_depth': 1, 'n_estimators': 20}
final value: {'max_depth': 1, 'n_estimators': 48}


##### Cross-Validation Random Forest:

In [37]:
from sklearn.ensemble import RandomForestClassifier
parameters = {'max_depth': (1, 2, 3, 4, 6), 'min_impurity_decrease': (0, 0.01, 0.05, 0.1, 0.15)}

best_values_rf_traffic = {}

for dp in dps_traffic:
    dp_dataset = dp_dataset_map_traffic[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute = True)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    rf_base = RandomForestClassifier()
    rf_grid = GridSearchCV(rf_base, parameters, n_jobs=-1, scoring='f1_weighted')

    rf_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(rf_grid.best_score_,4)}\t {rf_grid.best_params_}")
    best_values_rf_traffic[dp] = rf_grid.best_params_

rf_param = {param: sum(val[param]*len(dp_dataset_map_traffic[key]) for key,val in best_values_rf_traffic.items())/total_size_traffic for param in parameters.keys()}
rf_param['max_depth'] = round(rf_param['max_depth'])
print(f"final value: {rf_param}")

best_values_rf["Road Traffic Fine Management Process"] = best_values_rf_traffic

p_5:	0.8089	 {'max_depth': 4, 'min_impurity_decrease': 0}
p_3:	0.5789	 {'max_depth': 3, 'min_impurity_decrease': 0.01}
p_12:	0.9322	 {'max_depth': 1, 'min_impurity_decrease': 0.1}
p_14:	0.9486	 {'max_depth': 1, 'min_impurity_decrease': 0}
p_15:	1.0	 {'max_depth': 1, 'min_impurity_decrease': 0}
p_4:	0.9972	 {'max_depth': 1, 'min_impurity_decrease': 0}
p_6:	0.944	 {'max_depth': 6, 'min_impurity_decrease': 0}
p_7:	1.0	 {'max_depth': 2, 'min_impurity_decrease': 0}
p_9:	0.9563	 {'max_depth': 1, 'min_impurity_decrease': 0}
final value: {'max_depth': 3, 'min_impurity_decrease': 0.017574730968513353}


--------------------------------------------------------------------------------------------------------------------------------------------------------

### BPI Challenge 2012 

#### Data Preparation

In [38]:
import pm4py
# source log: https://data.4tu.nl/articles/dataset/BPI_Challenge_2012/12689204/1
log_all_bpi_2012 = pm4py.read_xes('BPI_Challenge_2012.xes')

parsing log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

In [39]:
print(f'activities: {sorted(list(pm4py.get_event_attribute_values(log_all_bpi_2012,"concept:name").keys()))}')

activities: ['A_ACCEPTED', 'A_ACTIVATED', 'A_APPROVED', 'A_CANCELLED', 'A_DECLINED', 'A_FINALIZED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'A_REGISTERED', 'A_SUBMITTED', 'O_ACCEPTED', 'O_CANCELLED', 'O_CREATED', 'O_DECLINED', 'O_SELECTED', 'O_SENT', 'O_SENT_BACK', 'W_Afhandelen leads', 'W_Beoordelen fraude', 'W_Completeren aanvraag', 'W_Nabellen incomplete dossiers', 'W_Nabellen offertes', 'W_Valideren aanvraag', 'W_Wijzigen contractgegevens']


In [40]:
# We only look at subtraces of activities starting with an 'A'.
log_bpi_a_2012 = pm4py.filter_event_attribute_values(
    log_all_bpi_2012,
    "concept:name",
    ['A_ACCEPTED', 'A_ACTIVATED', 'A_APPROVED', 'A_CANCELLED', 'A_DECLINED', 'A_FINALIZED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'A_REGISTERED', 'A_SUBMITTED'],
    level="event",
    retain=True)

In [41]:
pm4py.write_xes(log_bpi_a_2012, "BPI_Challenge_2012_only_A.xes")

exporting log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

#### Analysis

In [42]:
import pm4py
log_bpi_2012 = pm4py.read_xes('BPI_Challenge_2012_only_A.xes')

parsing log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

In [43]:
for trace in log_all_bpi_2012:
    trace.attributes['AMOUNT_REQ_NUM'] = float(trace.attributes['AMOUNT_REQ'])

In [44]:
from exdpn.petri_net import get_petri_net
net_bpi_2012, im_bpi_2012, fm_bpi_2012 = get_petri_net(log_bpi_2012, miner_type='IM')

In [45]:
event_attrs_bpi_2012 = list(pm4py.get_event_attributes(log_bpi_2012))
trace_attrs_bpi_2012 = list(pm4py.get_trace_attributes(log_bpi_2012))
trace_attrs_bpi_2012.remove("REG_DATE")
trace_attrs_bpi_2012.remove("AMOUNT_REQ")
event_attrs_bpi_2012 = [attr for attr in event_attrs_bpi_2012 if max(list(pm4py.get_event_attribute_values(log_bpi_2012, attr).values())) != 1 and "ID" not in attr]
event_attrs_bpi_2012.remove("time:timestamp")
event_attrs_bpi_2012.remove("concept:name")

#### Hyperparameter Selection

In [46]:
from sklearn.model_selection import GridSearchCV
import numpy as np 
from exdpn.petri_net import get_petri_net
from exdpn.guard_datasets import extract_all_datasets
from exdpn.data_preprocessing import basic_data_preprocessing
from exdpn.data_preprocessing.data_preprocessing import apply_ohe, apply_scaling, fit_scaling, fit_ohe

# prepare data
dp_dataset_map_bpi_2012 = extract_all_datasets(log= log_bpi_2012, net=net_bpi_2012, initial_marking=im_bpi_2012, final_marking=fm_bpi_2012, 
                                      event_level_attributes = event_attrs_bpi_2012,
                                      case_level_attributes=trace_attrs_bpi_2012)

# decision points 
dps_bpi_2012 = list(dp_dataset_map_bpi_2012.keys())
total_size_bpi_2012 = sum(len(dp_dataset_map_bpi_2012[key]) for key in dps_bpi_2012)

##### Cross-Validation Decision Tree (with respect to explainablility):

We have observed that decision trees tend to be enormously large without any hyperparameters. We thus try to find an optimal `min_impurity_decrease` value.

In [47]:
from sklearn.tree import DecisionTreeClassifier
parameters = {'min_impurity_decrease': (0, 0.01, 0.05, 0.1, 0.15)}

best_values_dt_bpi_2012 = {}

for dp in dps_bpi_2012:
    dp_dataset = dp_dataset_map_bpi_2012[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    dt_base = DecisionTreeClassifier()
    dt_grid = GridSearchCV(dt_base, parameters, n_jobs=-1, scoring='f1_weighted')

    dt_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(dt_grid.best_score_,4)}\t {dt_grid.best_params_}")
    best_values_dt_bpi_2012[dp] = dt_grid.best_params_

dt_param = {param: sum(val[param]*len(dp_dataset_map_bpi_2012[key]) for key,val in best_values_dt_bpi_2012.items())/total_size_bpi_2012 for param in parameters.keys()}
print(f"final value: {dt_param}")

best_values_dt["BPI Challenge 2012"] = best_values_dt_bpi_2012

p_5:	0.8291	 {'min_impurity_decrease': 0.01}
p_6:	0.9925	 {'min_impurity_decrease': 0}
p_4:	0.4055	 {'min_impurity_decrease': 0}
p_7:	0.7109	 {'min_impurity_decrease': 0}
final value: {'min_impurity_decrease': 0.0025}


##### Cross-Validation SVM:

In [48]:
from sklearn.svm import LinearSVC
parameters = {'C':(0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values_svm_bpi_2012 = {}

for dp in dps_bpi_2012:
    dp_dataset = dp_dataset_map_bpi_2012[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    svm_base = LinearSVC()
    svm_grid = GridSearchCV(svm_base, parameters, n_jobs=-1, scoring='f1_weighted')

    svm_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(svm_grid.best_score_,4)}\t {svm_grid.best_params_}")

    best_values_svm_bpi_2012[dp] = svm_grid.best_params_

svm_param = {param: sum(val[param]*len(dp_dataset_map_bpi_2012[key]) for key,val in best_values_svm_bpi_2012.items())/total_size_bpi_2012 for param in parameters.keys()}
print(f"final value: {svm_param}")

best_values_svm["BPI Challenge 2012"] = best_values_svm_bpi_2012

p_5:	0.826	 {'C': 0.1, 'tol': 0.001}
p_6:	0.9925	 {'C': 0.1, 'tol': 0.001}
p_4:	0.4055	 {'C': 0.1, 'tol': 0.001}
p_7:	0.7127	 {'C': 0.5, 'tol': 0.001}
final value: {'C': 0.2, 'tol': 0.001}


##### Cross-Validation Neural Network:

In [49]:
from sklearn.neural_network import MLPClassifier
parameters = {'hidden_layer_sizes': ((5, ), (5, 5), (10, 10))}

best_values_nn_bpi_2012 = {}
max_ds_size_bpi_2012 = -1
max_ds_key_bpi_2012 = None

for dp in dps_bpi_2012:
    dp_dataset = dp_dataset_map_bpi_2012[dp]
    if len(dp_dataset) > max_ds_size_bpi_2012: max_ds_key_bpi_2012 = dp
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    nn_base = MLPClassifier()
    nn_grid = GridSearchCV(nn_base, parameters, n_jobs=-1, scoring='f1_weighted')

    nn_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(nn_grid.best_score_,4)}\t {nn_grid.best_params_}")
    best_values_nn_bpi_2012[dp] = nn_grid.best_params_

nn_param = {param: best_values_nn_bpi_2012[max_ds_key_bpi_2012][param] for param in parameters.keys()}
print(f"final value: {nn_param}")

best_values_nn["BPI Challenge 2012"] = best_values_nn_bpi_2012

p_5:	0.8261	 {'hidden_layer_sizes': (5, 5)}
p_6:	0.9925	 {'hidden_layer_sizes': (5,)}
p_4:	0.4055	 {'hidden_layer_sizes': (5,)}
p_7:	0.7138	 {'hidden_layer_sizes': (10, 10)}
final value: {'hidden_layer_sizes': (10, 10)}


##### Cross-Validation Logistic Regression:

In [50]:
from sklearn.linear_model import LogisticRegression
parameters = {'C': (0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values_lr_bpi_2012 = {}

for dp in dps_bpi_2012:
    dp_dataset = dp_dataset_map_bpi_2012[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    lr_base = LogisticRegression()
    lr_grid = GridSearchCV(lr_base, parameters, n_jobs=-1, scoring='f1_weighted')

    lr_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(lr_grid.best_score_,4)}\t {lr_grid.best_params_}")
    best_values_lr_bpi_2012[dp] = lr_grid.best_params_

lr_param = {param: sum(val[param]*len(dp_dataset_map_bpi_2012[key]) for key,val in best_values_lr_bpi_2012.items())/total_size_bpi_2012 for param in parameters.keys()}
print(f"final value: {lr_param}")

best_values_lr["BPI Challenge 2012"] = best_values_lr_bpi_2012

p_5:	0.8265	 {'C': 0.1, 'tol': 0.001}
p_6:	0.9925	 {'C': 0.1, 'tol': 0.001}
p_4:	0.4055	 {'C': 0.1, 'tol': 0.001}
p_7:	0.7106	 {'C': 0.5, 'tol': 0.001}
final value: {'C': 0.2, 'tol': 0.001}


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


##### Cross-Validation XGBoost:

In [51]:
from xgboost import XGBClassifier
parameters = {'max_depth': (1, 2, 3, 4, 6), 'n_estimators': (20,100,150)}

best_values_xgb_bpi_2012 = {}

for dp in dps_bpi_2012:
    dp_dataset = dp_dataset_map_bpi_2012[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    xgb_base = XGBClassifier()
    xgb_grid = GridSearchCV(xgb_base, parameters, n_jobs=-1, scoring='f1_weighted')

    xgb_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(xgb_grid.best_score_,4)}\t {xgb_grid.best_params_}")
    best_values_xgb_bpi_2012[dp] = xgb_grid.best_params_

xgb_param = {param: round(sum(val[param]*len(dp_dataset_map_bpi_2012[key]) for key,val in best_values_xgb_bpi_2012.items())/total_size_bpi_2012) for param in parameters.keys()}
print(f"final value: {xgb_param}")

best_values_xgb["BPI Challenge 2012"] = best_values_xgb_bpi_2012

p_5:	0.8291	 {'max_depth': 1, 'n_estimators': 20}
p_6:	0.9925	 {'max_depth': 1, 'n_estimators': 20}
p_4:	0.4055	 {'max_depth': 1, 'n_estimators': 20}
p_7:	0.7118	 {'max_depth': 6, 'n_estimators': 100}
final value: {'max_depth': 2, 'n_estimators': 40}


##### Cross-Validation Random Forest:

In [52]:
from sklearn.ensemble import RandomForestClassifier
parameters = {'max_depth': (1, 2, 3, 4, 6), 'min_impurity_decrease': (0, 0.01, 0.05, 0.1, 0.15)}

best_values_rf_bpi_2012 = {}

for dp in dps_bpi_2012:
    dp_dataset = dp_dataset_map_bpi_2012[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    rf_base = RandomForestClassifier()
    rf_grid = GridSearchCV(rf_base, parameters, n_jobs=-1, scoring='f1_weighted')

    rf_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(rf_grid.best_score_,4)}\t {rf_grid.best_params_}")
    best_values_rf_bpi_2012[dp] = rf_grid.best_params_

rf_param = {param: sum(val[param]*len(dp_dataset_map_bpi_2012[key]) for key,val in best_values_rf_bpi_2012.items())/total_size_bpi_2012 for param in parameters.keys()}
rf_param['max_depth'] = round(rf_param['max_depth'])
print(f"final value: {rf_param}")

best_values_rf["BPI Challenge 2012"] = best_values_rf_bpi_2012

p_5:	0.8291	 {'max_depth': 1, 'min_impurity_decrease': 0}
p_6:	0.9925	 {'max_depth': 1, 'min_impurity_decrease': 0}
p_4:	0.4055	 {'max_depth': 1, 'min_impurity_decrease': 0}
p_7:	0.7072	 {'max_depth': 6, 'min_impurity_decrease': 0}
final value: {'max_depth': 2, 'min_impurity_decrease': 0.0}


----------------------------------------------------------------------------------

### BPI Challenge 2019

#### Data Preparation

In [56]:
import pm4py
# source log: https://data.4tu.nl/articles/dataset/BPI_Challenge_2019/12715853/1
log_all_bpi_2019 = pm4py.read_xes('BPI_Challenge_2019.xes')

parsing log, completed traces ::   0%|          | 0/251734 [00:00<?, ?it/s]

In [57]:
last_timestamp_bpi_2019 = max(event["time:timestamp"] for trace in log_all_bpi_2019 for event in trace)
print("Last Timestamp:", last_timestamp_bpi_2019)
first_timestamp_bpi_2019 = min(event["time:timestamp"] for trace in log_all_bpi_2019 for event in trace)
print("First Timestamp:", first_timestamp_bpi_2019)

time_filtered_log_bpi_2019 = pm4py.filter_time_range(log_all_bpi_2019, "2018-09-01 00:00:00", "2019-01-01 00:00:00", mode='traces_contained')

Last Timestamp: 2020-04-09 21:59:00+00:00
First Timestamp: 1948-01-26 22:59:00+00:00


In [58]:
print(f'activities: {list(pm4py.get_event_attribute_values(time_filtered_log_bpi_2019,"concept:name").keys())}')
print("Number of different trace variants: ", len(pm4py.get_variants_as_tuples(time_filtered_log_bpi_2019)))

activities: ['Create Purchase Order Item', 'Delete Purchase Order Item', 'Vendor creates invoice', 'Record Goods Receipt', 'Record Invoice Receipt', 'Clear Invoice', 'Remove Payment Block', 'Cancel Goods Receipt', 'Change Quantity', 'Vendor creates debit memo', 'Cancel Invoice Receipt', 'Change Price', 'Receive Order Confirmation', 'Change Storage Location', 'Change Delivery Indicator', 'Block Purchase Order Item', 'Create Purchase Requisition Item', 'Reactivate Purchase Order Item', 'Update Order Confirmation', 'Record Service Entry Sheet', 'SRM: Created', 'SRM: Complete', 'SRM: Awaiting Approval', 'SRM: Document Completed', 'SRM: Ordered', 'SRM: In Transfer to Execution Syst.', 'SRM: Change was Transmitted', 'SRM: Deleted', 'SRM: Transaction Completed', 'Cancel Subsequent Invoice', 'Change Approval for Purchase Order', 'Set Payment Block', 'Release Purchase Order', 'Record Subsequent Invoice', 'Change payment term', 'Change Final Invoice Indicator', 'Release Purchase Requisition']
Nu

In [59]:
from exdpn.petri_net import get_petri_net
time_filtered_log_sub_bpi_2019 = pm4py.filter_event_attribute_values(
    time_filtered_log_bpi_2019,
    "concept:name",
    ['Create Purchase Order Item', 'Vendor creates invoice', 'Record Goods Receipt', 'Record Invoice Receipt', 
    'Clear Invoice', 'Record Service Entry Sheet', 'Cancel Goods Receipt', 
    'Vendor creates debit memo', 'Cancel Invoice Receipt', 'Change Delivery Indicator', 'Remove Payment Block', 
    'Change Price', 'Delete Purchase Order Item', 'Change Quantity', 
    'Change Final Invoice Indicator', 'Receive Order Confirmation', 'Cancel Subsequent Invoice', 
    'Reactivate Purchase Order Item', 'Update Order Confirmation', 'Block Purchase Order Item', 
    'Change Approval for Purchase Order', 'Release Purchase Order', 'Record Subsequent Invoice', 'Set Payment Block', 
    'Create Purchase Requisition Item', 'Change Storage Location', 'Change Currency', 'Change payment term', 
    'Change Rejection Indicator', 'Release Purchase Requisition'],
    level="event",
    retain=True)

print("Number of different trace variants - subtraces: ", len(pm4py.get_variants_as_tuples(time_filtered_log_sub_bpi_2019)))

Number of different trace variants - subtraces:  2408


In [60]:
log_start_bpi_2019 = pm4py.get_start_activities(time_filtered_log_sub_bpi_2019)
print(log_start_bpi_2019)
filtered_log_start_bpi_2019 = pm4py.filter_start_activities(time_filtered_log_sub_bpi_2019, 'Create Purchase Order Item')
print("Number of different trace variants - subtraces: ", len(pm4py.get_variants_as_tuples(filtered_log_start_bpi_2019)))

end_activities_bpi_2019 = pm4py.get_end_activities(filtered_log_start_bpi_2019)
print(end_activities_bpi_2019)
filtered_log_bpi_2019 = pm4py.filter_end_activities(filtered_log_start_bpi_2019, ["Clear Invoice"])

print("Number of different trace variants - subtraces: ", len(pm4py.get_variants_as_tuples(filtered_log_bpi_2019)))

{'Create Purchase Order Item': 19257, 'Vendor creates invoice': 464, 'Vendor creates debit memo': 5, 'Create Purchase Requisition Item': 26600, 'Release Purchase Order': 9, 'Change Approval for Purchase Order': 28}
Number of different trace variants - subtraces:  1382
{'Delete Purchase Order Item': 838, 'Create Purchase Order Item': 2482, 'Clear Invoice': 4535, 'Cancel Goods Receipt': 294, 'Record Goods Receipt': 3248, 'Record Invoice Receipt': 4915, 'Cancel Invoice Receipt': 32, 'Receive Order Confirmation': 103, 'Change Delivery Indicator': 36, 'Block Purchase Order Item': 17, 'Vendor creates invoice': 15, 'Remove Payment Block': 1184, 'Record Service Entry Sheet': 545, 'Change Price': 225, 'Change Approval for Purchase Order': 652, 'Change Quantity': 116, 'Change Storage Location': 14, 'Change payment term': 1, 'Cancel Subsequent Invoice': 4, 'Update Order Confirmation': 1}
Number of different trace variants - subtraces:  311


In [61]:
# We discard less frequent variants in order to reduce
# the number of edge cases.
filtered_log_top_k_bpi_2019 = pm4py.filter_variants_top_k(filtered_log_bpi_2019, 10)

pm4py.write_xes(filtered_log_top_k_bpi_2019, "BPI_Challenge_2019_filtered_top_k.xes")

exporting log, completed traces ::   0%|          | 0/3928 [00:01<?, ?it/s]

#### Analysis

In [62]:
import pm4py
log_bpi_2019 = pm4py.read_xes("BPI_Challenge_2019_filtered_top_k.xes")

parsing log, completed traces ::   0%|          | 0/3928 [00:00<?, ?it/s]

In [64]:
from exdpn.petri_net import get_petri_net
net_bpi_2019, im_bpi_2019, fm_bpi_2019 = get_petri_net(log_bpi_2019, miner_type='IM')

In [65]:
event_attrs_bpi_2019 = list(pm4py.get_event_attributes(log_bpi_2019))
trace_attrs_bpi_2019 = list(pm4py.get_trace_attributes(log_bpi_2019))
event_attrs_bpi_2019 = [attr for attr in event_attrs_bpi_2019 if max(list(pm4py.get_event_attribute_values(log_bpi_2019, attr).values())) != 1 and "ID" not in attr]
event_attrs_bpi_2019.remove("time:timestamp")
event_attrs_bpi_2019.remove("org:resource")
event_attrs_bpi_2019.remove("User")
trace_attrs_bpi_2019 = [attr for attr in trace_attrs_bpi_2019 if max(list(pm4py.get_trace_attribute_values(log_bpi_2019, attr).values())) != 1 and "ID" not in attr]
trace_attrs_bpi_2019.remove("Name")
trace_attrs_bpi_2019.remove("Item")
trace_attrs_bpi_2019.remove("Purchasing Document")

#### Hyperparameter Selection

In [66]:
from sklearn.model_selection import GridSearchCV
import numpy as np 
from exdpn.petri_net import get_petri_net
from exdpn.guard_datasets import extract_all_datasets
from exdpn.data_preprocessing import basic_data_preprocessing
from exdpn.data_preprocessing.data_preprocessing import apply_ohe, apply_scaling, fit_scaling, fit_ohe
from exdpn.guards import ML_Technique

# prepare data
dp_dataset_map_bpi_2019 = extract_all_datasets(log= log_bpi_2019, net=net_bpi_2019, initial_marking=im_bpi_2019, final_marking=fm_bpi_2019, 
                                      event_level_attributes = event_attrs_bpi_2019,
                                      case_level_attributes=trace_attrs_bpi_2019)

# decision points 
dps_bpi_2019 = list(dp_dataset_map_bpi_2019.keys())
total_size_bpi_2019 = sum(len(dp_dataset_map_bpi_2019[key]) for key in dps_bpi_2019)

##### Cross-Validation Decision Tree (with respect to explainablility):

We have observed that decision trees tend to be enormously large without any hyperparameters. We thus try to find an optimal `min_impurity_decrease` value.

In [67]:
from sklearn.tree import DecisionTreeClassifier
parameters = {'min_impurity_decrease':(0, 0.01, 0.05, 0.1, 0.15)}

best_values_dt_bpi_2019 = {}

for dp in dps_bpi_2019:
    dp_dataset = dp_dataset_map_bpi_2019[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    dt_base = DecisionTreeClassifier()
    dt_grid = GridSearchCV(dt_base, parameters, n_jobs=-1, scoring='f1_weighted')

    dt_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(dt_grid.best_score_,4)}\t {dt_grid.best_params_}")
    best_values_dt_bpi_2019[dp] = dt_grid.best_params_

dt_param = {param: sum(val[param]*len(dp_dataset_map_bpi_2019[key]) for key,val in best_values_dt_bpi_2019.items())/total_size_bpi_2019 for param in parameters.keys()}
print(f"final value: {dt_param}")

best_values_dt["BPI Challenge 2019"] = best_values_dt_bpi_2019

p_4:	0.9948	 {'min_impurity_decrease': 0}
p_8:	0.8236	 {'min_impurity_decrease': 0.01}
p_11:	1.0	 {'min_impurity_decrease': 0}
p_3:	0.9517	 {'min_impurity_decrease': 0}
final value: {'min_impurity_decrease': 0.0024920188992465842}


##### Cross-Validaion Neural Network:

In [68]:
from sklearn.neural_network import MLPClassifier
parameters = {'hidden_layer_sizes': ((5, ), (5, 5), (10, 10))}

best_values_nn_bpi_2019 = {}
max_ds_size_bpi_2019 = -1
max_ds_key_bpi_2019 = None

for dp in dps_bpi_2019:
    dp_dataset = dp_dataset_map_bpi_2019[dp]
    if len(dp_dataset) > max_ds_size_bpi_2019: max_ds_key_bpi_2019 = dp
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    nn_base = MLPClassifier()
    nn_grid = GridSearchCV(nn_base, parameters, n_jobs=-1, scoring='f1_weighted')

    nn_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(nn_grid.best_score_,4)}\t {nn_grid.best_params_}")
    best_values_nn_bpi_2019[dp] = nn_grid.best_params_

nn_param = {param: best_values_nn_bpi_2019[max_ds_key_bpi_2019][param] for param in parameters.keys()}
print(f"final value: {nn_param}")

best_values_nn["BPI Challenge 2019"] = best_values_nn_bpi_2019

p_4:	0.9995	 {'hidden_layer_sizes': (10, 10)}
p_8:	0.8244	 {'hidden_layer_sizes': (5, 5)}
p_11:	0.998	 {'hidden_layer_sizes': (5,)}
p_3:	0.9576	 {'hidden_layer_sizes': (5,)}
final value: {'hidden_layer_sizes': (5,)}


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


##### Cross-Validation Logistic Regression:

In [69]:
from sklearn.linear_model import LogisticRegression
parameters = {'C': (0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values_lr_bpi_2019 = {}

for dp in dps_bpi_2019:
    dp_dataset = dp_dataset_map_bpi_2019[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    lr_base = LogisticRegression()
    lr_grid = GridSearchCV(lr_base, parameters, n_jobs=-1, scoring='f1_weighted')

    lr_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(lr_grid.best_score_,4)}\t {lr_grid.best_params_}")
    best_values_lr_bpi_2019[dp] = lr_grid.best_params_

lr_param = {param: sum(val[param]*len(dp_dataset_map_bpi_2019[key]) for key,val in best_values_lr_bpi_2019.items())/total_size_bpi_2019 for param in parameters.keys()}
print(f"final value: {lr_param}")

best_values_lr["BPI Challenge 2019"] = best_values_lr_bpi_2019

p_4:	1.0	 {'C': 0.1, 'tol': 0.001}
p_8:	0.8452	 {'C': 0.1, 'tol': 0.001}
p_11:	1.0	 {'C': 0.25, 'tol': 0.001}
p_3:	0.9616	 {'C': 0.5, 'tol': 0.001}
final value: {'C': 0.23769952751883539, 'tol': 0.001}


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


##### Cross-Validation SVM:

In [70]:
from sklearn.svm import LinearSVC
parameters = {'C':(0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values_svm_bpi_2019 = {}

for dp in dps_bpi_2019:
    dp_dataset = dp_dataset_map_bpi_2019[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    svm_base = LinearSVC()
    svm_grid = GridSearchCV(svm_base, parameters, n_jobs=-1, scoring='f1_weighted')

    svm_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(svm_grid.best_score_,4)}\t {svm_grid.best_params_}")
    best_values_svm_bpi_2019[dp] = svm_grid.best_params_

svm_param = {param: sum(val[param]*len(dp_dataset_map_bpi_2019[key]) for key,val in best_values_svm_bpi_2019.items())/total_size_bpi_2019 for param in parameters.keys()}
print(f"final value: {svm_param}")

best_values_svm["BPI Challenge 2019"] = best_values_svm_bpi_2019

p_4:	0.995	 {'C': 0.1, 'tol': 0.001}
p_8:	0.8273	 {'C': 0.1, 'tol': 0.001}
p_11:	1.0	 {'C': 0.1, 'tol': 0.001}
p_3:	0.9651	 {'C': 0.1, 'tol': 0.001}
final value: {'C': 0.1, 'tol': 0.001}


##### Cross-Validation XGBoost:

In [71]:
import re
from xgboost import XGBClassifier

parameters = {'max_depth': (1, 2, 3, 4, 6), 'n_estimators': (20,100,150)}

# data sets contain special characters which xgb boost cannot handel
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

best_values_xgb_bpi_2019 = {}

for dp in dps_bpi_2019: 
    dp_dataset = dp_dataset_map_bpi_2019[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # remove special characters
    # solution from: https://stackoverflow.com/questions/48645846/pythons-xgoost-valueerrorfeature-names-may-not-contain-or
    X_train.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_train.columns.values]
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    xgb_base = XGBClassifier()
    xgb_grid = GridSearchCV(xgb_base, parameters, n_jobs=-1, scoring='f1_weighted')

    xgb_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(xgb_grid.best_score_,4)}\t {xgb_grid.best_params_}")
    best_values_xgb_bpi_2019[dp] = xgb_grid.best_params_

xgb_param = {param: round(sum(val[param]*len(dp_dataset_map_bpi_2019[key]) for key,val in best_values_xgb_bpi_2019.items())/total_size_bpi_2019) for param in parameters.keys()}
print(f"final value: {xgb_param}")

best_values_xgb["BPI Challenge 2019"] = best_values_xgb_bpi_2019

p_4:	0.9948	 {'max_depth': 1, 'n_estimators': 20}
p_8:	0.8415	 {'max_depth': 3, 'n_estimators': 150}
p_11:	1.0	 {'max_depth': 1, 'n_estimators': 20}
p_3:	0.965	 {'max_depth': 6, 'n_estimators': 20}
final value: {'max_depth': 3, 'n_estimators': 52}


##### Cross-Validation Random Forest:

In [72]:
from sklearn.ensemble import RandomForestClassifier
parameters = {'max_depth': (1, 2, 3, 4, 6), 'min_impurity_decrease': (0, 0.01, 0.05, 0.1, 0.15)}

best_values_rf_bpi_2019 = {}

for dp in dps_bpi_2019:
    dp_dataset = dp_dataset_map_bpi_2019[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    rf_base = RandomForestClassifier()
    rf_grid = GridSearchCV(rf_base, parameters, n_jobs=-1, scoring='f1_weighted')

    rf_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(rf_grid.best_score_,4)}\t {rf_grid.best_params_}")
    best_values_rf_bpi_2019[dp] = rf_grid.best_params_

rf_param = {param: sum(val[param]*len(dp_dataset_map_bpi_2019[key]) for key,val in best_values_rf_bpi_2019.items())/total_size_bpi_2019 for param in parameters.keys()}
rf_param['max_depth'] = round(rf_param['max_depth'])
print(f"final value: {rf_param}")

best_values_rf["BPI Challenge 2019"] = best_values_rf_bpi_2019

p_4:	0.9998	 {'max_depth': 6, 'min_impurity_decrease': 0}
p_8:	0.8101	 {'max_depth': 6, 'min_impurity_decrease': 0}
p_11:	1.0	 {'max_depth': 6, 'min_impurity_decrease': 0}
p_3:	0.9494	 {'max_depth': 6, 'min_impurity_decrease': 0}
final value: {'max_depth': 6, 'min_impurity_decrease': 0.0}


------------------------------------------------------------------------------

### BPI Challenge 2017

#### Data Preparation

In [73]:
import pm4py
# source log: https://data.4tu.nl/articles/dataset/BPI_Challenge_2017/12696884/1
log_all_bpi_2017 = pm4py.read_xes('BPI Challenge 2017.xes.gz')

parsing log, completed traces ::   0%|          | 0/31509 [00:00<?, ?it/s]

In [74]:
print(f'activities: {list(pm4py.get_event_attribute_values(log_all_bpi_2017,"concept:name").keys())}')

activities: ['A_Create Application', 'A_Submitted', 'W_Handle leads', 'W_Complete application', 'A_Concept', 'A_Accepted', 'O_Create Offer', 'O_Created', 'O_Sent (mail and online)', 'W_Call after offers', 'A_Complete', 'W_Validate application', 'A_Validating', 'O_Returned', 'W_Call incomplete files', 'A_Incomplete', 'O_Accepted', 'A_Pending', 'A_Denied', 'O_Refused', 'O_Cancelled', 'A_Cancelled', 'O_Sent (online only)', 'W_Assess potential fraud', 'W_Personal Loan collection', 'W_Shortened completion ']


In [75]:
# We only look at subtraces of activities starting with an 'O'.
# Semantically, this means we look at the events corresponding to
# the offer of a trace.
log_o_bpi_2017 = pm4py.filter_event_attribute_values(
    log_all_bpi_2017,
    "concept:name",
    ['O_Create Offer', 'O_Created', 'O_Sent (mail and online)', 'O_Returned',
     'O_Accepted', 'O_Cancelled', 'O_Refused', 'O_Sent (online only)'],
    level="event",
    retain=True)

print("Number of different trace variants - subtraces: ", len(pm4py.get_variants_as_tuples(log_o_bpi_2017)))

filtered_log_o_bpi_2017 = pm4py.filter_variants(log_o_bpi_2017, [
    ('O_Create Offer', 'O_Created',
     'O_Sent (mail and online)', 'O_Returned', 'O_Accepted'),
    ('O_Create Offer', 'O_Created', 'O_Sent (mail and online)', 'O_Refused'),
    ('O_Create Offer', 'O_Created',
     'O_Sent (mail and online)', 'O_Returned', 'O_Cancelled'),
    ('O_Create Offer', 'O_Created', 'O_Sent (mail and online)', 'O_Cancelled'),
    ('O_Create Offer', 'O_Created',
     'O_Sent (mail and online)', 'O_Returned', 'O_Refused'),
    ('O_Create Offer', 'O_Created', 'O_Sent (online only)', 'O_Returned', 'O_Accepted'),
    ('O_Create Offer', 'O_Created', 'O_Sent (online only)', 'O_Refused'),
    ('O_Create Offer', 'O_Created',
     'O_Sent (online only)', 'O_Returned', 'O_Cancelled'),
    ('O_Create Offer', 'O_Created', 'O_Sent (online only)', 'O_Cancelled'),
    ('O_Create Offer', 'O_Created', 'O_Sent (online only)', 'O_Returned', 'O_Refused')])

print("Number of different trace variants - filtered subtraces: ", len(pm4py.get_variants_as_tuples(filtered_log_o_bpi_2017)))

pm4py.write_xes(filtered_log_o_bpi_2017, "BPIChallenge2017_filtered.xes")

Number of different trace variants - subtraces:  877
Number of different trace variants - filtered subtraces:  10


exporting log, completed traces ::   0%|          | 0/22771 [00:04<?, ?it/s]

#### Analysis

In [76]:
import pm4py
log_bpi_2017 = pm4py.read_xes('BPIChallenge2017_filtered.xes')

parsing log, completed traces ::   0%|          | 0/22771 [00:00<?, ?it/s]

In [77]:
from exdpn.petri_net import get_petri_net
net_bpi_2017, im_bpi_2017, fm_bpi_2017 = get_petri_net(log_bpi_2017, miner_type='IM')

In [78]:
trace_attrs_bpi_2017 = list(pm4py.get_trace_attributes(log_bpi_2017))
event_attrs_bpi_2017 = list(pm4py.get_event_attributes(log_bpi_2017))
event_attrs_bpi_2017 = [attr for attr in event_attrs_bpi_2017 if max(list(pm4py.get_event_attribute_values(log_bpi_2017, attr).values())) != 1 and "ID" not in attr]
event_attrs_bpi_2017.remove("time:timestamp")
event_attrs_bpi_2017.remove("org:resource") 

#### Hyperparameter Selection

In [79]:
from sklearn.model_selection import GridSearchCV
import numpy as np 
from exdpn.petri_net import get_petri_net
from exdpn.guard_datasets import extract_all_datasets
from exdpn.data_preprocessing import basic_data_preprocessing
from exdpn.data_preprocessing.data_preprocessing import apply_ohe, apply_scaling, fit_scaling, fit_ohe

# prepare data
dp_dataset_map_bpi_2017 = extract_all_datasets(log= log_bpi_2017, net=net_bpi_2017, initial_marking=im_bpi_2017, final_marking=fm_bpi_2017, 
                                      event_level_attributes = event_attrs_bpi_2017,
                                      case_level_attributes=trace_attrs_bpi_2017)

# decision points 
dps_bpi_2017 = list(dp_dataset_map_bpi_2017.keys())
total_size_bpi_2017 = sum(len(dp_dataset_map_bpi_2017[key]) for key in dps_bpi_2017)

##### Cross-Validation Decision Tree:

We have observed that decision trees tend to be enormously large without any hyperparameters. We thus try to find an optimal `min_impurity_decrease` value.

In [80]:
from sklearn.tree import DecisionTreeClassifier
parameters = {'min_impurity_decrease':(0, 0.01, 0.05, 0.1, 0.15)}

best_values_dt_bpi_2017 = {}

for dp in dps_bpi_2017:
    dp_dataset = dp_dataset_map_bpi_2017[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    dt_base = DecisionTreeClassifier()
    dt_grid = GridSearchCV(dt_base, parameters, n_jobs=-1, scoring='f1_weighted')

    dt_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(dt_grid.best_score_,4)}\t {dt_grid.best_params_}")
    best_values_dt_bpi_2017[dp] = dt_grid.best_params_

dt_param = {param: sum(val[param]*len(dp_dataset_map_bpi_2017[key]) for key,val in best_values_dt_bpi_2017.items())/total_size_bpi_2017 for param in parameters.keys()}
print(f"final value: {dt_param}")

best_values_dt["BPI Challenge 2017"] = best_values_dt_bpi_2017

A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.


p_4:	0.9819	 {'min_impurity_decrease': 0.01}
p_5:	0.5702	 {'min_impurity_decrease': 0}
p_6:	0.7946	 {'min_impurity_decrease': 0.01}
final value: {'min_impurity_decrease': 0.006666666666666667}


##### Cross-Validation SVM:

In [81]:
from sklearn.svm import LinearSVC
parameters = {'C':(0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values_svm_bpi_2017 = {}

for dp in dps_bpi_2017:
    dp_dataset = dp_dataset_map_bpi_2017[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    svm_base = LinearSVC()
    svm_grid = GridSearchCV(svm_base, parameters, n_jobs=-1, scoring='f1_weighted')

    svm_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(svm_grid.best_score_,4)}\t {svm_grid.best_params_}")
    best_values_svm_bpi_2017[dp] = svm_grid.best_params_

svm_param = {param: sum(val[param]*len(dp_dataset_map_bpi_2017[key]) for key,val in best_values_svm_bpi_2017.items())/total_size_bpi_2017 for param in parameters.keys()}
print(f"final value: {svm_param}")

best_values_svm["BPI Challenge 2017"] = best_values_svm_bpi_2017

p_4:	0.9819	 {'C': 0.1, 'tol': 0.001}


Liblinear failed to converge, increase the number of iterations.


p_5:	0.5502	 {'C': 0.25, 'tol': 0.001}
p_6:	0.7946	 {'C': 0.1, 'tol': 0.001}
final value: {'C': 0.15000000000000002, 'tol': 0.001}


##### Cross-Validation Neural Network:

In [82]:
from sklearn.neural_network import MLPClassifier
parameters = {'hidden_layer_sizes': ((5, ), (5, 5), (10, 10))}

best_values_nn_bpi_2017 = {}
max_ds_size_bpi_2017 = -1
max_ds_key_bpi_2017 = None

for dp in dps_bpi_2017:
    dp_dataset = dp_dataset_map_bpi_2017[dp]
    if len(dp_dataset) > max_ds_size_bpi_2017: max_ds_key_bpi_2017 = dp
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    nn_base = MLPClassifier()
    nn_grid = GridSearchCV(nn_base, parameters, n_jobs=-1, scoring='f1_weighted')

    nn_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(nn_grid.best_score_,4)}\t {nn_grid.best_params_}")
    best_values_nn_bpi_2017[dp] = nn_grid.best_params_

nn_param = {param: best_values_nn_bpi_2017[max_ds_key_bpi_2017][param] for param in parameters.keys()}
print(f"final value: {nn_param}")

best_values_nn["BPI Challenge 2017"] = best_values_nn_bpi_2017

p_4:	0.9819	 {'hidden_layer_sizes': (5,)}
p_5:	0.551	 {'hidden_layer_sizes': (5,)}
p_6:	0.7947	 {'hidden_layer_sizes': (5, 5)}
final value: {'hidden_layer_sizes': (5, 5)}


##### Cross-Validation Logistic Regression:

In [83]:
from sklearn.linear_model import LogisticRegression
parameters = {'C': (0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

best_values_lr_bpi_2017 = {}

for dp in dps_bpi_2017:
    dp_dataset = dp_dataset_map_bpi_2017[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    lr_base = LogisticRegression()
    lr_grid = GridSearchCV(lr_base, parameters, n_jobs=-1, scoring='f1_weighted')

    lr_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(lr_grid.best_score_,4)}\t {lr_grid.best_params_}")
    best_values_lr_bpi_2017[dp] = lr_grid.best_params_

lr_param = {param: sum(val[param]*len(dp_dataset_map_bpi_2017[key]) for key,val in best_values_lr_bpi_2017.items())/total_size_bpi_2017 for param in parameters.keys()}
print(f"final value: {lr_param}")

best_values_lr["BPI Challenge 2017"] = best_values_lr_bpi_2017

p_4:	0.9819	 {'C': 0.1, 'tol': 0.001}


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


p_5:	0.5501	 {'C': 0.1, 'tol': 0.001}
p_6:	0.7946	 {'C': 0.1, 'tol': 0.001}
final value: {'C': 0.09999999999999999, 'tol': 0.001}


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


##### Cross-Validation XGBoost:

In [84]:
from xgboost import XGBClassifier
parameters = {'max_depth': (1, 2, 3, 4, 6), 'n_estimators': (20,100,150)}

best_values_xgb_bpi_2017 = {}

for dp in dps_bpi_2017:
    dp_dataset = dp_dataset_map_bpi_2017[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    xgb_base = XGBClassifier()
    xgb_grid = GridSearchCV(xgb_base, parameters, n_jobs=-1, scoring='f1_weighted')

    xgb_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(xgb_grid.best_score_,4)}\t {xgb_grid.best_params_}")
    best_values_xgb_bpi_2017[dp] = xgb_grid.best_params_

xgb_param = {param: round(sum(val[param]*len(dp_dataset_map_bpi_2017[key]) for key,val in best_values_xgb_bpi_2017.items())/total_size_bpi_2017) for param in parameters.keys()}
print(f"final value: {xgb_param}")

best_values_xgb["BPI Challenge 2017"] = best_values_xgb_bpi_2017

p_4:	0.9819	 {'max_depth': 1, 'n_estimators': 20}
p_5:	0.5623	 {'max_depth': 6, 'n_estimators': 150}
p_6:	0.7946	 {'max_depth': 6, 'n_estimators': 150}
final value: {'max_depth': 4, 'n_estimators': 107}


##### Cross-Validation Random Forest:

In [85]:
from sklearn.ensemble import RandomForestClassifier
parameters = {'max_depth': (1, 2, 3, 4, 6), 'min_impurity_decrease': (0, 0.01, 0.05, 0.1, 0.15)}

best_values_rf_bpi_2017 = {}

for dp in dps_bpi_2017:
    dp_dataset = dp_dataset_map_bpi_2017[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    rf_base = RandomForestClassifier()
    rf_grid = GridSearchCV(rf_base, parameters, n_jobs=-1, scoring='f1_weighted')

    rf_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(rf_grid.best_score_,4)}\t {rf_grid.best_params_}")
    best_values_rf_bpi_2017[dp] = rf_grid.best_params_

rf_param = {param: sum(val[param]*len(dp_dataset_map_bpi_2017[key]) for key,val in best_values_rf_bpi_2017.items())/total_size_bpi_2017 for param in parameters.keys()}
rf_param['max_depth'] = round(rf_param['max_depth'])
print(f"final value: {rf_param}")

best_values_rf["BPI Challenge 2017"] = best_values_rf_bpi_2017

p_4:	0.9819	 {'max_depth': 1, 'min_impurity_decrease': 0}
p_5:	0.5504	 {'max_depth': 6, 'min_impurity_decrease': 0}
p_6:	0.7946	 {'max_depth': 1, 'min_impurity_decrease': 0}
final value: {'max_depth': 3, 'min_impurity_decrease': 0.0}


----------------------------------------------------------------------------------

### Summary

#### Decision Tree:

In [86]:
best_values_dt

{'Road Traffic Fine Management Process': {p_5: {'min_impurity_decrease': 0},
  p_3: {'min_impurity_decrease': 0.01},
  p_12: {'min_impurity_decrease': 0.1},
  p_14: {'min_impurity_decrease': 0},
  p_15: {'min_impurity_decrease': 0},
  p_4: {'min_impurity_decrease': 0.01},
  p_6: {'min_impurity_decrease': 0.01},
  p_7: {'min_impurity_decrease': 0},
  p_9: {'min_impurity_decrease': 0.01}},
 'BPI Challenge 2012': {p_5: {'min_impurity_decrease': 0.01},
  p_6: {'min_impurity_decrease': 0},
  p_4: {'min_impurity_decrease': 0},
  p_7: {'min_impurity_decrease': 0}},
 'BPI Challenge 2019': {p_4: {'min_impurity_decrease': 0},
  p_8: {'min_impurity_decrease': 0.01},
  p_11: {'min_impurity_decrease': 0},
  p_3: {'min_impurity_decrease': 0}},
 'BPI Challenge 2017': {p_4: {'min_impurity_decrease': 0.01},
  p_5: {'min_impurity_decrease': 0},
  p_6: {'min_impurity_decrease': 0.01}}}

#### Logistic Regression:

In [87]:
best_values_lr

{'Road Traffic Fine Management Process': {p_5: {'C': 0.1, 'tol': 0.001},
  p_3: {'C': 0.5, 'tol': 0.001},
  p_12: {'C': 0.1, 'tol': 0.001},
  p_14: {'C': 0.1, 'tol': 0.001},
  p_15: {'C': 0.1, 'tol': 0.001},
  p_4: {'C': 0.1, 'tol': 0.001},
  p_6: {'C': 0.1, 'tol': 0.001},
  p_7: {'C': 0.25, 'tol': 0.001},
  p_9: {'C': 0.1, 'tol': 0.001}},
 'BPI Challenge 2012': {p_5: {'C': 0.1, 'tol': 0.001},
  p_6: {'C': 0.1, 'tol': 0.001},
  p_4: {'C': 0.1, 'tol': 0.001},
  p_7: {'C': 0.5, 'tol': 0.001}},
 'BPI Challenge 2019': {p_4: {'C': 0.1, 'tol': 0.001},
  p_8: {'C': 0.1, 'tol': 0.001},
  p_11: {'C': 0.25, 'tol': 0.001},
  p_3: {'C': 0.5, 'tol': 0.001}},
 'BPI Challenge 2017': {p_4: {'C': 0.1, 'tol': 0.001},
  p_5: {'C': 0.1, 'tol': 0.001},
  p_6: {'C': 0.1, 'tol': 0.001}}}

#### SVM:

In [88]:
best_values_svm

{'Road Traffic Fine Management Process': {p_5: {'C': 0.1, 'tol': 0.001},
  p_3: {'C': 0.1, 'tol': 0.001},
  p_12: {'C': 0.1, 'tol': 0.001},
  p_14: {'C': 0.1, 'tol': 0.001},
  p_15: {'C': 0.1, 'tol': 0.001},
  p_4: {'C': 0.1, 'tol': 0.001},
  p_6: {'C': 0.5, 'tol': 0.0005},
  p_7: {'C': 0.1, 'tol': 0.001},
  p_9: {'C': 0.1, 'tol': 0.001}},
 'BPI Challenge 2012': {p_5: {'C': 0.1, 'tol': 0.001},
  p_6: {'C': 0.1, 'tol': 0.001},
  p_4: {'C': 0.1, 'tol': 0.001},
  p_7: {'C': 0.5, 'tol': 0.001}},
 'BPI Challenge 2019': {p_4: {'C': 0.1, 'tol': 0.001},
  p_8: {'C': 0.1, 'tol': 0.001},
  p_11: {'C': 0.1, 'tol': 0.001},
  p_3: {'C': 0.1, 'tol': 0.001}},
 'BPI Challenge 2017': {p_4: {'C': 0.1, 'tol': 0.001},
  p_5: {'C': 0.25, 'tol': 0.001},
  p_6: {'C': 0.1, 'tol': 0.001}}}

#### Neural Network:

In [89]:
best_values_nn

{'Road Traffic Fine Management Process': {p_5: {'hidden_layer_sizes': (5, 5)},
  p_3: {'hidden_layer_sizes': (10, 10)},
  p_12: {'hidden_layer_sizes': (5,)},
  p_14: {'hidden_layer_sizes': (5,)},
  p_15: {'hidden_layer_sizes': (5, 5)},
  p_4: {'hidden_layer_sizes': (5,)},
  p_6: {'hidden_layer_sizes': (10, 10)},
  p_7: {'hidden_layer_sizes': (10, 10)},
  p_9: {'hidden_layer_sizes': (10, 10)}},
 'BPI Challenge 2012': {p_5: {'hidden_layer_sizes': (5, 5)},
  p_6: {'hidden_layer_sizes': (5,)},
  p_4: {'hidden_layer_sizes': (5,)},
  p_7: {'hidden_layer_sizes': (10, 10)}},
 'BPI Challenge 2019': {p_4: {'hidden_layer_sizes': (10, 10)},
  p_8: {'hidden_layer_sizes': (5, 5)},
  p_11: {'hidden_layer_sizes': (5,)},
  p_3: {'hidden_layer_sizes': (5,)}},
 'BPI Challenge 2017': {p_4: {'hidden_layer_sizes': (5,)},
  p_5: {'hidden_layer_sizes': (5,)},
  p_6: {'hidden_layer_sizes': (5, 5)}}}

#### Random Forest:

In [90]:
best_values_rf

{'Road Traffic Fine Management Process': {p_5: {'max_depth': 4,
   'min_impurity_decrease': 0},
  p_3: {'max_depth': 3, 'min_impurity_decrease': 0.01},
  p_12: {'max_depth': 1, 'min_impurity_decrease': 0.1},
  p_14: {'max_depth': 1, 'min_impurity_decrease': 0},
  p_15: {'max_depth': 1, 'min_impurity_decrease': 0},
  p_4: {'max_depth': 1, 'min_impurity_decrease': 0},
  p_6: {'max_depth': 6, 'min_impurity_decrease': 0},
  p_7: {'max_depth': 2, 'min_impurity_decrease': 0},
  p_9: {'max_depth': 1, 'min_impurity_decrease': 0}},
 'BPI Challenge 2012': {p_5: {'max_depth': 1, 'min_impurity_decrease': 0},
  p_6: {'max_depth': 1, 'min_impurity_decrease': 0},
  p_4: {'max_depth': 1, 'min_impurity_decrease': 0},
  p_7: {'max_depth': 6, 'min_impurity_decrease': 0}},
 'BPI Challenge 2019': {p_4: {'max_depth': 6, 'min_impurity_decrease': 0},
  p_8: {'max_depth': 6, 'min_impurity_decrease': 0},
  p_11: {'max_depth': 6, 'min_impurity_decrease': 0},
  p_3: {'max_depth': 6, 'min_impurity_decrease': 0}},


#### XGBoost:

In [91]:
best_values_xgb

{'Road Traffic Fine Management Process': {p_5: {'max_depth': 2,
   'n_estimators': 100},
  p_3: {'max_depth': 1, 'n_estimators': 20},
  p_12: {'max_depth': 1, 'n_estimators': 20},
  p_14: {'max_depth': 1, 'n_estimators': 20},
  p_15: {'max_depth': 1, 'n_estimators': 20},
  p_4: {'max_depth': 1, 'n_estimators': 20},
  p_6: {'max_depth': 1, 'n_estimators': 100},
  p_7: {'max_depth': 1, 'n_estimators': 100},
  p_9: {'max_depth': 1, 'n_estimators': 20}},
 'BPI Challenge 2012': {p_5: {'max_depth': 1, 'n_estimators': 20},
  p_6: {'max_depth': 1, 'n_estimators': 20},
  p_4: {'max_depth': 1, 'n_estimators': 20},
  p_7: {'max_depth': 6, 'n_estimators': 100}},
 'BPI Challenge 2019': {p_4: {'max_depth': 1, 'n_estimators': 20},
  p_8: {'max_depth': 3, 'n_estimators': 150},
  p_11: {'max_depth': 1, 'n_estimators': 20},
  p_3: {'max_depth': 6, 'n_estimators': 20}},
 'BPI Challenge 2017': {p_4: {'max_depth': 1, 'n_estimators': 20},
  p_5: {'max_depth': 6, 'n_estimators': 150},
  p_6: {'max_depth': 6,