## Hyperparameter Analysis for Event Logs

In [None]:
# Set up one result store per ML technique. The cross-validation cells below
# fill each dict with an entry per event log.
best_values_dt, best_values_lr, best_values_svm = {}, {}, {}
best_values_nn, best_values_rf, best_values_xgb = {}, {}, {}

### Road Traffic Fine Management Process

#### Data Preparation 

In [1]:
import pm4py
# Parse the complete Road Traffic Fine Management event log.
road_traffic_xes_path = 'Road_Traffic_Fine_Management_Process.xes.gz'
log_all = pm4py.read_xes(road_traffic_xes_path)

  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 150370/150370 [01:19<00:00, 1884.07it/s]


In [113]:
# Collect every event timestamp once, then report the period the log covers.
event_timestamps = [event["time:timestamp"] for trace in log_all for event in trace]
last_timestamp = max(event_timestamps)
print("Last Timestamp:", last_timestamp)
first_timestamp = min(event_timestamps)
print("First Timestamp:", first_timestamp)

# Restrict the log to the chosen time window using pm4py's
# 'traces_contained' mode.
time_filtered_log = pm4py.filter_time_range(
    log_all,
    "2012-01-01 00:00:00",
    "2013-06-18 00:00:00",
    mode='traces_contained',
)

Last Timestamp: 2013-06-18 00:00:00+02:00
First Timestamp: 2000-01-01 00:00:00+01:00


In [114]:
# Report the activity names and the number of trace variants that remain
# after the time filter.
activity_values = pm4py.get_event_attribute_values(time_filtered_log, "concept:name")
print(f'activities: {list(activity_values.keys())}')
variants = pm4py.get_variants_as_tuples(time_filtered_log)
print("Number of different trace variants: ", len(variants))

activities: ['Create Fine', 'Send Fine', 'Insert Fine Notification', 'Add penalty', 'Send for Credit Collection', 'Payment', 'Insert Date Appeal to Prefecture', 'Send Appeal to Prefecture', 'Receive Result Appeal from Prefecture', 'Appeal to Judge', 'Notify Result Appeal to Offender']
Number of different trace variants:  40


In [115]:
# Start activities of the time-filtered log, printed before the end
# activities are computed.
log_start = pm4py.get_start_activities(time_filtered_log)
print(log_start)

# End activities of the same log.
end_activities = pm4py.get_end_activities(time_filtered_log)
print(end_activities)

{'Create Fine': 5558}
{'Send Fine': 1351, 'Send for Credit Collection': 169, 'Payment': 3719, 'Receive Result Appeal from Prefecture': 10, 'Send Appeal to Prefecture': 283, 'Notify Result Appeal to Offender': 26}


In [144]:
# Only the 10 most frequent trace variants are kept; rarer variants are
# discarded to reduce the number of edge cases.
top_k_variants = 10
log = pm4py.filter_variants_top_k(time_filtered_log, top_k_variants)

# Persist the filtered log so the analysis section can reload it.
pm4py.write_xes(log, "Road_Traffic_Fine_Management_Process_filtered.xes")

# Cell output: number of traces that remain.
len(log)

exporting log, completed traces :: 100%|██████████| 5485/5485 [00:01<00:00, 3624.22it/s]


5485

#### Analysis

In [18]:
import pm4py
# Reload the filtered Road Traffic log written by the data-preparation section.
filtered_xes_path = "Road_Traffic_Fine_Management_Process_filtered.xes"
log = pm4py.read_xes(filtered_xes_path)

parsing log, completed traces :: 100%|██████████| 5485/5485 [00:06<00:00, 885.80it/s] 


In [19]:
from exdpn.petri_net import get_petri_net
# Mine a Petri net (miner_type='IM') from the filtered log and unpack the
# net together with its initial and final marking.
mined_model = get_petri_net(log, miner_type='IM')
net, im, fm = mined_model

In [21]:
# Event-level attributes: keep an attribute only if the largest entry of its
# value statistics is not 1 and its name does not contain "ID".
event_attrs = []
for attr in list(pm4py.get_event_attributes(log)):
    value_stats = pm4py.get_event_attribute_values(log, attr)
    if max(value_stats.values()) != 1 and "ID" not in attr:
        event_attrs.append(attr)

# Timestamp and resource are excluded explicitly (raises ValueError if absent).
for excluded_attr in ("time:timestamp", "org:resource"):
    event_attrs.remove(excluded_attr)

# Attributes handed to the preprocessing step as numeric.
numeric_attributes = ["amount", "expense", "totalPaymentAmount", "points"]

# Trace-level attributes are filtered with the same rule as the event attributes.
trace_attrs = []
for attr in list(pm4py.get_trace_attributes(log)):
    value_stats = pm4py.get_trace_attribute_values(log, attr)
    if max(value_stats.values()) != 1 and "ID" not in attr:
        trace_attrs.append(attr)

#### Hyperparameter Selection

In [30]:
from sklearn.model_selection import GridSearchCV
import numpy as np 
from exdpn.petri_net import get_petri_net
from exdpn.guard_datasets import extract_all_datasets
from exdpn.data_preprocessing import basic_data_preprocessing
from exdpn.data_preprocessing.data_preprocessing import apply_ohe, apply_scaling, fit_scaling, fit_ohe
from exdpn.guards import ML_Technique

# Prepare data: extract one dataset per decision point of the mined net.
dp_dataset_map = extract_all_datasets(
    log=log,
    net=net,
    initial_marking=im,
    final_marking=fm,
    event_level_attributes=event_attrs,
    case_level_attributes=trace_attrs,
)

# Decision points, in the order the cross-validation cells iterate over them.
dps = list(dp_dataset_map)

##### Cross-Validation Decision Tree (with respect to explainability):

We have observed that decision trees tend to grow enormously large when no hyperparameters constrain them. We therefore search for an optimal `min_impurity_decrease` value.

In [31]:
from sklearn.tree import DecisionTreeClassifier
# Candidate values for `min_impurity_decrease` explored by the grid search.
parameters = {'min_impurity_decrease': (0, 0.01, 0.05, 0.1, 0.15)}

# Best `min_impurity_decrease` found for each decision point.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(
        dp_dataset,
        impute=True,
        numeric_attributes=numeric_attributes,
    )
    # Fit the encoder on this decision point's features and apply it.
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index
                          for index, transition in enumerate(set(y_train))}
    y_train_mapped = list(map(transition_int_map.__getitem__, y_train))

    # Grid search over `parameters`, scored by weighted F1.
    dt_base = DecisionTreeClassifier()
    dt_grid = GridSearchCV(estimator=dt_base, param_grid=parameters,
                           scoring='f1_weighted', n_jobs=-1)
    dt_grid.fit(X_train, y_train_mapped)

    print(f"{dp}:\t{round(dt_grid.best_score_,4)}\t {dt_grid.best_params_}")
    best_values[dp] = dt_grid.best_params_['min_impurity_decrease']

# Final value: mean of the per-decision-point optima, weighted by dataset size.
weighted_total = sum(val * len(dp_dataset_map[key]) for key, val in best_values.items())
n_samples = sum(len(dp_dataset_map[key]) for key in best_values)
min_impurity_decrease = weighted_total / n_samples
print(f"final value: {min_impurity_decrease}")

best_values_dt["Road Traffic Fine Management Process"] = best_values

p_3:	0.5834	 {'min_impurity_decrease': 0.01}
p_4:	0.9972	 {'min_impurity_decrease': 0.01}
p_7:	1.0	 {'min_impurity_decrease': 0}
p_9:	0.9563	 {'min_impurity_decrease': 0.01}
p_12:	0.9322	 {'min_impurity_decrease': 0.1}
p_14:	0.9486	 {'min_impurity_decrease': 0}
p_15:	1.0	 {'min_impurity_decrease': 0}
p_5:	0.8089	 {'min_impurity_decrease': 0}
p_6:	0.9439	 {'min_impurity_decrease': 0}
final value: 0.020129932243921882


##### Cross-Validation Neural Network

In [32]:
from sklearn.neural_network import MLPClassifier
# Candidate network architectures explored by the grid search.
parameters = {'hidden_layer_sizes': ((5, ), (5, 5), (10, 10), (5, 10, 5), (10, 20, 10))}

# Best `hidden_layer_sizes` found for each decision point.
best_values = {}
# Size and key of the largest decision-point dataset seen so far; the best
# architecture of that decision point becomes the final value.
max_ds_size = -1
max_ds_key = None

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    # Update size and key together. Previously only the key was updated, so
    # every dataset compared against -1 and the last decision point always won.
    if len(dp_dataset) > max_ds_size:
        max_ds_size = len(dp_dataset)
        max_ds_key = dp
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute = True, numeric_attributes = numeric_attributes)
    # Scale first, then encode, both fitted on this decision point's data.
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    # Grid search over `parameters`, scored by weighted F1.
    nn_base = MLPClassifier()
    nn_grid = GridSearchCV(nn_base, parameters, n_jobs=-1, scoring='f1_weighted')

    nn_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(nn_grid.best_score_,4)}\t {nn_grid.best_params_}")
    best_values[dp] = nn_grid.best_params_['hidden_layer_sizes']

# Architectures cannot be averaged, so the optimum of the largest dataset is used.
hidden_layer_sizes = best_values[max_ds_key]
print(f"final value: {hidden_layer_sizes}")

best_values_nn["Road Traffic Fine Management Process"] = best_values



Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


p_3:	0.6115	 {'hidden_layer_sizes': (5, 10, 5)}
p_4:	0.9972	 {'hidden_layer_sizes': (10, 20, 10)}
p_7:	0.9951	 {'hidden_layer_sizes': (10, 10)}
p_9:	0.9563	 {'hidden_layer_sizes': (5,)}
p_12:	0.8951	 {'hidden_layer_sizes': (5,)}


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


p_14:	0.9486	 {'hidden_layer_sizes': (5,)}
p_15:	1.0	 {'hidden_layer_sizes': (10, 10)}


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


p_5:	0.8077	 {'hidden_layer_sizes': (5, 5)}
p_6:	0.9483	 {'hidden_layer_sizes': (10, 20, 10)}
final value: (10, 20, 10)


##### Cross-Validation Logistic Regression:

In [33]:
from sklearn.linear_model import LogisticRegression
# Candidate values for the parameter `C` of logistic regression.
parameters = {'C': (0.1, 0.25, 0.5)}

# Best `C` found for each decision point.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(
        dp_dataset,
        impute=True,
        numeric_attributes=numeric_attributes,
    )
    # Scale first, then encode, both fitted on this decision point's data.
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index
                          for index, transition in enumerate(set(y_train))}
    y_train_mapped = list(map(transition_int_map.__getitem__, y_train))

    # Grid search over `parameters`, scored by weighted F1.
    lr_base = LogisticRegression()
    lr_grid = GridSearchCV(estimator=lr_base, param_grid=parameters,
                           scoring='f1_weighted', n_jobs=-1)
    lr_grid.fit(X_train, y_train_mapped)

    print(f"{dp}:\t{round(lr_grid.best_score_,4)}\t {lr_grid.best_params_}")
    best_values[dp] = lr_grid.best_params_['C']

# Final value: mean of the per-decision-point optima, weighted by dataset size.
weighted_total = sum(val * len(dp_dataset_map[key]) for key, val in best_values.items())
n_samples = sum(len(dp_dataset_map[key]) for key in best_values)
C_lr = weighted_total / n_samples
print(f"final value: {C_lr}")

best_values_lr["Road Traffic Fine Management Process"] = best_values

p_3:	0.5448	 {'C': 0.5}
p_4:	0.9972	 {'C': 0.1}
p_7:	1.0	 {'C': 0.25}
p_9:	0.9563	 {'C': 0.1}
p_12:	0.8904	 {'C': 0.1}
p_14:	0.9486	 {'C': 0.1}
p_15:	1.0	 {'C': 0.1}
p_5:	0.4969	 {'C': 0.1}
p_6:	0.9448	 {'C': 0.1}
final value: 0.19298126743722596


##### Cross-Validation SVM:

In [34]:
from sklearn.svm import LinearSVC
# Candidate values for the parameter `C` of the linear SVM.
parameters = {'C': (0.1, 0.25, 0.5)}

# Best `C` found for each decision point.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(
        dp_dataset,
        impute=True,
        numeric_attributes=numeric_attributes,
    )
    # Scale first, then encode, both fitted on this decision point's data.
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index
                          for index, transition in enumerate(set(y_train))}
    y_train_mapped = list(map(transition_int_map.__getitem__, y_train))

    # Grid search over `parameters`, scored by weighted F1.
    svm_base = LinearSVC()
    svm_grid = GridSearchCV(estimator=svm_base, param_grid=parameters,
                            scoring='f1_weighted', n_jobs=-1)
    svm_grid.fit(X_train, y_train_mapped)

    print(f"{dp}:\t{round(svm_grid.best_score_,4)}\t {svm_grid.best_params_}")
    best_values[dp] = svm_grid.best_params_['C']

# Final value: mean of the per-decision-point optima, weighted by dataset size.
weighted_total = sum(val * len(dp_dataset_map[key]) for key, val in best_values.items())
n_samples = sum(len(dp_dataset_map[key]) for key in best_values)
C_svm = weighted_total / n_samples
print(f"final value: {C_svm}")

best_values_svm["Road Traffic Fine Management Process"] = best_values

p_3:	0.5448	 {'C': 0.1}
p_4:	0.9972	 {'C': 0.1}
p_7:	1.0	 {'C': 0.1}
p_9:	0.9563	 {'C': 0.1}
p_12:	0.8904	 {'C': 0.1}
p_14:	0.9486	 {'C': 0.1}
p_15:	1.0	 {'C': 0.1}
p_5:	0.4969	 {'C': 0.1}
p_6:	0.9444	 {'C': 0.1}
final value: 0.1


##### Cross-Validation XGBoost:

In [35]:
from xgboost import XGBClassifier
# Candidate tree depths explored by the grid search.
parameters = {'max_depth': (1, 2, 3, 5, 10)}

# Best `max_depth` found for each decision point.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(
        dp_dataset,
        impute=True,
        numeric_attributes=numeric_attributes,
    )
    # Fit the encoder on this decision point's features and apply it.
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index
                          for index, transition in enumerate(set(y_train))}
    y_train_mapped = list(map(transition_int_map.__getitem__, y_train))

    # Grid search over `parameters`, scored by weighted F1.
    xgb_base = XGBClassifier()
    xgb_grid = GridSearchCV(estimator=xgb_base, param_grid=parameters,
                            scoring='f1_weighted', n_jobs=-1)
    xgb_grid.fit(X_train, y_train_mapped)

    print(f"{dp}:\t{round(xgb_grid.best_score_,4)}\t {xgb_grid.best_params_}")
    best_values[dp] = xgb_grid.best_params_['max_depth']

# Final value: size-weighted mean of the per-decision-point optima, rounded
# to the nearest integer.
weighted_total = sum(val * len(dp_dataset_map[key]) for key, val in best_values.items())
n_samples = sum(len(dp_dataset_map[key]) for key in best_values)
max_depth_xgb = round(weighted_total / n_samples)
print(f"final value: {max_depth_xgb}")

best_values_xgb["Road Traffic Fine Management Process"] = best_values

p_3:	0.5559	 {'max_depth': 1}
p_4:	0.9972	 {'max_depth': 1}
p_7:	1.0	 {'max_depth': 1}
p_9:	0.9563	 {'max_depth': 1}
p_12:	0.8904	 {'max_depth': 1}
p_14:	0.9486	 {'max_depth': 1}
p_15:	1.0	 {'max_depth': 1}
p_5:	0.8081	 {'max_depth': 2}
p_6:	0.9474	 {'max_depth': 1}
final value: 1


##### Cross-Validation Random Forest:

In [36]:
from sklearn.ensemble import RandomForestClassifier
# Candidate forest sizes explored by the grid search.
parameters = {'n_estimators': (10, 20, 50, 100)}

# Best `n_estimators` found for each decision point.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(
        dp_dataset,
        impute=True,
        numeric_attributes=numeric_attributes,
    )
    # Fit the encoder on this decision point's features and apply it.
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index
                          for index, transition in enumerate(set(y_train))}
    y_train_mapped = list(map(transition_int_map.__getitem__, y_train))

    # Grid search over `parameters`, scored by weighted F1.
    rf_base = RandomForestClassifier()
    rf_grid = GridSearchCV(estimator=rf_base, param_grid=parameters,
                           scoring='f1_weighted', n_jobs=-1)
    rf_grid.fit(X_train, y_train_mapped)

    print(f"{dp}:\t{round(rf_grid.best_score_,4)}\t {rf_grid.best_params_}")
    best_values[dp] = rf_grid.best_params_['n_estimators']

# Final value: size-weighted mean of the per-decision-point optima, rounded
# to the nearest integer.
weighted_total = sum(val * len(dp_dataset_map[key]) for key, val in best_values.items())
n_samples = sum(len(dp_dataset_map[key]) for key in best_values)
n_estimators = round(weighted_total / n_samples)
print(f"final value: {n_estimators}")

best_values_rf["Road Traffic Fine Management Process"] = best_values

p_3:	0.5502	 {'n_estimators': 100}
p_4:	0.9949	 {'n_estimators': 10}
p_7:	1.0	 {'n_estimators': 10}
p_9:	0.9552	 {'n_estimators': 100}
p_12:	0.8904	 {'n_estimators': 10}
p_14:	0.9486	 {'n_estimators': 10}
p_15:	1.0	 {'n_estimators': 20}
p_5:	0.8089	 {'n_estimators': 10}
p_6:	0.9468	 {'n_estimators': 10}
final value: 33


--------------------------------------------------------------------------------------------------------------------------------------------------------

### BPI Challenge 2012 

#### Data Preparation

In [None]:
import pm4py
# Parse the complete BPI Challenge 2012 event log.
bpi_2012_xes_path = 'BPI_Challenge_2012.xes'
log_all = pm4py.read_xes(bpi_2012_xes_path)

In [None]:
# List the activity names of the full log in sorted order.
activity_names = pm4py.get_event_attribute_values(log_all, "concept:name").keys()
print(f'activities: {sorted(activity_names)}')

In [None]:
# We only look at subtraces of activities starting with an 'A'.
a_activities = [
    'A_ACCEPTED', 'A_ACTIVATED', 'A_APPROVED', 'A_CANCELLED', 'A_DECLINED',
    'A_FINALIZED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'A_REGISTERED', 'A_SUBMITTED',
]
# Event-level filter that retains the events whose activity is listed above.
log = pm4py.filter_event_attribute_values(
    log_all, "concept:name", a_activities, level="event", retain=True)

In [None]:
# Persist the 'A'-only log so the analysis section can reload it.
a_only_xes_path = "BPI Challenge 2012 only A.xes"
pm4py.write_xes(log, a_only_xes_path)

#### Analysis

In [None]:
import pm4py
# Reload the 'A'-only BPI Challenge 2012 log written during data preparation.
a_only_xes_path = 'BPI Challenge 2012 only A.xes'
log = pm4py.read_xes(a_only_xes_path)

In [None]:
# Trace-level attributes: everything except "REG_DATE"
# (raises ValueError if that attribute is absent).
trace_attrs = list(pm4py.get_trace_attributes(log))
trace_attrs.remove("REG_DATE")

# Event-level attributes: keep an attribute only if the largest entry of its
# value statistics is not 1 and its name does not contain "ID".
event_attrs = []
for attr in list(pm4py.get_event_attributes(log)):
    value_stats = pm4py.get_event_attribute_values(log, attr)
    if max(value_stats.values()) != 1 and "ID" not in attr:
        event_attrs.append(attr)

# Timestamp and activity name are excluded explicitly.
for excluded_attr in ("time:timestamp", "concept:name"):
    event_attrs.remove(excluded_attr)

#### Hyperparameter Selection

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np 
from exdpn.petri_net import get_petri_net
from exdpn.guard_datasets import extract_all_datasets
from exdpn.data_preprocessing import basic_data_preprocessing
from exdpn.data_preprocessing.data_preprocessing import apply_ohe, apply_scaling, fit_scaling, fit_ohe

# Mine the Petri net for the BPI Challenge 2012 log. Without this step the
# `net`, `im` and `fm` left over from the Road Traffic section would be reused
# and the datasets would be extracted against the wrong process model.
net, im, fm = get_petri_net(log, miner_type='IM')

# prepare data: extract one dataset per decision point of the mined net
dp_dataset_map = extract_all_datasets(log= log, net=net, initial_marking=im, final_marking=fm, 
                                      event_level_attributes = event_attrs,
                                      case_level_attributes=trace_attrs)

# decision points 
dps = list(dp_dataset_map.keys())
# total number of samples over all decision points; used as the denominator
# of the size-weighted averages in the cells below
total_size = sum(len(dp_dataset_map[key]) for key in dps)

##### Cross-Validation Decision Tree (with respect to explainability):

We have observed that decision trees tend to grow enormously large when no hyperparameters constrain them. We therefore search for an optimal `min_impurity_decrease` value.

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Candidate values for `min_impurity_decrease` explored by the grid search.
parameters = {'min_impurity_decrease': (0, 0.01, 0.05, 0.1, 0.15)}

# Best parameter dict found for each decision point.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(
        dp_dataset,
        numeric_attributes=["AMOUNT_REQ"],
    )
    # Fit the encoder on this decision point's features and apply it.
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index
                          for index, transition in enumerate(set(y_train))}
    y_train_mapped = list(map(transition_int_map.__getitem__, y_train))

    # Grid search over `parameters`, scored by weighted F1.
    dt_base = DecisionTreeClassifier()
    dt_grid = GridSearchCV(estimator=dt_base, param_grid=parameters,
                           scoring='f1_weighted', n_jobs=-1)
    dt_grid.fit(X_train, y_train_mapped)

    print(f"{dp}:\t{round(dt_grid.best_score_,4)}\t {dt_grid.best_params_}")
    best_values[dp] = dt_grid.best_params_

# Final value per parameter: per-decision-point optima weighted by dataset
# size and divided by `total_size`.
dt_param = {}
for param in parameters:
    weighted_total = sum(val[param] * len(dp_dataset_map[key]) for key, val in best_values.items())
    dt_param[param] = weighted_total / total_size
print(f"final value: {dt_param}")

best_values_dt["BPI Challenge 2012"] = best_values

##### Cross-Validation SVM:

In [None]:
from sklearn.svm import LinearSVC
# Candidate values for `C` and `tol` of the linear SVM.
parameters = {'C': (0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

# Best parameter dict found for each decision point.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(
        dp_dataset,
        numeric_attributes=["AMOUNT_REQ"],
    )
    # Scale first, then encode, both fitted on this decision point's data.
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index
                          for index, transition in enumerate(set(y_train))}
    y_train_mapped = list(map(transition_int_map.__getitem__, y_train))

    # Grid search over `parameters`, scored by weighted F1.
    svm_base = LinearSVC()
    svm_grid = GridSearchCV(estimator=svm_base, param_grid=parameters,
                            scoring='f1_weighted', n_jobs=-1)
    svm_grid.fit(X_train, y_train_mapped)

    print(f"{dp}:\t{round(svm_grid.best_score_,4)}\t {svm_grid.best_params_}")
    best_values[dp] = svm_grid.best_params_

# Final value per parameter: per-decision-point optima weighted by dataset
# size and divided by `total_size`.
svm_param = {}
for param in parameters:
    weighted_total = sum(val[param] * len(dp_dataset_map[key]) for key, val in best_values.items())
    svm_param[param] = weighted_total / total_size
print(f"final value: {svm_param}")

best_values_svm["BPI Challenge 2012"] = best_values

##### Cross-Validation Neural Network:

In [None]:
from sklearn.neural_network import MLPClassifier
# Candidate architectures and learning-rate schedules explored by the grid search.
parameters = {'hidden_layer_sizes': ((5, ), (5, 5), (10, 10)), 'learning_rate': ('constant', 'invscaling', 'adaptive')}

# Best parameter dict found for each decision point.
best_values = {}
# Size and key of the largest decision-point dataset seen so far; the best
# parameters of that decision point become the final value.
max_ds_size = -1
max_ds_key = None

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    # Update size and key together. Previously only the key was updated, so
    # every dataset compared against -1 and the last decision point always won.
    if len(dp_dataset) > max_ds_size:
        max_ds_size = len(dp_dataset)
        max_ds_key = dp
    X_train, y_train = basic_data_preprocessing(dp_dataset, numeric_attributes=["AMOUNT_REQ"])
    # Scale first, then encode, both fitted on this decision point's data.
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    # Grid search over `parameters` with cv=2, scored by weighted F1.
    nn_base = MLPClassifier()
    nn_grid = GridSearchCV(nn_base, parameters, n_jobs=-1, scoring='f1_weighted', cv=2)

    nn_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(nn_grid.best_score_,4)}\t {nn_grid.best_params_}")
    best_values[dp] = nn_grid.best_params_

# These parameters cannot be averaged, so the optimum of the largest dataset is used.
nn_param = {param: best_values[max_ds_key][param] for param in parameters.keys()}
print(f"final value: {nn_param}")

best_values_nn["BPI Challenge 2012"] = best_values

##### Cross-Validation Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression
# Candidate values for `C` and `tol` of logistic regression.
parameters = {'C': (0.1, 0.25, 0.5), 'tol': (0.001, 0.0005, 0.0015)}

# Best parameter dict found for each decision point.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(
        dp_dataset,
        numeric_attributes=["AMOUNT_REQ"],
    )
    # Scale first, then encode, both fitted on this decision point's data.
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index
                          for index, transition in enumerate(set(y_train))}
    y_train_mapped = list(map(transition_int_map.__getitem__, y_train))

    # Grid search over `parameters` with cv=2, scored by weighted F1.
    lr_base = LogisticRegression()
    lr_grid = GridSearchCV(estimator=lr_base, param_grid=parameters,
                           scoring='f1_weighted', n_jobs=-1, cv=2)
    lr_grid.fit(X_train, y_train_mapped)

    print(f"{dp}:\t{round(lr_grid.best_score_,4)}\t {lr_grid.best_params_}")
    best_values[dp] = lr_grid.best_params_

# Final value per parameter: per-decision-point optima weighted by dataset
# size and divided by `total_size`.
lr_param = {}
for param in parameters:
    weighted_total = sum(val[param] * len(dp_dataset_map[key]) for key, val in best_values.items())
    lr_param[param] = weighted_total / total_size
print(f"final value: {lr_param}")

best_values_lr["BPI Challenge 2012"] = best_values

##### Cross-Validation XGBoost:

In [None]:
from xgboost import XGBClassifier
# Candidate tree depths and ensemble sizes explored by the grid search.
parameters = {'max_depth': (1, 2, 3, 4, 6), 'n_estimators': (20, 100, 150)}

# Best parameter dict found for each decision point.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(
        dp_dataset,
        numeric_attributes=["AMOUNT_REQ"],
    )
    # Fit the encoder on this decision point's features and apply it.
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index
                          for index, transition in enumerate(set(y_train))}
    y_train_mapped = list(map(transition_int_map.__getitem__, y_train))

    # Grid search over `parameters` with cv=2, scored by weighted F1.
    xgb_base = XGBClassifier()
    xgb_grid = GridSearchCV(estimator=xgb_base, param_grid=parameters,
                            scoring='f1_weighted', n_jobs=-1, cv=2)
    xgb_grid.fit(X_train, y_train_mapped)

    print(f"{dp}:\t{round(xgb_grid.best_score_,4)}\t {xgb_grid.best_params_}")
    best_values[dp] = xgb_grid.best_params_

# Final value per parameter: size-weighted mean of the per-decision-point
# optima, rounded to the nearest integer.
xgb_param = {}
for param in parameters:
    weighted_total = sum(val[param] * len(dp_dataset_map[key]) for key, val in best_values.items())
    xgb_param[param] = round(weighted_total / total_size)
print(f"final value: {xgb_param}")

best_values_xgb["BPI Challenge 2012"] = best_values

##### Cross-Validation Random Forest:

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Candidate tree depths and `min_impurity_decrease` values explored by the grid search.
parameters = {'max_depth': (1, 2, 3, 4, 6), 'min_impurity_decrease': (0, 0.01, 0.05, 0.1, 0.15)}

# Best parameter dict found for each decision point.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(
        dp_dataset,
        numeric_attributes=["AMOUNT_REQ"],
    )
    # Fit the encoder on this decision point's features and apply it.
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index
                          for index, transition in enumerate(set(y_train))}
    y_train_mapped = list(map(transition_int_map.__getitem__, y_train))

    # Grid search over `parameters`, scored by weighted F1.
    rf_base = RandomForestClassifier()
    rf_grid = GridSearchCV(estimator=rf_base, param_grid=parameters,
                           scoring='f1_weighted', n_jobs=-1)
    rf_grid.fit(X_train, y_train_mapped)

    print(f"{dp}:\t{round(rf_grid.best_score_,4)}\t {rf_grid.best_params_}")
    best_values[dp] = rf_grid.best_params_

# Final value per parameter: per-decision-point optima weighted by dataset
# size and divided by `total_size`.
rf_param = {}
for param in parameters:
    weighted_total = sum(val[param] * len(dp_dataset_map[key]) for key, val in best_values.items())
    rf_param[param] = weighted_total / total_size
# The averaged depth is rounded to the nearest integer.
rf_param['max_depth'] = round(rf_param['max_depth'])
print(f"final value: {rf_param}")

best_values_rf["BPI Challenge 2012"] = best_values

----------------------------------------------------------------------------------

### BPI Challenge 2019

#### Data Preparation

In [None]:
import pm4py
# Parse the complete BPI Challenge 2019 event log.
bpi_2019_xes_path = 'BPI_Challenge_2019.xes'
log_all = pm4py.read_xes(bpi_2019_xes_path)

In [None]:
# Collect every event timestamp once, then report the period the log covers.
event_timestamps = [event["time:timestamp"] for trace in log_all for event in trace]
last_timestamp = max(event_timestamps)
print("Last Timestamp:", last_timestamp)
first_timestamp = min(event_timestamps)
print("First Timestamp:", first_timestamp)

# Restrict the log to the chosen time window using pm4py's
# 'traces_contained' mode.
time_filtered_log = pm4py.filter_time_range(
    log_all,
    "2018-09-01 00:00:00",
    "2018-12-01 00:00:00",
    mode='traces_contained',
)

In [None]:
# Report the activity names and the number of trace variants that remain
# after the time filter.
activity_values = pm4py.get_event_attribute_values(time_filtered_log, "concept:name")
print(f'activities: {list(activity_values.keys())}')
variants = pm4py.get_variants_as_tuples(time_filtered_log)
print("Number of different trace variants: ", len(variants))

In [None]:
from exdpn.petri_net import get_petri_net
# Activities whose events are retained in the subtraces.
retained_activities = [
    'Create Purchase Order Item', 'Vendor creates invoice', 'Record Goods Receipt',
    'Record Invoice Receipt', 'Clear Invoice', 'Record Service Entry Sheet',
    'Cancel Goods Receipt', 'Vendor creates debit memo', 'Cancel Invoice Receipt',
    'Change Delivery Indicator', 'Remove Payment Block', 'Change Price',
    'Delete Purchase Order Item', 'Change Quantity', 'Change Final Invoice Indicator',
    'Receive Order Confirmation', 'Cancel Subsequent Invoice',
    'Reactivate Purchase Order Item', 'Update Order Confirmation',
    'Block Purchase Order Item', 'Change Approval for Purchase Order',
    'Release Purchase Order', 'Record Subsequent Invoice', 'Set Payment Block',
    'Create Purchase Requisition Item', 'Change Storage Location', 'Change Currency',
    'Change payment term', 'Change Rejection Indicator', 'Release Purchase Requisition',
]
# Event-level filter on the time-filtered log that retains the listed activities.
log = pm4py.filter_event_attribute_values(
    time_filtered_log, "concept:name", retained_activities, level="event", retain=True)

subtrace_variants = pm4py.get_variants_as_tuples(log)
print("Number of different trace variants - subtraces: ", len(subtrace_variants))



In [None]:
# Inspect which activities open the event-filtered traces.
log_start = pm4py.get_start_activities(log)
print(log_start)
# Keep the traces that start with 'Create Purchase Order Item'. The activity
# is passed as a list, matching the end filter below, instead of a bare string.
filtered_log = pm4py.filter_start_activities(log, ['Create Purchase Order Item'])
print("Number of different trace variants - subtraces: ", len(pm4py.get_variants_as_tuples(filtered_log)))

# Inspect which activities close the event-filtered traces.
end_activities = pm4py.get_end_activities(log)
print(end_activities)
# Chain the end filter onto `filtered_log`. Previously it was applied to
# `log`, which silently discarded the start-activity filter above.
filtered_log = pm4py.filter_end_activities(filtered_log, ["Clear Invoice"])

print("Number of different trace variants - subtraces: ", len(pm4py.get_variants_as_tuples(filtered_log)))

In [None]:
# Only the 10 most frequent trace variants are kept; rarer variants are
# discarded to reduce the number of edge cases.
top_k_variants = 10
log = pm4py.filter_variants_top_k(filtered_log, top_k_variants)

# Persist the filtered log so the analysis section can reload it.
pm4py.write_xes(log, "BPI_Challenge_2019_filtered_top_k.xes")

# Cell output: number of traces that remain.
len(log)

#### Analysis

In [None]:
import pm4py
# Reload the filtered BPI Challenge 2019 log written during data preparation.
filtered_xes_path = "BPI_Challenge_2019_filtered_top_k.xes"
log = pm4py.read_xes(filtered_xes_path)

In [None]:
from exdpn.petri_net import get_petri_net
# Mine a Petri net (miner_type='IM') from the filtered log and unpack the
# net together with its initial and final marking.
mined_model = get_petri_net(log, miner_type='IM')
net, im, fm = mined_model

In [None]:
# Event-level attributes: keep an attribute only if the largest entry of its
# value statistics is not 1 and its name does not contain "ID".
event_attrs = []
for attr in list(pm4py.get_event_attributes(log)):
    value_stats = pm4py.get_event_attribute_values(log, attr)
    if max(value_stats.values()) != 1 and "ID" not in attr:
        event_attrs.append(attr)

# Attributes excluded explicitly (raises ValueError if one is absent).
for excluded_attr in ("time:timestamp", "org:resource", "User"):
    event_attrs.remove(excluded_attr)

# Trace-level attributes are filtered with the same rule as the event attributes.
trace_attrs = []
for attr in list(pm4py.get_trace_attributes(log)):
    value_stats = pm4py.get_trace_attribute_values(log, attr)
    if max(value_stats.values()) != 1 and "ID" not in attr:
        trace_attrs.append(attr)

for excluded_attr in ("Name", "Item", "Purchasing Document"):
    trace_attrs.remove(excluded_attr)

#### Hyperparameter Selection

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np 
from exdpn.petri_net import get_petri_net
from exdpn.guard_datasets import extract_all_datasets
from exdpn.data_preprocessing import basic_data_preprocessing
from exdpn.data_preprocessing.data_preprocessing import apply_ohe, apply_scaling, fit_scaling, fit_ohe
from exdpn.guards import ML_Technique

# Prepare data: extract one dataset per decision point of the mined net.
dp_dataset_map = extract_all_datasets(
    log=log,
    net=net,
    initial_marking=im,
    final_marking=fm,
    event_level_attributes=event_attrs,
    case_level_attributes=trace_attrs,
)

# Decision points, in the order the cross-validation cells iterate over them.
dps = list(dp_dataset_map)

##### Cross-Validation Decision Tree (with respect to explainability):

We have observed that decision trees tend to grow enormously large when no hyperparameters constrain them. We therefore search for an optimal `min_impurity_decrease` value.

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Candidate values for `min_impurity_decrease` explored by the grid search.
parameters = {'min_impurity_decrease': (0, 0.01, 0.05, 0.1, 0.15)}

# Best `min_impurity_decrease` found for each decision point.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    # Fit the encoder on this decision point's features and apply it.
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index
                          for index, transition in enumerate(set(y_train))}
    y_train_mapped = list(map(transition_int_map.__getitem__, y_train))

    # Grid search over `parameters`, scored by weighted F1.
    dt_base = DecisionTreeClassifier()
    dt_grid = GridSearchCV(estimator=dt_base, param_grid=parameters,
                           scoring='f1_weighted', n_jobs=-1)
    dt_grid.fit(X_train, y_train_mapped)

    print(f"{dp}:\t{round(dt_grid.best_score_,4)}\t {dt_grid.best_params_}")
    best_values[dp] = dt_grid.best_params_['min_impurity_decrease']

# Final value: mean of the per-decision-point optima, weighted by dataset size.
weighted_total = sum(val * len(dp_dataset_map[key]) for key, val in best_values.items())
n_samples = sum(len(dp_dataset_map[key]) for key in best_values)
min_impurity_decrease = weighted_total / n_samples
print(f"final value: {min_impurity_decrease}")

best_values_dt["BPI Challenge 2019"] = best_values

##### Cross-Validation Neural Network:

In [None]:
from sklearn.neural_network import MLPClassifier
# Candidate network architectures explored by the grid search.
parameters = {'hidden_layer_sizes': ((5, ), (5, 5), (10, 10), (5, 10, 5))}

# Best `hidden_layer_sizes` found for each decision point.
best_values = {}
# Size and key of the largest decision-point dataset seen so far; the best
# architecture of that decision point becomes the final value.
max_ds_size = -1
max_ds_key = None

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    # Update size and key together. Previously only the key was updated, so
    # every dataset compared against -1 and the last decision point always won.
    if len(dp_dataset) > max_ds_size:
        max_ds_size = len(dp_dataset)
        max_ds_key = dp
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    # Scale first, then encode, both fitted on this decision point's data.
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    # Grid search over `parameters`, scored by weighted F1.
    nn_base = MLPClassifier()
    nn_grid = GridSearchCV(nn_base, parameters, n_jobs=-1, scoring='f1_weighted')

    nn_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(nn_grid.best_score_,4)}\t {nn_grid.best_params_}")
    best_values[dp] = nn_grid.best_params_['hidden_layer_sizes']

# Architectures cannot be averaged, so the optimum of the largest dataset is used.
hidden_layer_sizes = best_values[max_ds_key]
print(f"final value: {hidden_layer_sizes}")

best_values_nn["BPI Challenge 2019"] = best_values

##### Cross-Validation Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression
# Candidate values for the parameter `C` of logistic regression.
parameters = {'C': (0.1, 0.25)}

# Best `C` found for each decision point.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    # Scale first, then encode, both fitted on this decision point's data.
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index
                          for index, transition in enumerate(set(y_train))}
    y_train_mapped = list(map(transition_int_map.__getitem__, y_train))

    # Grid search over `parameters`, scored by weighted F1.
    lr_base = LogisticRegression()
    lr_grid = GridSearchCV(estimator=lr_base, param_grid=parameters,
                           scoring='f1_weighted', n_jobs=-1)
    lr_grid.fit(X_train, y_train_mapped)

    print(f"{dp}:\t{round(lr_grid.best_score_,4)}\t {lr_grid.best_params_}")
    best_values[dp] = lr_grid.best_params_['C']

# Final value: mean of the per-decision-point optima, weighted by dataset size.
weighted_total = sum(val * len(dp_dataset_map[key]) for key, val in best_values.items())
n_samples = sum(len(dp_dataset_map[key]) for key in best_values)
C_lr = weighted_total / n_samples
print(f"final value: {C_lr}")

best_values_lr["BPI Challenge 2019"] = best_values

##### Cross-Validation SVM:

In [None]:
from sklearn.svm import LinearSVC
# Candidate values for the parameter `C` of the linear SVM.
parameters = {'C': (0.1, 0.25)}

# Best `C` found for each decision point.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    # Scale first, then encode, both fitted on this decision point's data.
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)
    # Map every distinct target transition to an integer class label.
    transition_int_map = {transition: index
                          for index, transition in enumerate(set(y_train))}
    y_train_mapped = list(map(transition_int_map.__getitem__, y_train))

    # Grid search over `parameters`, scored by weighted F1.
    svm_base = LinearSVC()
    svm_grid = GridSearchCV(estimator=svm_base, param_grid=parameters,
                            scoring='f1_weighted', n_jobs=-1)
    svm_grid.fit(X_train, y_train_mapped)

    print(f"{dp}:\t{round(svm_grid.best_score_,4)}\t {svm_grid.best_params_}")
    best_values[dp] = svm_grid.best_params_['C']

# Final value: mean of the per-decision-point optima, weighted by dataset size.
weighted_total = sum(val * len(dp_dataset_map[key]) for key, val in best_values.items())
n_samples = sum(len(dp_dataset_map[key]) for key in best_values)
C_svm = weighted_total / n_samples
print(f"final value: {C_svm}")

best_values_svm["BPI Challenge 2019"] = best_values

##### Cross-Validation XGBoost:

In [None]:
import re
from xgboost import XGBClassifier

# Candidate tree depths explored by the grid search.
parameters = {'max_depth': (1, 3, 5, 10)}

# The data sets contain special characters ('[', ']', '<') in their column
# names which XGBoost cannot handle; this pattern matches them.
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

# Best max_depth per decision point, filled in by the loop below.
best_values = {}
for dp in dps:
    dp_dataset = dp_dataset_map[dp]

    # Preprocess with impute=True and one-hot encode (no scaling in this cell).
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)

    # Replace the special characters in column names by underscores; columns
    # without such characters are left untouched.
    # solution from: https://stackoverflow.com/questions/48645846/pythons-xgoost-valueerrorfeature-names-may-not-contain-or
    X_train.columns = [
        regex.sub("_", col) if regex.search(str(col)) else col
        for col in X_train.columns.values
    ]

    # Encode every distinct target transition as an integer class label.
    distinct_transitions = list(set(y_train))
    transition_int_map = dict(zip(distinct_transitions, range(len(distinct_transitions))))
    y_train_mapped = [transition_int_map[label] for label in y_train]

    # Grid search over max_depth, scored by weighted F1.
    xgb_base = XGBClassifier()
    xgb_grid = GridSearchCV(estimator=xgb_base, param_grid=parameters,
                            scoring='f1_weighted', n_jobs=-1)
    xgb_grid.fit(X_train, y_train_mapped)

    best_score = round(xgb_grid.best_score_, 4)
    print(f"{dp}:\t{best_score}\t {xgb_grid.best_params_}")
    best_values[dp] = xgb_grid.best_params_['max_depth']

# Final depth: size-weighted mean of the per-decision-point optima, rounded
# to an integer.
weighted_total = sum(best_depth * len(dp_dataset_map[dp_key]) for dp_key, best_depth in best_values.items())
total_size = sum(len(dp_dataset_map[dp_key]) for dp_key in best_values)
max_depth_xgb = round(weighted_total / total_size)
print(f"final value: {max_depth_xgb}")

best_values_xgb["BPI Challenge 2019"] = best_values

##### Cross-Validation Random Forest:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Candidate forest sizes explored by the grid search.
parameters = {'n_estimators': (10, 20, 50)}

# Best n_estimators per decision point, filled in by the loop below.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]

    # Preprocess with impute=True and one-hot encode (no scaling in this cell).
    X_train, y_train = basic_data_preprocessing(dp_dataset, impute=True)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)

    # Encode every distinct target transition as an integer class label.
    distinct_transitions = list(set(y_train))
    transition_int_map = dict(zip(distinct_transitions, range(len(distinct_transitions))))
    y_train_mapped = [transition_int_map[label] for label in y_train]

    # Grid search over n_estimators, scored by weighted F1.
    rf_base = RandomForestClassifier()
    rf_grid = GridSearchCV(estimator=rf_base, param_grid=parameters,
                           scoring='f1_weighted', n_jobs=-1)
    rf_grid.fit(X_train, y_train_mapped)

    best_score = round(rf_grid.best_score_, 4)
    print(f"{dp}:\t{best_score}\t {rf_grid.best_params_}")
    best_values[dp] = rf_grid.best_params_['n_estimators']

# Final forest size: size-weighted mean of the per-decision-point optima,
# rounded to an integer.
weighted_total = sum(best_n * len(dp_dataset_map[dp_key]) for dp_key, best_n in best_values.items())
total_size = sum(len(dp_dataset_map[dp_key]) for dp_key in best_values)
n_estimators = round(weighted_total / total_size)
print(f"final value: {n_estimators}")

best_values_rf["BPI Challenge 2019"] = best_values

------------------------------------------------------------------------------

### BPI Challenge 2017

#### Data Preparation

In [None]:
import pm4py

# Read the complete BPI Challenge 2017 event log from the working directory.
bpi2017_xes_path = 'BPI Challenge 2017.xes'
log_all = pm4py.read_xes(bpi2017_xes_path)

In [None]:
# List the distinct activity names ("concept:name" values) found in the log.
activity_values = pm4py.get_event_attribute_values(log_all, "concept:name")
print(f'activities: {list(activity_values.keys())}')

In [None]:
# Only the subtraces made of activities starting with an 'O' are analysed.
# Semantically, these are the events that belong to the offer of a trace.
offer_activities = [
    'O_Create Offer', 'O_Created', 'O_Sent (mail and online)', 'O_Returned',
    'O_Accepted', 'O_Cancelled', 'O_Refused', 'O_Sent (online only)',
]
log = pm4py.filter_event_attribute_values(
    log_all, "concept:name", offer_activities, level="event", retain=True)

subtrace_variants = pm4py.get_variants_as_tuples(log)
print("Number of different trace variants - subtraces: ", len(subtrace_variants))

# Variants to keep: the common prefix, one of the two send steps, and one of
# five outcomes. The nested iteration yields the ten variants in the order
# mail-and-online first, online-only second.
offer_prefix = ('O_Create Offer', 'O_Created')
send_steps = ('O_Sent (mail and online)', 'O_Sent (online only)')
offer_outcomes = (
    ('O_Returned', 'O_Accepted'),
    ('O_Refused',),
    ('O_Returned', 'O_Cancelled'),
    ('O_Cancelled',),
    ('O_Returned', 'O_Refused'),
)
kept_variants = [
    offer_prefix + (send_step,) + outcome
    for send_step in send_steps
    for outcome in offer_outcomes
]
filtered_log = pm4py.filter_variants(log, kept_variants)

filtered_variants = pm4py.get_variants_as_tuples(filtered_log)
print("Number of different trace variants - filtered subtraces: ", len(filtered_variants))

# Persist the filtered log; a later cell reads it back from this file.
pm4py.write_xes(filtered_log, "BPIChallenge2017_filtered.xes")

In [None]:
# We discard less frequent variants in order to reduce
# the number of edge cases.
# NOTE: this step is currently disabled for BPI Challenge 2017; the lines
# below are kept for reference only.
#log_top_k = pm4py.filter_variants_top_k(filtered_log, 5)
#print("Number of different trace variants - filtered subtraces top k:", len(pm4py.get_variants_as_tuples(log_top_k)))
#pm4py.write_xes(log_top_k, "BPIChallenge2017_filtered_top_k.xes")

#### Analysis

In [None]:
import pm4py

# Read the filtered BPI Challenge 2017 log from the file
# 'BPIChallenge2017_filtered.xes' in the working directory.
filtered_xes_path = 'BPIChallenge2017_filtered.xes'
log = pm4py.read_xes(filtered_xes_path)

In [None]:
# All trace-level attributes are used as case-level features.
trace_attrs = list(pm4py.get_trace_attributes(log))

# Event-level attributes: drop those whose largest entry in
# get_event_attribute_values is 1 (presumably attributes whose values are all
# unique - confirm against the pm4py docs) and those with "ID" in their name.
event_attrs = []
for attr in pm4py.get_event_attributes(log):
    value_counts = pm4py.get_event_attribute_values(log, attr)
    if max(value_counts.values()) == 1:
        continue
    if "ID" in attr:
        continue
    event_attrs.append(attr)

# The timestamp and the resource are excluded explicitly; list.remove raises
# ValueError if either is not present.
for excluded_attr in ("time:timestamp", "org:resource"):
    event_attrs.remove(excluded_attr)

#### Hyperparameter Selection

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np
from exdpn.petri_net import get_petri_net
from exdpn.guard_datasets import extract_all_datasets
from exdpn.data_preprocessing import basic_data_preprocessing
from exdpn.data_preprocessing.data_preprocessing import apply_ohe, apply_scaling, fit_scaling, fit_ohe

# Mine the Petri net for the BPI Challenge 2017 log. Without this step the
# cell would silently reuse `net`, `im` and `fm` left over from the previous
# event log (or fail with a NameError on a fresh kernel).
# NOTE(review): this uses the default miner of get_petri_net; pass the same
# miner_type as for the other event logs if they use a non-default one.
net, im, fm = get_petri_net(log)

# prepare data: one data set per decision point of the Petri net, built from
# the selected event-level and case-level attributes
dp_dataset_map = extract_all_datasets(log=log, net=net, initial_marking=im, final_marking=fm,
                                      event_level_attributes=event_attrs,
                                      case_level_attributes=trace_attrs)

# decision points
dps = list(dp_dataset_map.keys())

##### Cross-Validation Decision Tree:

We have observed that decision trees tend to grow enormously large when no hyperparameters are set. We therefore search for an optimal `min_impurity_decrease` value.

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Candidate min_impurity_decrease values explored by the grid search.
parameters = {'min_impurity_decrease':(0, 0.01, 0.05, 0.1, 0.15)}

# Best min_impurity_decrease per decision point, filled in by the loop below.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]

    # Preprocess and one-hot encode the features (no scaling in this cell).
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)

    # Encode every distinct target transition as an integer class label.
    distinct_transitions = list(set(y_train))
    transition_int_map = dict(zip(distinct_transitions, range(len(distinct_transitions))))
    y_train_mapped = [transition_int_map[label] for label in y_train]

    # Grid search over min_impurity_decrease, scored by weighted F1.
    dt_base = DecisionTreeClassifier()
    dt_grid = GridSearchCV(estimator=dt_base, param_grid=parameters,
                           scoring='f1_weighted', n_jobs=-1)
    dt_grid.fit(X_train, y_train_mapped)

    best_score = round(dt_grid.best_score_, 4)
    print(f"{dp}:\t{best_score}\t {dt_grid.best_params_}")
    best_values[dp] = dt_grid.best_params_['min_impurity_decrease']

# Final value: mean of the per-decision-point optima, weighted by data set
# length.
weighted_total = sum(best_mid * len(dp_dataset_map[dp_key]) for dp_key, best_mid in best_values.items())
total_size = sum(len(dp_dataset_map[dp_key]) for dp_key in best_values)
min_impurity_decrease = weighted_total / total_size
print(f"final value: {min_impurity_decrease}")

best_values_dt["BPI Challenge 2017"] = best_values

##### Cross-Validation SVM:

In [None]:
from sklearn.svm import LinearSVC

# Candidate values of the LinearSVC parameter C explored by the grid search.
parameters = {'C':(0.1, 0.25, 0.5)}

# Best C per decision point, filled in by the loop below.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]

    # Preprocess, then fit and apply the scaling and one-hot-encoding helpers
    # on the training features.
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)

    # Encode every distinct target transition as an integer class label.
    distinct_transitions = list(set(y_train))
    transition_int_map = dict(zip(distinct_transitions, range(len(distinct_transitions))))
    y_train_mapped = [transition_int_map[label] for label in y_train]

    # Grid search over C, scored by weighted F1.
    svm_base = LinearSVC()
    svm_grid = GridSearchCV(estimator=svm_base, param_grid=parameters,
                            scoring='f1_weighted', n_jobs=-1)
    svm_grid.fit(X_train, y_train_mapped)

    best_score = round(svm_grid.best_score_, 4)
    print(f"{dp}:\t{best_score}\t {svm_grid.best_params_}")
    best_values[dp] = svm_grid.best_params_['C']

# Final C: mean of the per-decision-point optima, weighted by data set length.
weighted_total = sum(best_c * len(dp_dataset_map[dp_key]) for dp_key, best_c in best_values.items())
total_size = sum(len(dp_dataset_map[dp_key]) for dp_key in best_values)
C_svm = weighted_total / total_size
print(f"final value: {C_svm}")

best_values_svm["BPI Challenge 2017"] = best_values

##### Cross-Validation Neural Network:

In [None]:
from sklearn.neural_network import MLPClassifier

# Candidate hidden-layer layouts explored by the grid search.
parameters = {'hidden_layer_sizes': ((5, ), (5, 5), (10, 10), (5, 10, 5), (10, 20, 10))}

# Best layout per decision point, filled in by the loop below.
best_values = {}

# Layouts cannot be averaged, so the final choice is the optimum of the
# decision point with the largest data set; these two track that data set.
max_ds_size = -1
max_ds_key = None

for dp in dps:
    dp_dataset = dp_dataset_map[dp]

    # Update size and key together. Updating only the key (as before) left
    # max_ds_size at -1, so the last decision point always won regardless of
    # its data set size.
    if len(dp_dataset) > max_ds_size:
        max_ds_size = len(dp_dataset)
        max_ds_key = dp

    # Preprocess, then fit and apply the scaling and one-hot-encoding helpers
    # on the training features.
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)

    # Encode every distinct target transition as an integer class label.
    transition_int_map = {transition: index for index,
                          transition in enumerate(list(set(y_train)))}
    y_train_mapped = [transition_int_map[transition] for transition in y_train]

    # Grid search over hidden_layer_sizes, scored by weighted F1.
    nn_base = MLPClassifier()
    nn_grid = GridSearchCV(nn_base, parameters, n_jobs=-1, scoring='f1_weighted')

    nn_grid.fit(X_train, y_train_mapped)
    print(f"{dp}:\t{round(nn_grid.best_score_,4)}\t {nn_grid.best_params_}")
    best_values[dp] = nn_grid.best_params_['hidden_layer_sizes']

# Final layout: the optimum found for the largest data set.
hidden_layer_sizes = best_values[max_ds_key]
print(f"final value: {hidden_layer_sizes}")

best_values_nn["BPI Challenge 2017"] = best_values

##### Cross-Validation Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression

# Candidate values of the LogisticRegression parameter C explored by the
# grid search.
parameters = {'C': (0.1, 0.25, 0.5)}

# Best C per decision point, filled in by the loop below.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]

    # Preprocess, then fit and apply the scaling and one-hot-encoding helpers
    # on the training features.
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    scaler, scaler_columns = fit_scaling(X_train)
    X_train = apply_scaling(X_train, scaler, scaler_columns)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)

    # Encode every distinct target transition as an integer class label.
    distinct_transitions = list(set(y_train))
    transition_int_map = dict(zip(distinct_transitions, range(len(distinct_transitions))))
    y_train_mapped = [transition_int_map[label] for label in y_train]

    # Grid search over C, scored by weighted F1.
    lr_base = LogisticRegression()
    lr_grid = GridSearchCV(estimator=lr_base, param_grid=parameters,
                           scoring='f1_weighted', n_jobs=-1)
    lr_grid.fit(X_train, y_train_mapped)

    best_score = round(lr_grid.best_score_, 4)
    print(f"{dp}:\t{best_score}\t {lr_grid.best_params_}")
    best_values[dp] = lr_grid.best_params_['C']

# Final C: mean of the per-decision-point optima, weighted by data set length.
weighted_total = sum(best_c * len(dp_dataset_map[dp_key]) for dp_key, best_c in best_values.items())
total_size = sum(len(dp_dataset_map[dp_key]) for dp_key in best_values)
C_lr = weighted_total / total_size
print(f"final value: {C_lr}")

best_values_lr["BPI Challenge 2017"] = best_values

##### Cross-Validation XGBoost:

In [None]:
from xgboost import XGBClassifier

# Candidate tree depths explored by the grid search.
parameters = {'max_depth': (1, 2, 3, 4, 6)}

# Best max_depth per decision point, filled in by the loop below.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]

    # Preprocess and one-hot encode the features (no scaling in this cell).
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)

    # Encode every distinct target transition as an integer class label.
    distinct_transitions = list(set(y_train))
    transition_int_map = dict(zip(distinct_transitions, range(len(distinct_transitions))))
    y_train_mapped = [transition_int_map[label] for label in y_train]

    # Grid search over max_depth, scored by weighted F1.
    xgb_base = XGBClassifier()
    xgb_grid = GridSearchCV(estimator=xgb_base, param_grid=parameters,
                            scoring='f1_weighted', n_jobs=-1)
    xgb_grid.fit(X_train, y_train_mapped)

    best_score = round(xgb_grid.best_score_, 4)
    print(f"{dp}:\t{best_score}\t {xgb_grid.best_params_}")
    best_values[dp] = xgb_grid.best_params_['max_depth']

# Final depth: size-weighted mean of the per-decision-point optima, rounded
# to an integer.
weighted_total = sum(best_depth * len(dp_dataset_map[dp_key]) for dp_key, best_depth in best_values.items())
total_size = sum(len(dp_dataset_map[dp_key]) for dp_key in best_values)
max_depth_xgb = round(weighted_total / total_size)
print(f"final value: {max_depth_xgb}")

best_values_xgb["BPI Challenge 2017"] = best_values

##### Cross-Validation Random Forest:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Candidate tree depths explored by the grid search.
parameters = {'max_depth': (1, 2, 3, 4, 6)}

# Best max_depth per decision point, filled in by the loop below.
best_values = {}

for dp in dps:
    dp_dataset = dp_dataset_map[dp]

    # Preprocess and one-hot encode the features (no scaling in this cell).
    X_train, y_train = basic_data_preprocessing(dp_dataset)
    ohe = fit_ohe(X_train)
    X_train = apply_ohe(X_train, ohe)

    # Encode every distinct target transition as an integer class label.
    distinct_transitions = list(set(y_train))
    transition_int_map = dict(zip(distinct_transitions, range(len(distinct_transitions))))
    y_train_mapped = [transition_int_map[label] for label in y_train]

    # Grid search over max_depth, scored by weighted F1.
    rf_base = RandomForestClassifier()
    rf_grid = GridSearchCV(estimator=rf_base, param_grid=parameters,
                           scoring='f1_weighted', n_jobs=-1)
    rf_grid.fit(X_train, y_train_mapped)

    best_score = round(rf_grid.best_score_, 4)
    print(f"{dp}:\t{best_score}\t {rf_grid.best_params_}")
    best_values[dp] = rf_grid.best_params_['max_depth']

# Final depth: size-weighted mean of the per-decision-point optima, rounded
# to an integer.
weighted_total = sum(best_depth * len(dp_dataset_map[dp_key]) for dp_key, best_depth in best_values.items())
total_size = sum(len(dp_dataset_map[dp_key]) for dp_key in best_values)
max_depth_rf = round(weighted_total / total_size)
print(f"final value: {max_depth_rf}")

best_values_rf["BPI Challenge 2017"] = best_values

----------------------------------------------------------------------------------

### Summary

#### Decision Tree:

In [None]:
# Decision tree results, keyed by event-log name; each entry maps a decision
# point to the best `min_impurity_decrease` found for it.
best_values_dt

#### Logistic Regression:

In [None]:
# Logistic regression results, keyed by event-log name; each entry maps a
# decision point to the best `C` found for it.
best_values_lr

#### SVM:

In [None]:
# Linear SVM results, keyed by event-log name; each entry maps a decision
# point to the best `C` found for it.
best_values_svm

#### Neural Network:

In [None]:
# Neural network results, keyed by event-log name; each entry maps a decision
# point to the best `hidden_layer_sizes` found for it.
best_values_nn

#### Random Forest:

In [None]:
# Random forest results, keyed by event-log name; each entry maps a decision
# point to its best grid-search value.
# NOTE(review): the tuned hyperparameter is not the same for every log: the
# BPI Challenge 2019 entry stores `n_estimators`, the BPI Challenge 2017 entry
# stores `max_depth`, so the numbers are not directly comparable.
best_values_rf

#### XGBoost:

In [None]:
# XGBoost results, keyed by event-log name; each entry maps a decision point
# to the best `max_depth` found for it.
best_values_xgb