### Initializations for python environment

In [2]:
# first time to create venv and install packages

# %conda create -n corrid-dev anaconda -y
# %conda activate corrid-dev -y

# %conda install -n corrid-dev pandas -y

# %conda install pip

### Preprocess logs: Read, Encode categorical attributes by One Hot Encoder, and Store

In [3]:
# encode logs (string features with OneHotEncoder)
import pandas as pd

logs_folder = './logs/'
bpic17_logs_with_interventions_path = 'data_bpic17_readyToUse_preprocessed_for_adaptation_classification.csv'
bpic17_logs_with_interventions_path_encoded = 'data_bpic17_readyToUse_preprocessed_for_adaptation_classification_encoded.csv'

def encode(dataframe):
    from sklearn import preprocessing
    enc = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=int)
    return enc.fit_transform(dataframe), enc

def store_logs(dataframe, path):
    dataframe.to_csv(logs_folder + path, columns=dataframe.columns, index=False)

def load_logs(path):
    return pd.read_csv(logs_folder + path, quotechar="'")

print(f'encode: {bpic17_logs_with_interventions_path}')
data_df = load_logs(bpic17_logs_with_interventions_path)

column_names_to_encode = ['ApplicationType', 'LoanGoal',]
columns_to_encode = data_df.loc[:, column_names_to_encode]
encoded_columns, encoder = encode(columns_to_encode)
encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out())

data_df_without_encoded_columns = data_df.drop(column_names_to_encode, axis='columns')
data_encoded_df = encoded_df.join(data_df_without_encoded_columns)
# data_encoded_df = data_encoded_df.reset_index()
print(data_encoded_df)

store_logs(data_encoded_df, bpic17_logs_with_interventions_path_encoded)

encode: data_bpic17_readyToUse_preprocessed_for_adaptation_classification.csv
       ApplicationType_Limit raise  ApplicationType_New credit  LoanGoal_Car  \
0                                0                           1             0   
1                                0                           1             0   
2                                0                           1             0   
3                                0                           1             1   
4                                0                           1             0   
...                            ...                         ...           ...   
31408                            0                           1             1   
31409                            0                           1             0   
31410                            0                           1             0   
31411                            0                           1             1   
31412                            0        

### Load logs & prepare for classifications

In [4]:
import numpy

def log_pre_process(csv_file_path, memory_reduction, columns_to_drop=[]):
    print(f'log_pre_process: {csv_file_path}')
    data_df = load_logs(csv_file_path)

    print('Columns before removal:')
    list_of_column_names = list(data_df.columns)
    print(list_of_column_names)

    data_df = data_df.drop(columns=columns_to_drop)

    print('Columns after removal:')
    list_of_column_names = list(data_df.columns)
    print(list_of_column_names)
    print()

    slice_start_col = 0
    slice_end_col = len(data_df.columns)-1

    values = data_df.iloc[:, slice_start_col:slice_end_col]
    classes = data_df.iloc[:, slice_end_col:]

    # enc = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=False)
    # values = enc.fit_transform(values[:2])

    # enc2 = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=False)
    # data_df = enc2.fit_transform(data_df)

    data = (values, classes)

    return data, list_of_column_names[:-1], data_df


bpic17_logs_columns_to_drop = []
bpic17_logs_with_interventions, bpic17_logs_with_interventions_column_names, bpic17_logs_with_interventions_df = log_pre_process(bpic17_logs_with_interventions_path_encoded, memory_reduction=False, columns_to_drop=bpic17_logs_columns_to_drop)

synthetic_logs_with_adaptations_path = 'sample_sequence_simulation_logs_multi_adapted_noisy_encoded.csv'
synthetic_logs_columns_to_drop = ["event:event_name@A","event:event_name@B","event:event_name@C","event:event_name@D","event:event_name@E","event:event_name@F","event:event_name@I","event:event_name@process end event","event:event_name@process start event","event:concept:name@A","event:concept:name@B","event:concept:name@C","event:concept:name@D","event:concept:name@E","event:concept:name@F","event:concept:name@I","event:concept:name@process end event","event:concept:name@process start event","event:adaptation_action@insert","event:adaptation_action@no-action","event:adaptation_action@skip","event:@@index","event:@@case_index","event:trace_id","succession:concept:name@A#B","succession:concept:name@B#C","succession:concept:name@C#D","succession:concept:name@D#E","succession:concept:name@D#F","succession:concept:name@E#F","succession:concept:name@F#process end event","succession:concept:name@process end event#I","succession:concept:name@process start event#A", "event:resource", "event:duration"]
synthetic_logs_with_adaptations, synthetic_logs_with_adaptations_column_names, synthetic_logs_with_adaptations_df = log_pre_process(synthetic_logs_with_adaptations_path, memory_reduction=False, columns_to_drop=synthetic_logs_columns_to_drop)

log_pre_process: data_bpic17_readyToUse_preprocessed_for_adaptation_classification_encoded.csv
Columns before removal:
['ApplicationType_Limit raise', 'ApplicationType_New credit', 'LoanGoal_Car', 'LoanGoal_Caravan / Camper', 'LoanGoal_Existing loan takeover', 'LoanGoal_Extra spending limit', 'LoanGoal_Home improvement', 'LoanGoal_Motorcycle', 'LoanGoal_Not speficied', 'LoanGoal_Other see explanation', 'LoanGoal_Remaining debt home', 'LoanGoal_Unknown', 'LoanGoal_other', 'RequestedAmount', 'CreditScore', 'timesincefirstcase', 'duration', 'FirstWithdrawalAmount', 'MonthlyCost', 'NumberOfTerms', 'OfferedAmount', 'open_cases', 'month', 'weekday', 'hour', 'treatment']
Columns after removal:
['ApplicationType_Limit raise', 'ApplicationType_New credit', 'LoanGoal_Car', 'LoanGoal_Caravan / Camper', 'LoanGoal_Existing loan takeover', 'LoanGoal_Extra spending limit', 'LoanGoal_Home improvement', 'LoanGoal_Motorcycle', 'LoanGoal_Not speficied', 'LoanGoal_Other see explanation', 'LoanGoal_Remaini

In [5]:
%matplotlib inline



### Classifier comparison (now just Decision Tree and print the tree model)

In [8]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn import tree
from sklearn.tree import export_text
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

outputs_folder = './outputs/'
decision_tree_output_file_name = 'output_decision_trees.txt'
multi_classification_output_file_name = 'output_multiple_classification.txt'

classifier_names = [
    "Nearest Neighbors",
    # "Linear SVM",
    # "RBF SVM",
    # "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net (Multi-layer Perceptron)",
    "AdaBoost",
    "Gaussian Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    # SVC(kernel="linear", C=0.025),
    # SVC(gamma=2, C=1),
    # GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=4, min_samples_leaf=2, ccp_alpha=0.003),
    RandomForestClassifier(max_depth=6, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

datasets_df = [
    bpic17_logs_with_interventions_df,
    synthetic_logs_with_adaptations_df
]

datasets = [
    bpic17_logs_with_interventions,
    synthetic_logs_with_adaptations
]

dataset_names = [
    "bpic17_logs_with_interventions",
    "synthetic_logs_with_adaptations"
]

dataset_feature_names = [
    bpic17_logs_with_interventions_column_names,
    synthetic_logs_with_adaptations_column_names
]

multi_classification_output_file = open(file=outputs_folder + multi_classification_output_file_name, mode='w')
decision_tree_output_file = open(file=outputs_folder + decision_tree_output_file_name, mode='w')

# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    
    X, y = ds

    out_text = f'Dataset: {dataset_names[ds_cnt]}\n\n'
    print(out_text)
    multi_classification_output_file.write(out_text)
    decision_tree_output_file.write(out_text)

    # iterate over classifiers
    for classifier_name, classifier in zip(classifier_names, classifiers):
        
        out_text_classifier = f'Classifier: {classifier_name}'
        out_text_parameters = f'\nParameters: {classifier.get_params()}'

        print(out_text_classifier)
        multi_classification_output_file.write(out_text_classifier + out_text_parameters)


        scores = cross_val_score(classifier, X, y.values.ravel(), cv=10, scoring='f1_weighted')
        
        out_text = f'\n10-fold cross-validation mean weighted F1: {numpy.mean(scores)}\n\n'

        print(out_text)
        multi_classification_output_file.write(out_text)

        if classifier_name == 'Decision Tree':
            classifier.fit(X, y.values.ravel())
            text_representation = tree.export_text(classifier, feature_names=dataset_feature_names[ds_cnt])
            decision_tree_output_file.write(out_text_classifier + out_text_parameters)
            out_text_tree = f'\n\nDecision Tree on all training data:\n{text_representation}\n\n'
            decision_tree_output_file.write(out_text_tree)

multi_classification_output_file.close()
decision_tree_output_file.close()

Dataset: bpic17_logs_with_interventions


Classifier: Nearest Neighbors

10-fold cross-validation mean weighted F1: 0.5369588705722823


Classifier: Decision Tree

10-fold cross-validation mean weighted F1: 0.7843151284101866


Classifier: Random Forest

10-fold cross-validation mean weighted F1: 0.6578081863941145


Classifier: Neural Net (Multi-layer Perceptron)

10-fold cross-validation mean weighted F1: 0.43274943807587984


Classifier: AdaBoost

10-fold cross-validation mean weighted F1: 0.6534803446805135


Classifier: Gaussian Naive Bayes

10-fold cross-validation mean weighted F1: 0.6559218882855672


Classifier: QDA





10-fold cross-validation mean weighted F1: 0.43542015153410907


Dataset: synthetic_logs_with_adaptations


Classifier: Nearest Neighbors

10-fold cross-validation mean weighted F1: 0.7586216957631601


Classifier: Decision Tree

10-fold cross-validation mean weighted F1: 0.9237324182574183


Classifier: Random Forest

10-fold cross-validation mean weighted F1: 0.8789572865722868


Classifier: Neural Net (Multi-layer Perceptron)

10-fold cross-validation mean weighted F1: 0.5948695394413764


Classifier: AdaBoost

10-fold cross-validation mean weighted F1: 0.8264897957330447


Classifier: Gaussian Naive Bayes

10-fold cross-validation mean weighted F1: 0.75808712722691


Classifier: QDA

10-fold cross-validation mean weighted F1: 0.7580498361026952


