### Preprocess logs

In [53]:
# First-time setup only: create the conda environment and install the required packages.

# %conda create -n corrid-dev anaconda -y
# %conda activate corrid-dev -y

# %conda install -n corrid-dev pandas -y

# %conda install pip

In [54]:
from sklearn import preprocessing
import numpy
import pandas as pd

# Directory (relative to the notebook's working directory) that holds the CSV logs.
# Input paths are built by prefixing file names with it, and store_logs writes its CSV output here too.
logs_folder = './logs/'

def log_pre_process(csv_file_path, memory_reduction, columns_to_drop=None):
    """Load a CSV log and split it into feature values and class labels.

    After ``columns_to_drop`` has been removed, the last remaining column is
    treated as the class label and every column before it as a feature.
    The column names are printed before and after the removal.

    Parameters
    ----------
    csv_file_path : str
        Path of the CSV file to read; fields are parsed with ``'`` as the
        quote character.
    memory_reduction : bool
        When true, the feature columns are ordinal-encoded with
        ``dtype=numpy.int8``; ``values`` is then the encoder's output instead
        of a DataFrame slice.
    columns_to_drop : optional
        Column label(s) to remove before splitting, passed unchanged to
        ``DataFrame.drop(columns=...)``. ``None`` (the default) drops nothing.

    Returns
    -------
    tuple
        ``((values, classes), feature_names, data_df)`` where ``values`` holds
        the features, ``classes`` is a one-column DataFrame with the label,
        ``feature_names`` lists the feature column names, and ``data_df`` is
        the full DataFrame after column removal.
    """
    # Use a None sentinel instead of a mutable default list, which would be
    # shared between all calls of the function.
    if columns_to_drop is None:
        columns_to_drop = []

    print(f'log_pre_process: {csv_file_path}')
    data_df = pd.read_csv(csv_file_path, quotechar="'")

    print('Columns before removal:')
    list_of_column_names = list(data_df.columns)
    print(list_of_column_names)

    data_df = data_df.drop(columns=columns_to_drop)

    print('Columns after removal:')
    list_of_column_names = list(data_df.columns)
    print(list_of_column_names)
    print()

    # Everything except the last column is a feature; the last column is the label.
    slice_start_col = 0
    slice_end_col = len(data_df.columns) - 1

    values = data_df.iloc[:, slice_start_col:slice_end_col]
    classes = data_df.iloc[:, slice_end_col:]

    if memory_reduction:
        # NOTE(review): int8 codes can represent at most 128 distinct categories
        # per column (0..127) - confirm that no feature column exceeds this.
        enc = preprocessing.OrdinalEncoder(dtype=numpy.int8)
        values = enc.fit_transform(values)

    data = (values, classes)
    return data, list_of_column_names[:-1], data_df


# BPIC17 log: no columns are dropped and the features are ordinal-encoded (memory_reduction=True).
bpic17_logs_with_interventions_path = logs_folder + 'data_bpic17_readyToUse_preprocessed_for_adaptation_classification.csv'
bpic17_logs_columns_to_drop = []
bpic17_logs_with_interventions, bpic17_logs_with_interventions_column_names, bpic17_logs_with_interventions_df = log_pre_process(bpic17_logs_with_interventions_path, memory_reduction=True, columns_to_drop=bpic17_logs_columns_to_drop)

# Synthetic log: the listed columns are removed before the feature/label split and the
# remaining features are left unencoded (memory_reduction=False).
# NOTE(review): the printed column list of this file starts with 'trace_id', which is not in
# the drop list below (only 'event:trace_id' is) - confirm it is meant to stay as a feature.
synthetic_logs_with_adaptations_path = logs_folder + 'sample_sequence_simulation_logs_multi_adapted_noisy_encoded.csv'
synthetic_logs_columns_to_drop = ["event:event_name@A","event:event_name@B","event:event_name@C","event:event_name@D","event:event_name@E","event:event_name@F","event:event_name@I","event:event_name@process end event","event:event_name@process start event","event:concept:name@A","event:concept:name@B","event:concept:name@C","event:concept:name@D","event:concept:name@E","event:concept:name@F","event:concept:name@I","event:concept:name@process end event","event:concept:name@process start event","event:adaptation_action@insert","event:adaptation_action@no-action","event:adaptation_action@skip","event:@@index","event:@@case_index","event:trace_id","succession:concept:name@A#B","succession:concept:name@B#C","succession:concept:name@C#D","succession:concept:name@D#E","succession:concept:name@D#F","succession:concept:name@E#F","succession:concept:name@F#process end event","succession:concept:name@process end event#I","succession:concept:name@process start event#A", "event:resource"]
synthetic_logs_with_adaptations, synthetic_logs_with_adaptations_column_names, synthetic_logs_with_adaptations_df = log_pre_process(synthetic_logs_with_adaptations_path, memory_reduction=False, columns_to_drop=synthetic_logs_columns_to_drop)

log_pre_process: ./logs/data_bpic17_readyToUse_preprocessed_for_adaptation_classification.csv
Columns before removal:
['ApplicationType', 'LoanGoal', 'RequestedAmount', 'CreditScore', 'timesincefirstcase', 'duration', 'FirstWithdrawalAmount', 'MonthlyCost', 'NumberOfTerms', 'OfferedAmount', 'open_cases', 'month', 'weekday', 'hour', 'treatment']
Columns after removal:
['ApplicationType', 'LoanGoal', 'RequestedAmount', 'CreditScore', 'timesincefirstcase', 'duration', 'FirstWithdrawalAmount', 'MonthlyCost', 'NumberOfTerms', 'OfferedAmount', 'open_cases', 'month', 'weekday', 'hour', 'treatment']

log_pre_process: ./logs/sample_sequence_simulation_logs_multi_adapted_noisy_encoded.csv
Columns before removal:
['trace_id', 'event:event_name@A', 'event:event_name@B', 'event:event_name@C', 'event:event_name@D', 'event:event_name@E', 'event:event_name@F', 'event:event_name@I', 'event:event_name@process end event', 'event:event_name@process start event', 'event:concept:name@A', 'event:concept:name

In [55]:
%matplotlib inline



### Classifier comparison

A comparison of several classifiers in scikit-learn.


In [56]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn import tree
from sklearn.tree import export_text
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# Display names of the classifiers to evaluate. This list is zipped with `classifiers`
# in the evaluation loop, so the two must stay index-aligned: (un)comment entries in both.
names = [
    # "Nearest Neighbors",
    # "Linear SVM",
    # "RBF SVM",

    # "Gaussian Process",
    "Decision Tree",
    # "Random Forest",
    # "Neural Net",
    # "AdaBoost",
    # "Naive Bayes",
    # "QDA",
]

# Estimator instances, in the same order as `names`.
classifiers = [
    # KNeighborsClassifier(3),
    # SVC(kernel="linear", C=0.025),
    # SVC(gamma=2, C=1),

    # GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=4, min_samples_leaf=2, ccp_alpha=0.003),
    # RandomForestClassifier(max_depth=6, n_estimators=10, max_features=1),
    # MLPClassifier(alpha=1, max_iter=1000),
    # AdaBoostClassifier(),
    # GaussianNB(),
    # QuadraticDiscriminantAnalysis(),
]

# The four lists below are parallel: index i of each refers to the same dataset.

# Full DataFrames after column removal; multi_train_test slices them to export train/test CSVs.
datasets_df = [
    bpic17_logs_with_interventions_df,
    synthetic_logs_with_adaptations_df
]

# (values, classes) tuples as returned by log_pre_process.
datasets = [
    bpic17_logs_with_interventions,
    synthetic_logs_with_adaptations
]

# Names used in printed headers and as the prefix of stored model / log file names.
dataset_names = [
    "bpic17_logs_with_interventions",
    "synthetic_logs_with_adaptations"
]

# Feature column names, passed to export_text when printing the fitted decision tree.
dataset_feature_names = [
    bpic17_logs_with_interventions_column_names,
    synthetic_logs_with_adaptations_column_names
]

# Evaluate every configured classifier on every dataset with 10-fold cross-validation
# (weighted F1) and, for the decision tree, print the tree fitted on the full dataset.
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):

    X, y = ds
    # Flatten the single-column label frame once for reuse below.
    y_flat = y.values.ravel()

    print('---------------------------------------------------------------------')
    print(f'dataset: {dataset_names[ds_cnt]}')

    # iterate over classifiers
    for name, clf in zip(names, classifiers):

        print('------------------------------------------------------')
        print('Classifier:', name)

        is_decision_tree = name == 'Decision Tree'

        # Keep the (possibly scaled) features in a per-classifier variable.
        # Re-assigning X here would hand the scaled matrix to every classifier
        # that follows in the list - including the decision tree, whose printed
        # thresholds would then be in standardized units.
        if is_decision_tree:
            X_clf = X
        else:
            # preprocess dataset, normalize features
            # NOTE(review): the scaler is fitted on the full dataset before
            # cross-validation, so held-out folds influence the scaling
            # statistics; consider a Pipeline if scaled classifiers are enabled.
            X_clf = StandardScaler().fit_transform(X)

        scores = cross_val_score(clf, X_clf, y_flat, cv=10, scoring='f1_weighted')

        print('10-fold cross-validation mean F1:', numpy.mean(scores))

        if is_decision_tree:
            # Fit on the full dataset only to display the resulting rules.
            clf.fit(X_clf, y_flat)
            text_representation = tree.export_text(clf, feature_names=dataset_feature_names[ds_cnt])
            print(text_representation)

---------------------------------------------------------------------
dataset: bpic17_logs_with_interventions
------------------------------------------------------
Classifier: Decision Tree
10-fold cross-validation mean F1: 0.7654498728572802
|--- CreditScore <= 39.50
|   |--- hour <= 13.50
|   |   |--- weekday <= 4.50
|   |   |   |--- class: 1
|   |   |--- weekday >  4.50
|   |   |   |--- class: 0
|   |--- hour >  13.50
|   |   |--- class: 0
|--- CreditScore >  39.50
|   |--- CreditScore <= 40.50
|   |   |--- weekday <= 4.50
|   |   |   |--- hour <= 13.50
|   |   |   |   |--- class: 0
|   |   |   |--- hour >  13.50
|   |   |   |   |--- class: 0
|   |   |--- weekday >  4.50
|   |   |   |--- class: 0
|   |--- CreditScore >  40.50
|   |   |--- class: 1

---------------------------------------------------------------------
dataset: synthetic_logs_with_adaptations
------------------------------------------------------
Classifier: Decision Tree
10-fold cross-validation mean F1: 0.923732418

In [58]:
# Directory (relative to the notebook's working directory) where fitted classifiers are stored as joblib files.
models_folder = './models/'

def store_classifier(classifier, file_path):
    """Persist ``classifier`` with joblib under ``models_folder``.

    ``file_path`` is appended to the module-level ``models_folder`` prefix to
    form the destination path.
    """
    import joblib

    destination = models_folder + file_path
    joblib.dump(classifier, destination)

def load_classifier(file_path):
    """Load a classifier that was stored under ``models_folder``.

    Parameters
    ----------
    file_path : str
        File name relative to the module-level ``models_folder`` prefix
        (the same value that was given to ``store_classifier``).

    Returns
    -------
    object
        The object deserialized by ``joblib.load``.
    """
    from joblib import load

    # joblib.load expects the file name as its first argument. Passing an
    # estimator object there (as the global `clf` was before) shifts the real
    # path into the `mmap_mode` slot and the call fails.
    # SECURITY: joblib files are pickle-based - only load files from a trusted source.
    classifier = load(models_folder + file_path)
    return classifier

def store_logs(dataframe, path):
    """Write ``dataframe`` as CSV to ``logs_folder + path``.

    All columns are written in their current order and the row index is
    omitted from the file.
    """
    destination = logs_folder + path
    dataframe.to_csv(
        destination,
        columns=dataframe.columns,
        index=False,
    )


def multi_train_test(train_percentages, test_train_ratio):
    """Train and score a decision tree on leading row prefixes of each dataset.

    For every dataset in the module-level ``datasets`` list and every entry of
    ``train_percentages``, the first ``train_percentage`` percent of the rows
    form the training set. The rows directly after it form the test set,
    whose requested size is ``test_train_ratio`` times the training size.

    For each split the weighted F1 score is printed, the fitted classifier is
    saved through ``store_classifier`` and the train / test rows are exported
    through ``store_logs``.

    Parameters
    ----------
    train_percentages : iterable of numbers
        Training-set sizes as percentages (0-100) of the dataset length.
    test_train_ratio : float
        Size of the test set relative to the training set.

    Notes
    -----
    If the requested test window runs past the end of the dataset it is
    truncated and a warning is printed. Splits whose train or test set would
    be empty are skipped with a message instead of failing inside the
    estimator.
    """
    print('Multi train / test')
    classifier = DecisionTreeClassifier(max_depth=4, min_samples_leaf=2, ccp_alpha=0.003)

    for ds_cnt, dataset in enumerate(datasets):
        dataset_name = dataset_names[ds_cnt]
        print(f'\n dataset: {dataset_name}')
        values, classes = dataset
        dataset_df = datasets_df[ds_cnt]
        dataset_length = len(values)

        for train_percentage in train_percentages:
            print(f'train percentage= {train_percentage}')
            split_point_train = int(train_percentage / 100 * dataset_length)
            split_point_test = int(split_point_train + test_train_ratio * split_point_train)

            # Slicing past the end would silently shorten the test window;
            # clamp explicitly and make the truncation visible.
            if split_point_test > dataset_length:
                print(f'warning: requested test window ends at row {split_point_test} '
                      f'but the dataset has only {dataset_length} rows; test set truncated')
                split_point_test = dataset_length

            # An empty train or test set cannot be fitted / scored.
            if split_point_train <= 0 or split_point_test <= split_point_train:
                print('skipped: empty train or test set for this split')
                continue

            train_set_values = values[:split_point_train]
            test_set_values = values[split_point_train:split_point_test]

            # Flatten the one-column label frame to 1-D, consistent with the
            # labels handed to cross_val_score in the comparison cell.
            train_set_classes = numpy.ravel(classes[:split_point_train])
            test_set_classes = numpy.ravel(classes[split_point_train:split_point_test])

            classifier.fit(train_set_values, train_set_classes)
            y_pred = classifier.predict(test_set_values)
            f1 = f1_score(y_true=test_set_classes, y_pred=y_pred, average='weighted')
            print(f'f1 score= {f1}')

            # All artefacts of one split share the "<dataset>_<percentage>_percent" prefix.
            file_prefix = f'{dataset_name}_{train_percentage}_percent'

            store_classifier(classifier, file_prefix + '.joblib')
            store_logs(dataset_df[:split_point_train], file_prefix + '_train.csv')
            store_logs(dataset_df[split_point_train:split_point_test], file_prefix + '_test.csv')


# Train on the first 20 %, 40 % and 66 % of each dataset's rows; each test set is
# half the size of its training set and consists of the rows directly after it.
train_percentages = [20, 40, 66]
test_train_ratio = 0.5
multi_train_test(train_percentages, test_train_ratio)

Multi train / test

 dataset: bpic17_logs_with_interventions
train percentage= 20
f1 score= 0.7850651108672028
train percentage= 40
f1 score= 0.7661996207016786
train percentage= 66
f1 score= 0.7583210230912191

 dataset: synthetic_logs_with_adaptations
train percentage= 20
f1 score= 0.8971681415929202
train percentage= 40
f1 score= 0.9351336055619529
train percentage= 66
f1 score= 0.9288397552860364
