In [45]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import plot_importance
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import lightgbm as lgb


In [14]:
# Load the BPIC12 event log, keep the four columns used below, and add a label column.
# label is the string '1' on rows whose Activity is 'O_ACCEPTED-COMPLETE', '0' on all others.

TARGET_ACTIVITY = 'O_ACCEPTED-COMPLETE'
columns_kept = ['Case ID', 'Activity', 'Resource', 'Complete Timestamp']

df = pd.read_csv('BPIC12.csv')[columns_kept]
# The label stays a string here; the check cell further down compares it against '1'.
df['label'] = (df['Activity'] == TARGET_ACTIVITY).map({True: '1', False: '0'})
df

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,label
0,173688,A_SUBMITTED-COMPLETE,112.0,2011/10/01 07:38:44.546,0
1,173688,A_PARTLYSUBMITTED-COMPLETE,112.0,2011/10/01 07:38:44.880,0
2,173688,A_PREACCEPTED-COMPLETE,112.0,2011/10/01 07:39:37.906,0
3,173688,W_Completeren aanvraag-SCHEDULE,112.0,2011/10/01 07:39:38.875,0
4,173688,W_Completeren aanvraag-START,,2011/10/01 18:36:46.437,0
...,...,...,...,...,...
262195,214376,A_PARTLYSUBMITTED-COMPLETE,112.0,2012/03/01 07:51:17.423,0
262196,214376,W_Afhandelen leads-SCHEDULE,112.0,2012/03/01 07:52:01.287,0
262197,214376,W_Afhandelen leads-START,11169.0,2012/03/01 17:26:46.736,0
262198,214376,A_DECLINED-COMPLETE,11169.0,2012/03/01 17:27:37.118,0


In [17]:
# error check: how many rows carry the label '1'
is_positive = df['label'] == '1'
count = int(is_positive.sum())
print(count)

2243


In [3]:
# Experiment grid: drop_act, bucketing, encoding, model
#
# drop_act  = {2, 4, 6, 8}
# bucketing = {1, 2 * mean_trace_length}
# encoding  = {'aggregate', 'index'}
# model     = {'DT', 'RF', 'XGB', 'LGBM'}

# The previous expression `2*'mean_trace_length'` repeated the string literal
# ('mean_trace_lengthmean_trace_length') instead of doubling a number.
# mean_trace_length is taken here as the mean number of events per 'Case ID'
# in the loaded log -- TODO confirm this is the intended definition, and whether
# the resulting bucketing value should be rounded to an integer.
mean_trace_length = df.groupby('Case ID').size().mean()

parameter = {
    'drop_act': [2, 4, 6, 8],
    'bucketing': [1, 2 * mean_trace_length],
    'encoding': ['aggregate', 'index'],
    'model': ['DT', 'RF', 'XGB', 'LGBM'],
}

In [4]:
# train test split

# Features are every column except the label; the label is kept as its own Series.
X = df.drop('label', axis=1)
y = df['label']

# NOTE(review): `label` is computed directly from `Activity`, and `Activity` is
# one-hot encoded into the features below, so a model can read the label straight
# off the feature matrix. The 1.0 scores recorded for the classifiers in this
# notebook are consistent with that leakage rather than with real predictive power.
# TODO: build features that do not contain the labelled event itself.

# one-hot encoding
# train and test set for decision tree
df_train = pd.get_dummies(data=X, columns=['Activity'], prefix='Activity')

# datetime to float for DT: days elapsed since the earliest timestamp in the log
timestamps = pd.to_datetime(df_train['Complete Timestamp'], format='%Y/%m/%d %H:%M:%S.%f')
df_train['Complete Timestamp'] = (timestamps - timestamps.min()) / np.timedelta64(1, 'D')

# remove NaN for DT (every remaining missing value becomes 0)
df_train = df_train.fillna(0)

# Positive labels are rare (2243 rows in the recorded run), so stratify on y to
# keep the same class ratio in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    df_train, y, test_size=0.2, random_state=2023, stratify=y
)

In [7]:
# machine learning
# DT
# Fit on the training split only. Fitting on the full `df_train` / `y` (as this
# cell did before) shows the tree every test row during training, so the score
# on X_test is not a held-out estimate.
# random_state reuses the seed of the train/test split so reruns give the same tree.
clf_dt = tree.DecisionTreeClassifier(random_state=2023)
clf_dt = clf_dt.fit(X_train, y_train)

# Make predictions on the test data
predictions = clf_dt.predict(X_test)

# Evaluate the model's performance from the predictions already computed
# (same value as clf_dt.score(X_test, y_test), without predicting a second time)
accuracy = accuracy_score(y_test, predictions)
print('Accuracy:', accuracy)

Accuracy: 1.0


In [6]:
# RF

# Create a Random Forest classifier.
# random_state reuses the seed of the train/test split; without a seed the
# forest differs on every run and the printed score is not reproducible.
clf_rf = RandomForestClassifier(random_state=2023)

# Train the classifier on the training data
clf_rf.fit(X_train, y_train)

# Make predictions on the test data
predictions = clf_rf.predict(X_test)

# Evaluate the model's performance from the predictions already computed
# (same value as clf_rf.score(X_test, y_test), without predicting a second time)
accuracy = accuracy_score(y_test, predictions)
print('Accuracy:', accuracy)

Accuracy: 1.0


In [37]:
# XGB
# Replace the label values in `y` with the integer codes produced by a LabelEncoder.
# `le` is kept so the fitted encoder stays available after this cell.
le = LabelEncoder()
encoded_labels = le.fit_transform(y)
y = encoded_labels

In [39]:
# The classifier is bound to `clf_xgb` (matching `clf_dt` / `clf_rf`).
# The name `xgb` already refers to the xgboost module (`import xgboost as xgb`),
# and binding the model to it hid the module for the rest of the session.
clf_xgb = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=4)

# Split again because `y` now holds the encoded labels; same seed and test size
# as the earlier split. Stratify on y because positive labels are rare
# (2243 rows in the recorded run).
X_train, X_test, y_train, y_test = train_test_split(
    df_train, y, test_size=0.2, random_state=2023, stratify=y
)

clf_xgb.fit(X_train, y_train)
xgb_pred = clf_xgb.predict(X_test)

In [44]:
# Score the XGB predictions against the test labels with four metrics.
accuracy, precision, recall, f1 = (
    metric(y_test, xgb_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Print one line per metric, in the same order they were computed.
for caption, value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 Score:', f1),
):
    print(caption, value)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [51]:
# LGBM
# Fit a LightGBM classifier on whatever X_train / y_train are currently bound,
# predict on X_test, and show the accuracy as the cell's output.

lgbm_params = {'num_leaves': 31, 'objective': 'binary'}
lgb_clf = lgb.LGBMClassifier(**lgbm_params)
lgb_clf.fit(X_train, y_train)

y_pred = lgb_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0