In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from scipy.sparse import csr_matrix
from sklearn.svm import SVR
from tqdm import tqdm 
from sklearn.metrics import mean_absolute_error

pd.set_option('display.max_columns', None)

In [3]:
path = r"C:\Users\boris\OneDrive\Bureaublad\Datascience and Entrepreneurship\Year 2\Semester 1\Real-Time process mining\prefix data"
df_train = pd.read_csv(path + "\prefix_approved_train.csv").fillna(0)
df_test = pd.read_csv(path + "\prefix_cancelled_test.csv").fillna(0)

In [4]:
def feature_engineering(df_train):
    df_train['first_timestamp'] = pd.to_datetime(df_train['first_timestamp'])

    # day
    df_train['Day'] = df_train['first_timestamp'].dt.day
    # month
    df_train['Month'] = df_train['first_timestamp'].dt.month
    # hour
    df_train['Start_hour'] = df_train['first_timestamp'].dt.hour
    # minute
    df_train['Start_minute'] = df_train['first_timestamp'].dt.minute
    # second
    df_train['Start_second'] = df_train['first_timestamp'].dt.second
    # Monday is 0 and Sunday is 6
    df_train['Start_weekday'] = df_train['first_timestamp'].dt.weekday
    # week of the year
    df_train['Start_week_of_year'] = df_train['first_timestamp'].dt.week

    df_train.drop('first_timestamp', inplace=True, axis=1)

    return df_train

In [32]:
def bucketed_testing(pipe, X_train, y_train, X_test, y_test, progression):
    X_train = X_train[X_train['case_progression'] <= progression].drop(columns = ["case_progression"])
    y_train = y_train[y_train.index.isin(X_train.index)]
    
    X_test = X_test[X_test['case_progression'] <= progression].drop(columns = ["case_progression"])
    y_test = y_test[y_test.index.isin(X_test.index)]


    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)

    score = r2_score(y_pred=y_pred, y_true=y_test)

    return score

In [6]:
df_train = feature_engineering(df_train)
df_test = feature_engineering(df_test)

  df_train['Start_week_of_year'] = df_train['first_timestamp'].dt.week
  df_train['Start_week_of_year'] = df_train['first_timestamp'].dt.week


In [27]:
col_dict = {}
dtypes_list = dict(df_train.dtypes)
for pair in dtypes_list.items():
    key = str(pair[1])
    col_dict[key] = col_dict.get(key, []) + [pair[0]]

col_dict["object"].pop()
col_dict['object']

['Accepted', 'Selected', 'case:LoanGoal', 'case:ApplicationType']

In [19]:
X_train = df_train.drop(columns=['trace_duration', "case_outcome", "case_progression"], inplace=False)
y_train = df_train['trace_duration']

X_test = df_test.drop(columns=['trace_duration', "case_outcome", "case_progression"], inplace=False)
y_test = df_test['trace_duration']

----- First algorithm -----

In [28]:
categorical_pipe = make_pipeline(OneHotEncoder())

# Using categorical pipe for features A,B,C, numeric pipe otherwise
preprocessor = make_column_transformer((categorical_pipe, col_dict['object']), remainder='passthrough')

# Combine with learning algorithm in another pipeline

pipe_rf = make_pipeline(preprocessor, StandardScaler(), RandomForestRegressor())
pipe_rf.named_steps

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('onehotencoder',
                                                   OneHotEncoder())]),
                                  ['Accepted', 'Selected', 'case:LoanGoal',
                                   'case:ApplicationType'])]),
 'standardscaler': StandardScaler(),
 'randomforestregressor': RandomForestRegressor()}

In [40]:
n_trees = np.arange(50, 500, 50)
max_features = ["sqrt", "log2", 1]

gridsearch_dict_forrest = {}

for n in tqdm(n_trees):
    for method in tqdm(max_features):
        pipe_rf.set_params(randomforestregressor__n_estimators = n, randomforestregressor__max_features = method)
        pipe_rf.fit(X_train, y_train)
        y_pred_rf = pipe_rf.predict(X_test)
        score = r2_score(y_pred=y_pred_rf, y_true=y_test)
        gridsearch_dict_forrest[(n, method)] = score

100%|██████████| 3/3 [01:01<00:00, 20.39s/it]
100%|██████████| 3/3 [01:42<00:00, 34.11s/it]
100%|██████████| 3/3 [02:50<00:00, 56.90s/it]
100%|██████████| 3/3 [03:38<00:00, 72.80s/it]]
100%|██████████| 3/3 [04:48<00:00, 96.29s/it]]
100%|██████████| 3/3 [06:03<00:00, 121.32s/it]
100%|██████████| 3/3 [06:49<00:00, 136.44s/it]
100%|██████████| 3/3 [08:00<00:00, 160.16s/it]
100%|██████████| 3/3 [12:28<00:00, 249.60s/it]
100%|██████████| 9/9 [47:24<00:00, 316.01s/it]


In [41]:
print('RANDOM FOREST REGRESSOR')
print("best parameters were " , max(gridsearch_dict_forrest, key=gridsearch_dict_forrest.get), ' with an R2 of ', max(gridsearch_dict_forrest.values()))

RANDOM FOREST REGRESSOR
best parameters were  (250, 'sqrt')  with an R2 of  0.601626117630129


In [21]:
pipe_rf.set_params(randomforestregressor__n_estimators = 450, randomforestregressor__max_features = "sqrt")
pipe_rf.fit(X_train, y_train)
y_pred_rf = pipe_rf.predict(X_test)

In [22]:
mean_abs_error_rf = (mean_absolute_error(y_true = y_test, y_pred = y_pred_rf) / (60*60*24))
R2_rf = r2_score(y_pred = y_pred_rf, y_true = y_test)
print("means absolute error is: ", mean_abs_error_rf, " days", "\n\n", "R2 score is: ", R2_rf)

means absolute error is:  2.5540535196788494  days 

 R2 score is:  0.59577885914441


In [25]:
#initialize again to include case progression which is filtered on in the bucketed testing function
X_train = df_train.drop(columns=['trace_duration', "case_outcome"], inplace=False)
y_train = df_train['trace_duration']

X_test = df_test.drop(columns=['trace_duration', "case_outcome"], inplace=False)
y_test = df_test['trace_duration']

In [33]:
import numpy as np

pipe_rf.set_params(randomforestregressor__n_estimators = 250, randomforestregressor__max_features = "sqrt")
prog = np.arange(0.0, 1, 0.05)

# store the results for different min_sim
res_prog_rf = {}
for n in prog:

    res_prog_rf[n] = bucketed_testing(pipe_rf, X_train, y_train, X_test, y_test, n)

KeyboardInterrupt: 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme()

df_progression_results_rf = pd.DataFrame.from_dict(res_prog_rf, orient='index').reset_index().rename(
    columns={"index": "case_progression", 0: "R2"})

sns.lineplot(
    data = df_progression_results_rf,
    x = "case_progression", y = "R2", markers=True, dashes=False
).set(title = "Predictive performanc over case duration")

------ Second algorithm -----

In [7]:
categorical_pipe = make_pipeline(OneHotEncoder())

# Using categorical pipe for features A,B,C, numeric pipe otherwise
preprocessor = make_column_transformer((categorical_pipe, col_dict['object']), remainder='passthrough')

# Combine with learning algorithm in another pipeline

pipe_svr = make_pipeline(preprocessor, StandardScaler(), SVR(cache_size = 1000))
pipe_svr.named_steps

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('onehotencoder',
                                                   OneHotEncoder())]),
                                  ['Accepted', 'Selected', 'case:LoanGoal',
                                   'case:ApplicationType'])]),
 'standardscaler': StandardScaler(),
 'svr': SVR(cache_size=1000)}

In [8]:
regularization = np.arange(0.5, 1, 0.1)
kernels = ["poly", "rbf", "sigmoid"]

gridsearch_dict_SVR = {} 

for penalty in tqdm(regularization):
    for kernel in tqdm(kernels):
        pipe_svr.set_params(svr__C = penalty, svr__kernel = kernel)
        pipe_svr.fit(X_train, y_train)
        y_pred_svr = pipe_svr.predict(X_test)
        score = r2_score(y_pred=y_pred_svr, y_true=y_test)
        gridsearch_dict_SVR[(penalty, kernel)] = score

100%|██████████| 3/3 [3:52:02<00:00, 4640.73s/it]
100%|██████████| 3/3 [3:43:03<00:00, 4461.25s/it]/it]
100%|██████████| 3/3 [3:43:12<00:00, 4464.21s/it]/it]
100%|██████████| 3/3 [3:43:00<00:00, 4460.05s/it]/it]
100%|██████████| 3/3 [3:00:20<00:00, 3606.88s/it]/it]
100%|██████████| 5/5 [18:01:39<00:00, 12979.88s/it]  


In [11]:
gridsearch_dict_SVR

{(0.5, 'poly'): -0.08825349743945732,
 (0.5, 'rbf'): -0.0884056182880113,
 (0.5, 'sigmoid'): -0.08706205018329505,
 (0.6, 'poly'): -0.09217156864682208,
 (0.6, 'rbf'): -0.08783426656281623,
 (0.6, 'sigmoid'): -0.086241562606167,
 (0.7, 'poly'): -0.09757957065037481,
 (0.7, 'rbf'): -0.08728515614193477,
 (0.7, 'sigmoid'): -0.08541347035376567,
 (0.7999999999999999, 'poly'): -0.1075448952106215,
 (0.7999999999999999, 'rbf'): -0.08674882910251913,
 (0.7999999999999999, 'sigmoid'): -0.08460601734522033,
 (0.8999999999999999, 'poly'): -0.11950028905348242,
 (0.8999999999999999, 'rbf'): -0.08618529151780452,
 (0.8999999999999999, 'sigmoid'): -0.08380953517085321}

In [9]:
print('SUPPORT VECTOR REGRESSOR')
print("best parameters were " , max(gridsearch_dict_SVR, key=gridsearch_dict_SVR.get), ' with an R2 of ', max(gridsearch_dict_SVR.values()))

SUPPORT VECTOR REGRESSOR
best parameters were  (0.8999999999999999, 'sigmoid')  with an R2 of  -0.08380953517085321


In [None]:
pipe_svr.set_params(svr__C = 0.9, svr__kernel = "sigmoid", svr__cache_size = 2000)
pipe_svr.fit(X_train, y_train)
y_pred_svr = pipe_svr.predict(X_test)

In [None]:
mean_abs_error_svr = (mean_absolute_error(y_true = y_test, y_pred = y_pred_svr) / (60*60*24))
R2_svr = r2_score(y_pred = y_pred_svr, y_true = y_test)
print("means absolute error is: ", mean_abs_error_svr, " days", "\n\n", "R2 score is: ", R2_svr)

In [None]:
#initialize again to include case progression which is filtered on in the bucketed testing function
X_train = df_train.drop(columns=['trace_duration', "case_outcome"], inplace=False)
y_train = df_train['trace_duration']

X_test = df_test.drop(columns=['trace_duration', "case_outcome"], inplace=False)
y_test = df_test['trace_duration']

In [None]:
import numpy as np

pipe_svr.set_params(svr__C = 0.9, svr__kernel = "sigmoid")
prog = np.arange(0.0, 1, 0.05)

# store the results for different min_sim
res_prog_svr = {}
for n in tqdm(prog):

    res_prog_svr[n] = bucketed_testing(pipe_svr, X_train, y_train, X_test, y_test, n)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme()

df_progression_results_svr = pd.DataFrame.from_dict(res_prog_svr, orient='index').reset_index().rename(
    columns={"index": "case_progression", 0: "R2"})

sns.lineplot(
    data = df_progression_results_svr,
    x = "case_progression", y = "R2", markers=True, dashes=False
).set(title = "Predictive performanc over case duration")