In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import time
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import permutations
import json, os
from collections import OrderedDict
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb

from matplotlib import pyplot

In [2]:
# pd.set_option('display.max_columns', None)

df = pd.read_csv('../../BPIC_Classification/processed_dataset/P_indexbased_one_17.csv')
df.head()

Unnamed: 0,RequestedAmount_0,FirstWithdrawalAmount_0,MonthlyCost_0,NumberOfTerms_0,OfferedAmount_0,CreditScore_0,timesincelastevent_0,timesincecasestart_0,timesincemidnight_0,event_nr_0,...,Selected_7_True,Selected_7_missing,Selected_8_False,Selected_8_True,Selected_8_missing,Selected_9_False,Selected_9_True,Selected_9_missing,time:timestamp,label
0,5000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,957,1,...,0,1,0,0,1,1,0,0,2016-08-09 12:44:23.521545472,1
1,12500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,614,1,...,0,1,0,1,0,0,1,0,2016-06-06 15:13:00.324519936,0
2,45000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,956,1,...,0,0,1,0,0,1,0,0,2016-04-10 04:53:30.241722112,1
3,5000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,999,1,...,0,1,0,0,1,0,0,1,2016-09-19 18:51:39.501400064,1
4,37500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,777,1,...,0,1,0,1,0,0,1,0,2016-03-24 10:18:13.238510080,0


In [3]:
df['label'].value_counts()

1    18621
0    12792
Name: label, dtype: int64

In [4]:
class AccuracyScore:

    def __init__(self, X_train, y_train, X_test, y_test):
        self.self = self
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

    def preprocessing(self):
        self['time:timestamp'] = pd.to_datetime(self['time:timestamp'], format='%Y/%m/%d %H:%M:%S.%f')
        self['time:timestamp'] = (self['time:timestamp'] - self['time:timestamp'].min()) / np.timedelta64(1,'M')
        
        return self
    
    def column_uniquify(self):
        cols = pd.Series(self.columns)
        dup_count = cols.value_counts()
        for dup in cols[cols.duplicated()].unique():
            cols[cols[cols == dup].index.values.tolist()] = [dup + str(i) for i in range(1, dup_count[dup]+1)]
        self.columns = cols
        return self.columns

    def data_split(self):
        X = df.drop('label', axis=1)
        y = df['label']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2023, shuffle=True)
        
        le = LabelEncoder()
        y_train = le.fit_transform(y_train)
        
        return X_train, y_train, X_test, y_test        
    
    def imbalance(self, X_train, y_train):
        sm = SMOTE(random_state=1)
        X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
        return X_train_res, y_train_res
    
    def DT(self, X_train, y_train, X_test, y_test):
        clf_dt = tree.DecisionTreeClassifier(max_depth=2, random_state=2023, max_features='log2')
        clf_dt = clf_dt.fit(X_train, y_train)
        predictions = clf_dt.predict(X_test)
        accuracy = clf_dt.score(X_test, y_test)
        print('DT Accuracy:', accuracy)
        print('DT AUC Score {}'.format(roc_auc_score(y_test, predictions)))

    def RF(self, X_train, y_train, X_test, y_test):
        clf_rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=2023)
        clf_rf = clf_rf.fit(X_train, y_train)
        predictions = clf_rf.predict(X_test)
        accuracy = clf_rf.score(X_test, y_test)
        print('RF Accuracy:', accuracy)
        print('RF AUC Score {}'.format(roc_auc_score(y_test, predictions)))
    
    def XGB(self, X_train, y_train, X_test, y_test):
        xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=2023)
        xgb.fit(X_train, y_train)
        xgb_pred = xgb.predict(X_test)

        accuracy = accuracy_score(y_test, xgb_pred)
        precision = precision_score(y_test, xgb_pred)
        recall = recall_score(y_test, xgb_pred)
        f1 = f1_score(y_test, xgb_pred)

        print('XGB Accuracy:', accuracy)
        print('XGB AUC Score {}'.format(roc_auc_score(y_test, xgb_pred)))
    
    def LGBM(self, X_train, y_train, X_test, y_test):
        clf_lgbm = LGBMClassifier(random_state=2023)
        clf_lgbm = clf_lgbm.fit(X_train, y_train)
        predictions = clf_lgbm.predict(X_test)
        accuracy = clf_lgbm.score(X_test, y_test)
        print('LGBM Accuracy:', accuracy)
        print('LGBM AUC Score {}'.format(roc_auc_score(y_test, predictions)))

In [5]:
df = AccuracyScore.preprocessing(df)
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
df.columns = AccuracyScore.column_uniquify(df)
X_train, y_train, X_test, y_test = AccuracyScore.data_split(df)
# X_train, y_train = AccuracyScore.imbalance(df, X_train, y_train)

In [6]:
AccuracyScore.DT(df, X_train, y_train, X_test, y_test)
AccuracyScore.RF(df, X_train, y_train, X_test, y_test)
AccuracyScore.XGB(df, X_train, y_train, X_test, y_test)
AccuracyScore.LGBM(df, X_train, y_train, X_test, y_test)

DT Accuracy: 0.5999575551782682
DT AUC Score 0.5
RF Accuracy: 0.7583828522920204
RF AUC Score 0.7084843574512164
XGB Accuracy: 0.8642826825127334
XGB AUC Score 0.8434082957160912
LGBM Accuracy: 0.8612054329371817
LGBM AUC Score 0.8415950211066272
