In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.utils import resample
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

## Dataset

In [3]:
df_processed = pd.read_csv("data_ml.csv", index_col = 0)
df_processed.head()

Unnamed: 0_level_0,Customer Lifetime Value,Response,Coverage,Education,Gender,Income,Monthly Premium Auto,Number of Policies,Total Claim Amount,Vehicle Size,...,Sales Channel_Agent,Sales Channel_Branch,Sales Channel_Call Center,Sales Channel_Web,Vehicle Class_Four-Door Car,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BU79786,2763.519279,0,0,2,1,56274,69,1,384.811147,1,...,1,0,0,0,0,0,0,0,0,1
QZ44356,6979.535903,0,1,2,1,0,94,8,1131.464935,1,...,1,0,0,0,1,0,0,0,0,0
AI49188,12887.43165,0,2,2,1,48767,108,2,566.472247,1,...,1,0,0,0,0,0,0,0,0,1
WW63253,7645.861827,0,0,2,0,0,106,7,529.881344,1,...,0,0,1,0,0,0,0,1,0,0
HB64268,2813.692575,0,0,2,0,43836,73,1,138.130879,1,...,1,0,0,0,1,0,0,0,0,0


In [144]:
df_processed.columns

Index(['Customer Lifetime Value', 'Response', 'Coverage', 'Education',
       'Gender', 'Income', 'Monthly Premium Auto', 'Number of Policies',
       'Total Claim Amount', 'Vehicle Size', 'State_Arizona',
       'State_California', 'State_Nevada', 'State_Oregon', 'State_Washington',
       'EmploymentStatus_Disabled', 'EmploymentStatus_Employed',
       'EmploymentStatus_Medical Leave', 'EmploymentStatus_Retired',
       'EmploymentStatus_Unemployed', 'Location Code_Rural',
       'Location Code_Suburban', 'Location Code_Urban',
       'Marital Status_Divorced', 'Marital Status_Married',
       'Marital Status_Single', 'Policy_Corporate L1', 'Policy_Corporate L2',
       'Policy_Corporate L3', 'Policy_Personal L1', 'Policy_Personal L2',
       'Policy_Personal L3', 'Policy_Special L1', 'Policy_Special L2',
       'Policy_Special L3', 'Renew Offer Type_Offer1',
       'Renew Offer Type_Offer2', 'Renew Offer Type_Offer3',
       'Renew Offer Type_Offer4', 'Sales Channel_Agent',
       '

# Splitting Data

In [4]:
x = df_processed.drop(columns = ['Response'] , axis = 1)
y = df_processed['Response']

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, stratify= y, random_state = 42)

# Handling Imbalance Data

In [6]:
y_train.value_counts()

0    6194
1    1042
Name: Response, dtype: int64

#### Random Over Sampling

In [7]:
df_train = pd.concat([x_train, y_train], axis=1)
not_renewal = df_train[df_train['Response'] == 0]
renewal = df_train[df_train['Response'] == 1]

renewal_oversample = resample(renewal, replace=True, n_samples = len(not_renewal), random_state = 42)
df_OverSampled = pd.concat([not_renewal, renewal_oversample])
df_OverSampled['Response'].value_counts()

1    6194
0    6194
Name: Response, dtype: int64

In [8]:
x_train_os = df_OverSampled.drop(columns = ['Response'])
y_train_os = df_OverSampled['Response']

#### Smote

In [9]:
sm = SMOTE(random_state=42)
x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)

x = pd.DataFrame(data = x_train_sm, columns = x_train.columns)
y = pd.DataFrame(data = y_train_sm, columns = ['Response'])
df_smote = x.join(y)
# df_smote = pd.concat([x_train_sm, y_train_sm], axis = 1)
df_smote['Response'].value_counts()

1    6194
0    6194
Name: Response, dtype: int64

In [10]:
columns_continuous = ['Customer Lifetime Value', 'Income', 'Monthly Premium Auto', 'Number of Policies', 'Total Claim Amount']

In [11]:
std_scale = StandardScaler()
mm_scale = MinMaxScaler()
rb_scale = RobustScaler()

# Tuning Model

## SVM

In [12]:
svm = SVC(max_iter = 300)

In [13]:
param_svm = {'C' : np.arange(0.1, 10),
             'kernel' : ['linear', 'rbf'],
             "gamma" : np.arange(0.01, 10)}

#### Random Over Sampling without Scaling

In [14]:
svm_os = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
svm_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   58.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.8min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=300), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1]),
                                        'gamma': array([0.01, 1.01, 2.01, 3.01, 4.01, 5.01, 6.01, 7.01, 8.01, 9.01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [15]:
svm_os_tuned = svm_os.best_estimator_
pred_train_os = svm_os_tuned.predict(x_train_os)
pred_test_os = svm_os_tuned.predict(x_test)
svm_os_tuned

SVC(C=0.1, gamma=0.01, max_iter=300)

In [16]:
svm_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
svm_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)
svm_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
svm_recall_tuned_test_os = recall_score(y_test, pred_test_os)
svm_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
svm_prec_tuned_test_os = precision_score(y_test, pred_test_os)
svm_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
svm_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [17]:
cm_svm_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_svm_tuned_os = pd.DataFrame(data=cm_svm_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,260,0
Akt 0,1,1548


In [18]:
tp_svm_os = cm_svm_tuned_os['Pred 1'][0]
tn_svm_os = cm_svm_tuned_os['Pred 0'][1]
fp_svm_os = cm_svm_tuned_os['Pred 1'][1]
fn_svm_os = cm_svm_tuned_os['Pred 0'][0]

#### Smote without Scaling

In [19]:
svm_sm = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
svm_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:   52.0s
[Parallel(n_jobs=-1)]: Done 450 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.6min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=300), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1]),
                                        'gamma': array([0.01, 1.01, 2.01, 3.01, 4.01, 5.01, 6.01, 7.01, 8.01, 9.01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [20]:
svm_sm_tuned = svm_sm.best_estimator_
pred_train_sm = svm_sm_tuned.predict(x_train_sm)
pred_test_sm = svm_sm_tuned.predict(x_test)
svm_sm_tuned

SVC(C=0.1, gamma=1.01, max_iter=300)

In [21]:
svm_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
svm_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)
svm_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
svm_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)
svm_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
svm_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)
svm_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
svm_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [22]:
cm_svm_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_svm_tuned_sm = pd.DataFrame(data=cm_svm_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,260,0
Akt 0,1526,23


In [23]:
tp_svm_sm = cm_svm_tuned_sm['Pred 1'][0]
tn_svm_sm = cm_svm_tuned_sm['Pred 0'][1]
fp_svm_sm = cm_svm_tuned_sm['Pred 1'][1]
fn_svm_sm = cm_svm_tuned_sm['Pred 0'][0]

#### Random Over Sampling with Standard Scaler

In [24]:
svm_os_std = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_os[columns_continuous] = std_scale.fit_transform(x_train_os[columns_continuous])
svm_os_std.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   49.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.4min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=300), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1]),
                                        'gamma': array([0.01, 1.01, 2.01, 3.01, 4.01, 5.01, 6.01, 7.01, 8.01, 9.01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [25]:
svm_os_std_tuned = svm_os_std.best_estimator_
pred_train_os_std = svm_os_std_tuned.predict(x_train_os)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_os_std = svm_os_std_tuned.predict(x_test)
svm_os_std_tuned

SVC(C=0.1, gamma=8.01, max_iter=300)

In [26]:
svm_acc_tuned_train_os_std = accuracy_score(y_train_os, pred_train_os_std)
svm_acc_tuned_test_os_std = accuracy_score(y_test, pred_test_os_std)
svm_recall_tuned_train_os_std = recall_score(y_train_os, pred_train_os_std)
svm_recall_tuned_test_os_std = recall_score(y_test, pred_test_os_std)
svm_prec_tuned_train_os_std = precision_score(y_train_os, pred_train_os_std)
svm_prec_tuned_test_os_std = precision_score(y_test, pred_test_os_std)
svm_f1_tuned_train_os_std = f1_score(y_train_os, pred_train_os_std)
svm_f1_tuned_test_os_std = f1_score(y_test, pred_test_os_std)

In [27]:
cm_svm_tuned_os_std = confusion_matrix(y_test, pred_test_os_std, labels=[1, 0])
cm_svm_tuned_os_std = pd.DataFrame(data=cm_svm_tuned_os_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,243,17
Akt 0,48,1501


In [28]:
tp_svm_os_std = cm_svm_tuned_os_std['Pred 1'][0]
tn_svm_os_std = cm_svm_tuned_os_std['Pred 0'][1]
fp_svm_os_std = cm_svm_tuned_os_std['Pred 1'][1]
fn_svm_os_std = cm_svm_tuned_os_std['Pred 0'][0]

#### Random Over Sampling with MinMax Scaler

In [29]:
svm_os_mm = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_os[columns_continuous] = mm_scale.fit_transform(x_train_os[columns_continuous])
svm_os_mm.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   47.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.4min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=300), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1]),
                                        'gamma': array([0.01, 1.01, 2.01, 3.01, 4.01, 5.01, 6.01, 7.01, 8.01, 9.01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [30]:
svm_os_mm_tuned = svm_os_mm.best_estimator_
pred_train_os_mm = svm_os_mm_tuned.predict(x_train_os)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_os_mm = svm_os_mm_tuned.predict(x_test)
svm_os_mm_tuned

SVC(C=1.1, gamma=5.01, max_iter=300)

In [31]:
svm_acc_tuned_train_os_mm = accuracy_score(y_train_os, pred_train_os_mm)
svm_acc_tuned_test_os_mm = accuracy_score(y_test, pred_test_os_mm)
svm_recall_tuned_train_os_mm = recall_score(y_train_os, pred_train_os_mm)
svm_recall_tuned_test_os_mm = recall_score(y_test, pred_test_os_mm)
svm_prec_tuned_train_os_mm = precision_score(y_train_os, pred_train_os_mm)
svm_prec_tuned_test_os_mm = precision_score(y_test, pred_test_os_mm)
svm_f1_tuned_train_os_mm = f1_score(y_train_os, pred_train_os_mm)
svm_f1_tuned_test_os_mm = f1_score(y_test, pred_test_os_mm)

In [32]:
cm_svm_tuned_os_mm = confusion_matrix(y_test, pred_test_os_mm, labels=[1, 0])
cm_svm_tuned_os_mm = pd.DataFrame(data=cm_svm_tuned_os_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,256,4
Akt 0,1544,5


In [33]:
tp_svm_os_mm = cm_svm_tuned_os_mm['Pred 1'][0]
tn_svm_os_mm = cm_svm_tuned_os_mm['Pred 0'][1]
fp_svm_os_mm = cm_svm_tuned_os_mm['Pred 1'][1]
fn_svm_os_mm = cm_svm_tuned_os_mm['Pred 0'][0]

#### Random Over Sampling with Robust Scaler

In [34]:
svm_os_rb = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_os[columns_continuous] = rb_scale.fit_transform(x_train_os[columns_continuous])
svm_os_rb.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.3min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=300), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1]),
                                        'gamma': array([0.01, 1.01, 2.01, 3.01, 4.01, 5.01, 6.01, 7.01, 8.01, 9.01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [35]:
svm_os_rb_tuned = svm_os_rb.best_estimator_
pred_train_os_rb = svm_os_rb_tuned.predict(x_train_os)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_os_rb = svm_os_rb_tuned.predict(x_test)
svm_os_rb_tuned

SVC(C=0.1, gamma=5.01, max_iter=300)

In [36]:
svm_acc_tuned_train_os_rb = accuracy_score(y_train_os, pred_train_os_rb)
svm_acc_tuned_test_os_rb = accuracy_score(y_test, pred_test_os_rb)
svm_recall_tuned_train_os_rb = recall_score(y_train_os, pred_train_os_rb)
svm_recall_tuned_test_os_rb = recall_score(y_test, pred_test_os_rb)
svm_prec_tuned_train_os_rb = precision_score(y_train_os, pred_train_os_rb)
svm_prec_tuned_test_os_rb = precision_score(y_test, pred_test_os_rb)
svm_f1_tuned_train_os_rb = f1_score(y_train_os, pred_train_os_rb)
svm_f1_tuned_test_os_rb = f1_score(y_test, pred_test_os_rb)

In [37]:
cm_svm_tuned_os_rb = confusion_matrix(y_test, pred_test_os_rb, labels=[1, 0])
cm_svm_tuned_os_rb = pd.DataFrame(data=cm_svm_tuned_os_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,243,17
Akt 0,63,1486


In [38]:
tp_svm_os_rb = cm_svm_tuned_os_rb['Pred 1'][0]
tn_svm_os_rb = cm_svm_tuned_os_rb['Pred 0'][1]
fp_svm_os_rb = cm_svm_tuned_os_rb['Pred 1'][1]
fn_svm_os_rb = cm_svm_tuned_os_rb['Pred 0'][0]

#### Smote with Standard Scaler

In [39]:
svm_sm_std = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_sm[columns_continuous] = std_scale.fit_transform(x_train_sm[columns_continuous])
svm_sm_std.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   46.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.3min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=300), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1]),
                                        'gamma': array([0.01, 1.01, 2.01, 3.01, 4.01, 5.01, 6.01, 7.01, 8.01, 9.01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [40]:
svm_sm_std_tuned = svm_sm_std.best_estimator_
pred_train_sm_std = svm_sm_std_tuned.predict(x_train_sm)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_std = svm_sm_std_tuned.predict(x_test)
svm_sm_std_tuned

SVC(C=0.1, gamma=8.01, max_iter=300)

In [41]:
svm_acc_tuned_train_sm_std = accuracy_score(y_train_sm, pred_train_sm_std)
svm_acc_tuned_test_sm_std = accuracy_score(y_test, pred_test_sm_std)
svm_recall_tuned_train_sm_std = recall_score(y_train_sm, pred_train_sm_std)
svm_recall_tuned_test_sm_std = recall_score(y_test, pred_test_sm_std)
svm_prec_tuned_train_sm_std = precision_score(y_train_sm, pred_train_sm_std)
svm_prec_tuned_test_sm_std = precision_score(y_test, pred_test_sm_std)
svm_f1_tuned_train_sm_std = f1_score(y_train_sm, pred_train_sm_std)
svm_f1_tuned_test_sm_std = f1_score(y_test, pred_test_sm_std)

In [42]:
cm_svm_tuned_sm_std = confusion_matrix(y_test, pred_test_sm_std, labels=[1, 0])
cm_svm_tuned_sm_std = pd.DataFrame(data=cm_svm_tuned_sm_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,242,18
Akt 0,658,891


In [43]:
tp_svm_sm_std = cm_svm_tuned_sm_std['Pred 1'][0]
tn_svm_sm_std = cm_svm_tuned_sm_std['Pred 0'][1]
fp_svm_sm_std = cm_svm_tuned_sm_std['Pred 1'][1]
fn_svm_sm_std = cm_svm_tuned_sm_std['Pred 0'][0]

#### Smote with MinMax Scaler

In [44]:
svm_sm_mm = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_sm[columns_continuous] = mm_scale.fit_transform(x_train_sm[columns_continuous])
svm_sm_mm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   46.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.4min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=300), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1]),
                                        'gamma': array([0.01, 1.01, 2.01, 3.01, 4.01, 5.01, 6.01, 7.01, 8.01, 9.01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [45]:
svm_sm_mm_tuned = svm_sm_mm.best_estimator_
pred_train_sm_mm = svm_sm_mm_tuned.predict(x_train_sm)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_mm = svm_sm_mm_tuned.predict(x_test)
svm_sm_mm_tuned

SVC(C=1.1, gamma=7.01, max_iter=300)

In [46]:
svm_acc_tuned_train_sm_mm = accuracy_score(y_train_sm, pred_train_sm_mm)
svm_acc_tuned_test_sm_mm = accuracy_score(y_test, pred_test_sm_mm)
svm_recall_tuned_train_sm_mm = recall_score(y_train_sm, pred_train_sm_mm)
svm_recall_tuned_test_sm_mm = recall_score(y_test, pred_test_sm_mm)
svm_prec_tuned_train_sm_mm = precision_score(y_train_sm, pred_train_sm_mm)
svm_prec_tuned_test_sm_mm = precision_score(y_test, pred_test_sm_mm)
svm_f1_tuned_train_sm_mm = f1_score(y_train_sm, pred_train_sm_mm)
svm_f1_tuned_test_sm_mm = f1_score(y_test, pred_test_sm_mm)

In [47]:
cm_svm_tuned_sm_mm = confusion_matrix(y_test, pred_test_sm_mm, labels=[1, 0])
cm_svm_tuned_sm_mm = pd.DataFrame(data=cm_svm_tuned_sm_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,258,2
Akt 0,1544,5


In [48]:
tp_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 1'][0]
tn_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 0'][1]
fp_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 1'][1]
fn_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 0'][0]

#### Smote with Robust Scaler

In [49]:
svm_sm_rb = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_sm[columns_continuous] = rb_scale.fit_transform(x_train_sm[columns_continuous])
svm_sm_rb.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   46.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.3min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=300), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1]),
                                        'gamma': array([0.01, 1.01, 2.01, 3.01, 4.01, 5.01, 6.01, 7.01, 8.01, 9.01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [50]:
svm_sm_rb_tuned = svm_sm_rb.best_estimator_
pred_train_sm_rb = svm_sm_rb_tuned.predict(x_train_sm)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_rb = svm_sm_rb_tuned.predict(x_test)
svm_sm_rb_tuned

SVC(C=1.1, gamma=1.01, max_iter=300)

In [51]:
svm_acc_tuned_train_sm_rb = accuracy_score(y_train_sm, pred_train_sm_rb)
svm_acc_tuned_test_sm_rb = accuracy_score(y_test, pred_test_sm_rb)
svm_recall_tuned_train_sm_rb = recall_score(y_train_sm, pred_train_sm_rb)
svm_recall_tuned_test_sm_rb = recall_score(y_test, pred_test_sm_rb)
svm_prec_tuned_train_sm_rb = precision_score(y_train_sm, pred_train_sm_rb)
svm_prec_tuned_test_sm_rb = precision_score(y_test, pred_test_sm_rb)
svm_f1_tuned_train_sm_rb = f1_score(y_train_sm, pred_train_sm_rb)
svm_f1_tuned_test_sm_rb = f1_score(y_test, pred_test_sm_rb)

In [52]:
cm_svm_tuned_sm_rb = confusion_matrix(y_test, pred_test_sm_rb, labels=[1, 0])
cm_svm_tuned_sm_rb = pd.DataFrame(data=cm_svm_tuned_sm_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,249,11
Akt 0,959,590


In [53]:
tp_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 1'][0]
tn_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 0'][1]
fp_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 1'][1]
fn_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 0'][0]

## Logistic Regression

In [54]:
logreg = LogisticRegression()

In [55]:
param_logreg = {'C' : np.arange(0.0001, 100),
                'penalty' : ['l1', 'l2', 'elasticnet', None],
                'class_weight': [None, 'weight'],
                'fit_intercept' : [True, False]}

#### Random Over Sampling without Scaling

In [56]:
logreg_os = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
logreg_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 484 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   30.7s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-04, 1.00010e+00, 2.00010e+00, 3.00010e+00, 4.00010e+00,
       5.00010e+00, 6.00010e+00, 7.00010e+00, 8.00010e+00, 9.00010e+00,
       1.00001e+01, 1.10001e+01, 1.20001e+01, 1.30001e+01, 1.40001e+01,
       1.50001e+01, 1.60001e+01, 1.70001e+01, 1.80001e+01, 1.90001e+01,
       2....
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [57]:
logreg_os_tuned = logreg_os.best_estimator_
pred_train_os = logreg_os_tuned.predict(x_train_os)
pred_test_os = logreg_os_tuned.predict(x_test)
logreg_os_tuned

LogisticRegression(C=9.0001, class_weight='weight')

In [58]:
logreg_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
logreg_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)
logreg_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
logreg_recall_tuned_test_os = recall_score(y_test, pred_test_os)
logreg_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
logreg_prec_tuned_test_os = precision_score(y_test, pred_test_os)
logreg_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
logreg_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [59]:
cm_logreg_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_logreg_tuned_os = pd.DataFrame(data=cm_logreg_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,190,70
Akt 0,452,1097


In [60]:
tp_logreg_os = cm_logreg_tuned_os['Pred 1'][0]
tn_logreg_os = cm_logreg_tuned_os['Pred 0'][1]
fp_logreg_os = cm_logreg_tuned_os['Pred 1'][1]
fn_logreg_os = cm_logreg_tuned_os['Pred 0'][0]

#### Smote Sampling without Scaling

In [61]:
logreg_sm = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
logreg_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done 893 out of 900 | elapsed:   28.9s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   29.7s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-04, 1.00010e+00, 2.00010e+00, 3.00010e+00, 4.00010e+00,
       5.00010e+00, 6.00010e+00, 7.00010e+00, 8.00010e+00, 9.00010e+00,
       1.00001e+01, 1.10001e+01, 1.20001e+01, 1.30001e+01, 1.40001e+01,
       1.50001e+01, 1.60001e+01, 1.70001e+01, 1.80001e+01, 1.90001e+01,
       2....
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [62]:
logreg_sm_tuned = logreg_sm.best_estimator_
pred_train_sm = logreg_sm_tuned.predict(x_train_sm)
pred_test_sm = logreg_sm_tuned.predict(x_test)
logreg_sm_tuned

LogisticRegression(C=34.0001)

In [63]:
logreg_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
logreg_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)
logreg_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
logreg_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)
logreg_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
logreg_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)
logreg_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
logreg_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [64]:
cm_logreg_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_logreg_tuned_sm = pd.DataFrame(data=cm_logreg_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,48,212
Akt 0,29,1520


In [65]:
tp_logreg_sm = cm_logreg_tuned_sm['Pred 1'][0]
tn_logreg_sm = cm_logreg_tuned_sm['Pred 0'][1]
fp_logreg_sm = cm_logreg_tuned_sm['Pred 1'][1]
fn_logreg_sm = cm_logreg_tuned_sm['Pred 0'][0]

#### Standard Scaling with Random Over Sampling

In [66]:
logreg_os_std = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_os[columns_continuous] = std_scale.fit_transform(x_train_os[columns_continuous])
logreg_os_std.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 193 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 648 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 893 out of 900 | elapsed:   27.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   27.9s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-04, 1.00010e+00, 2.00010e+00, 3.00010e+00, 4.00010e+00,
       5.00010e+00, 6.00010e+00, 7.00010e+00, 8.00010e+00, 9.00010e+00,
       1.00001e+01, 1.10001e+01, 1.20001e+01, 1.30001e+01, 1.40001e+01,
       1.50001e+01, 1.60001e+01, 1.70001e+01, 1.80001e+01, 1.90001e+01,
       2....
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [67]:
logreg_os_std_tuned = logreg_os_std.best_estimator_
pred_train_os_std = logreg_os_std_tuned.predict(x_train_os)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_os_std = logreg_os_std_tuned.predict(x_test)
logreg_os_std_tuned

LogisticRegression(C=1.0001)

In [68]:
logreg_acc_tuned_train_os_std = accuracy_score(y_train_os, pred_train_os_std)
logreg_acc_tuned_test_os_std = accuracy_score(y_test, pred_test_os_std)
logreg_recall_tuned_train_os_std = recall_score(y_train_os, pred_train_os_std)
logreg_recall_tuned_test_os_std = recall_score(y_test, pred_test_os_std)
logreg_prec_tuned_train_os_std = precision_score(y_train_os, pred_train_os_std)
logreg_prec_tuned_test_os_std = precision_score(y_test, pred_test_os_std)
logreg_f1_tuned_train_os_std = f1_score(y_train_os, pred_train_os_std)
logreg_f1_tuned_test_os_std = f1_score(y_test, pred_test_os_std)

In [69]:
cm_logreg_tuned_os_std = confusion_matrix(y_test, pred_test_os_std, labels=[1, 0])
cm_logreg_tuned_os_std = pd.DataFrame(data=cm_logreg_tuned_os_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,194,66
Akt 0,457,1092


In [70]:
tp_logreg_os_std = cm_logreg_tuned_os_std['Pred 1'][0]
tn_logreg_os_std = cm_logreg_tuned_os_std['Pred 0'][1]
fp_logreg_os_std = cm_logreg_tuned_os_std['Pred 1'][1]
fn_logreg_os_std = cm_logreg_tuned_os_std['Pred 0'][0]

#### MinMax Scaling with Random Over Sampling

In [71]:
logreg_os_mm = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_os[columns_continuous] = mm_scale.fit_transform(x_train_os[columns_continuous])
logreg_os_mm.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   31.2s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-04, 1.00010e+00, 2.00010e+00, 3.00010e+00, 4.00010e+00,
       5.00010e+00, 6.00010e+00, 7.00010e+00, 8.00010e+00, 9.00010e+00,
       1.00001e+01, 1.10001e+01, 1.20001e+01, 1.30001e+01, 1.40001e+01,
       1.50001e+01, 1.60001e+01, 1.70001e+01, 1.80001e+01, 1.90001e+01,
       2....
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [72]:
logreg_os_mm_tuned = logreg_os_mm.best_estimator_
pred_train_os_mm = logreg_os_mm_tuned.predict(x_train_os)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_os_mm = logreg_os_mm_tuned.predict(x_test)
logreg_os_mm_tuned

LogisticRegression(C=6.0001)

In [73]:
logreg_acc_tuned_train_os_mm = accuracy_score(y_train_os, pred_train_os_mm)
logreg_acc_tuned_test_os_mm = accuracy_score(y_test, pred_test_os_mm)
logreg_recall_tuned_train_os_mm = recall_score(y_train_os, pred_train_os_mm)
logreg_recall_tuned_test_os_mm = recall_score(y_test, pred_test_os_mm)
logreg_prec_tuned_train_os_mm = precision_score(y_train_os, pred_train_os_mm)
logreg_prec_tuned_test_os_mm = precision_score(y_test, pred_test_os_mm)
logreg_f1_tuned_train_os_mm = f1_score(y_train_os, pred_train_os_mm)
logreg_f1_tuned_test_os_mm = f1_score(y_test, pred_test_os_mm)

In [74]:
cm_logreg_tuned_os_mm = confusion_matrix(y_test, pred_test_os_mm, labels=[1, 0])
cm_logreg_tuned_os_mm = pd.DataFrame(data=cm_logreg_tuned_os_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,183,77
Akt 0,422,1127


In [75]:
tp_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 1'][0]
tn_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 0'][1]
fp_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 1'][1]
fn_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 0'][0]

#### Robust Scaling with Random Over Sampling

In [76]:
logreg_os_rb = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_os[columns_continuous] = rb_scale.fit_transform(x_train_os[columns_continuous])
logreg_os_rb.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 564 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   29.2s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-04, 1.00010e+00, 2.00010e+00, 3.00010e+00, 4.00010e+00,
       5.00010e+00, 6.00010e+00, 7.00010e+00, 8.00010e+00, 9.00010e+00,
       1.00001e+01, 1.10001e+01, 1.20001e+01, 1.30001e+01, 1.40001e+01,
       1.50001e+01, 1.60001e+01, 1.70001e+01, 1.80001e+01, 1.90001e+01,
       2....
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [77]:
logreg_os_rb_tuned = logreg_os_rb.best_estimator_
pred_train_os_rb = logreg_os_rb_tuned.predict(x_train_os)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_os_rb = logreg_os_rb_tuned.predict(x_test)
logreg_os_rb_tuned

LogisticRegression(C=0.0001, fit_intercept=False)

In [78]:
logreg_acc_tuned_train_os_rb = accuracy_score(y_train_os, pred_train_os_rb)
logreg_acc_tuned_test_os_rb = accuracy_score(y_test, pred_test_os_rb)
logreg_recall_tuned_train_os_rb = recall_score(y_train_os, pred_train_os_rb)
logreg_recall_tuned_test_os_rb = recall_score(y_test, pred_test_os_rb)
logreg_prec_tuned_train_os_rb = precision_score(y_train_os, pred_train_os_rb)
logreg_prec_tuned_test_os_rb = precision_score(y_test, pred_test_os_rb)
logreg_f1_tuned_train_os_rb = f1_score(y_train_os, pred_train_os_rb)
logreg_f1_tuned_test_os_rb = f1_score(y_test, pred_test_os_rb)

In [79]:
cm_logreg_tuned_os_rb = confusion_matrix(y_test, pred_test_os_rb, labels=[1, 0])
cm_logreg_tuned_os_rb = pd.DataFrame(data=cm_logreg_tuned_os_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,213,47
Akt 0,740,809


In [80]:
tp_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 1'][0]
tn_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 0'][1]
fp_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 1'][1]
fn_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 0'][0]

#### Standard Scaling with Smote

In [81]:
logreg_sm_std = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_sm[columns_continuous] = std_scale.fit_transform(x_train_sm[columns_continuous])
logreg_sm_std.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   24.9s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-04, 1.00010e+00, 2.00010e+00, 3.00010e+00, 4.00010e+00,
       5.00010e+00, 6.00010e+00, 7.00010e+00, 8.00010e+00, 9.00010e+00,
       1.00001e+01, 1.10001e+01, 1.20001e+01, 1.30001e+01, 1.40001e+01,
       1.50001e+01, 1.60001e+01, 1.70001e+01, 1.80001e+01, 1.90001e+01,
       2....
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [82]:
logreg_sm_std_tuned = logreg_sm_std.best_estimator_
pred_train_sm_std = logreg_sm_std_tuned.predict(x_train_sm)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_std = logreg_sm_std_tuned.predict(x_test)
logreg_sm_std_tuned

LogisticRegression(C=59.0001, class_weight='weight')

In [83]:
logreg_acc_tuned_train_sm_std = accuracy_score(y_train_sm, pred_train_sm_std)
logreg_acc_tuned_test_sm_std = accuracy_score(y_test, pred_test_sm_std)
logreg_recall_tuned_train_sm_std = recall_score(y_train_sm, pred_train_sm_std)
logreg_recall_tuned_test_sm_std = recall_score(y_test, pred_test_sm_std)
logreg_prec_tuned_train_sm_std = precision_score(y_train_sm, pred_train_sm_std)
logreg_prec_tuned_test_sm_std = precision_score(y_test, pred_test_sm_std)
logreg_f1_tuned_train_sm_std = f1_score(y_train_sm, pred_train_sm_std)
logreg_f1_tuned_test_sm_std = f1_score(y_test, pred_test_sm_std)

In [84]:
cm_logreg_tuned_sm_std = confusion_matrix(y_test, pred_test_sm_std, labels=[1, 0])
cm_logreg_tuned_sm_std = pd.DataFrame(data=cm_logreg_tuned_sm_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,52,208
Akt 0,33,1516


In [85]:
tp_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 1'][0]
tn_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 0'][1]
fp_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 1'][1]
fn_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 0'][0]

#### MinMax Scaling with Smote

In [86]:
logreg_sm_mm = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_sm[columns_continuous] = mm_scale.fit_transform(x_train_sm[columns_continuous])
logreg_sm_mm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 172 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 472 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   27.6s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-04, 1.00010e+00, 2.00010e+00, 3.00010e+00, 4.00010e+00,
       5.00010e+00, 6.00010e+00, 7.00010e+00, 8.00010e+00, 9.00010e+00,
       1.00001e+01, 1.10001e+01, 1.20001e+01, 1.30001e+01, 1.40001e+01,
       1.50001e+01, 1.60001e+01, 1.70001e+01, 1.80001e+01, 1.90001e+01,
       2....
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [87]:
logreg_sm_mm_tuned = logreg_sm_mm.best_estimator_
pred_train_sm_mm = logreg_sm_mm_tuned.predict(x_train_sm)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_mm = logreg_sm_mm_tuned.predict(x_test)
logreg_sm_mm_tuned

LogisticRegression(C=91.0001, class_weight='weight')

In [88]:
logreg_acc_tuned_train_sm_mm = accuracy_score(y_train_sm, pred_train_sm_mm)
logreg_acc_tuned_test_sm_mm = accuracy_score(y_test, pred_test_sm_mm)
logreg_recall_tuned_train_sm_mm = recall_score(y_train_sm, pred_train_sm_mm)
logreg_recall_tuned_test_sm_mm = recall_score(y_test, pred_test_sm_mm)
logreg_prec_tuned_train_sm_mm = precision_score(y_train_sm, pred_train_sm_mm)
logreg_prec_tuned_test_sm_mm = precision_score(y_test, pred_test_sm_mm)
logreg_f1_tuned_train_sm_mm = f1_score(y_train_sm, pred_train_sm_mm)
logreg_f1_tuned_test_sm_mm = f1_score(y_test, pred_test_sm_mm)

In [89]:
cm_logreg_tuned_sm_mm = confusion_matrix(y_test, pred_test_sm_mm, labels=[1, 0])
cm_logreg_tuned_sm_mm = pd.DataFrame(data=cm_logreg_tuned_sm_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,46,214
Akt 0,24,1525


In [90]:
tp_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 1'][0]
tn_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 0'][1]
fp_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 1'][1]
fn_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 0'][0]

#### Robust Scaling with Smote

In [91]:
logreg_sm_rb = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_sm[columns_continuous] = rb_scale.fit_transform(x_train_sm[columns_continuous])
logreg_sm_rb.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 728 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   28.8s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-04, 1.00010e+00, 2.00010e+00, 3.00010e+00, 4.00010e+00,
       5.00010e+00, 6.00010e+00, 7.00010e+00, 8.00010e+00, 9.00010e+00,
       1.00001e+01, 1.10001e+01, 1.20001e+01, 1.30001e+01, 1.40001e+01,
       1.50001e+01, 1.60001e+01, 1.70001e+01, 1.80001e+01, 1.90001e+01,
       2....
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [92]:
logreg_sm_rb_tuned = logreg_sm_rb.best_estimator_
pred_train_sm_rb = logreg_sm_rb_tuned.predict(x_train_sm)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_rb = logreg_sm_rb_tuned.predict(x_test)
logreg_sm_rb_tuned

LogisticRegression(C=62.0001, class_weight='weight')

In [93]:
logreg_acc_tuned_train_sm_rb = accuracy_score(y_train_sm, pred_train_sm_rb)
logreg_acc_tuned_test_sm_rb = accuracy_score(y_test, pred_test_sm_rb)
logreg_recall_tuned_train_sm_rb = recall_score(y_train_sm, pred_train_sm_rb)
logreg_recall_tuned_test_sm_rb = recall_score(y_test, pred_test_sm_rb)
logreg_prec_tuned_train_sm_rb = precision_score(y_train_sm, pred_train_sm_rb)
logreg_prec_tuned_test_sm_rb = precision_score(y_test, pred_test_sm_rb)
logreg_f1_tuned_train_sm_rb = f1_score(y_train_sm, pred_train_sm_rb)
logreg_f1_tuned_test_sm_rb = f1_score(y_test, pred_test_sm_rb)

In [94]:
cm_logreg_tuned_sm_rb = confusion_matrix(y_test, pred_test_sm_rb, labels=[1, 0])
cm_logreg_tuned_sm_rb = pd.DataFrame(data=cm_logreg_tuned_sm_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,47,213
Akt 0,28,1521


In [95]:
tp_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 1'][0]
tn_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 0'][1]
fp_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 1'][1]
fn_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 0'][0]

### Evaluation For Logistic Regression

In [96]:
distance_tuned = {
    "SVM_OS_Train": [svm_acc_tuned_train_os, svm_recall_tuned_train_os, svm_prec_tuned_train_os, svm_f1_tuned_train_os],
    "SVM_OS_Test" : [svm_acc_tuned_test_os, svm_recall_tuned_test_os, svm_prec_tuned_test_os, svm_f1_tuned_test_os],
    "SVM_SM_Train": [svm_acc_tuned_train_sm, svm_recall_tuned_train_sm, svm_prec_tuned_train_sm, svm_f1_tuned_train_sm],
    "SVM_SM_Test" : [svm_acc_tuned_test_sm, svm_recall_tuned_test_sm, svm_prec_tuned_test_sm, svm_f1_tuned_test_sm],
    "Logreg_OS_Train": [logreg_acc_tuned_train_os, logreg_recall_tuned_train_os, logreg_prec_tuned_train_os, logreg_f1_tuned_train_os],
    "Logreg_OS_Test" : [logreg_acc_tuned_test_os, logreg_recall_tuned_test_os, logreg_prec_tuned_test_os, logreg_f1_tuned_test_os],
    "Logreg_SM_Train": [logreg_acc_tuned_train_sm, logreg_recall_tuned_train_sm, logreg_prec_tuned_train_sm, logreg_f1_tuned_train_sm],
    "Logreg_SM_Test" : [logreg_acc_tuned_test_sm, logreg_recall_tuned_test_sm, logreg_prec_tuned_test_sm, logreg_f1_tuned_test_sm]
    }
tuned_matrix = pd.DataFrame(data = distance_tuned, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

dictance_tuned_os = {
    "SVM Standard_OS_Train": [svm_acc_tuned_train_os_std, svm_recall_tuned_train_os_std, svm_prec_tuned_train_os_std, svm_f1_tuned_train_os_std],
    "SVM Standard_OS_Test" : [svm_acc_tuned_test_os_std, svm_recall_tuned_test_os_std, svm_prec_tuned_test_os_std, svm_f1_tuned_test_os_std],
    "SVM MinMax_OS_Train": [svm_acc_tuned_train_os_mm, svm_recall_tuned_train_os_mm, svm_prec_tuned_train_os_mm, svm_f1_tuned_train_os_mm],
    "SVM MinMax_OS_Test" : [svm_acc_tuned_test_os_mm, svm_recall_tuned_test_os_mm, svm_prec_tuned_test_os_mm, svm_f1_tuned_test_os_mm],
    "SVM Robust_OS_Train": [svm_acc_tuned_train_os_rb, svm_recall_tuned_train_os_rb, svm_prec_tuned_train_os_rb, svm_f1_tuned_train_os_rb],
    "SVM Robust_OS_Test" : [svm_acc_tuned_test_os_rb, svm_recall_tuned_test_os_rb, svm_prec_tuned_test_os_rb, svm_f1_tuned_test_os_rb],
    "Logreg Standard_OS_Train": [logreg_acc_tuned_train_os_std, logreg_recall_tuned_train_os_std, logreg_prec_tuned_train_os_std, logreg_f1_tuned_train_os_std],
    "Logreg Standard_OS_Test" : [logreg_acc_tuned_test_os_std, logreg_recall_tuned_test_os_std, logreg_prec_tuned_test_os_std, logreg_f1_tuned_test_os_std],
    "Logreg MinMax_OS_Train": [logreg_acc_tuned_train_os_mm, logreg_recall_tuned_train_os_mm, logreg_prec_tuned_train_os_mm, logreg_f1_tuned_train_os_mm],
    "Logreg MinMax_OS_Test" : [logreg_acc_tuned_test_os_mm, logreg_recall_tuned_test_os_mm, logreg_prec_tuned_test_os_mm, logreg_f1_tuned_test_os_mm],
    "Logreg Robust_OS_Train": [logreg_acc_tuned_train_os_rb, logreg_recall_tuned_train_os_rb, logreg_prec_tuned_train_os_rb, logreg_f1_tuned_train_os_rb],
    "Logreg Robust_OS_Test" : [logreg_acc_tuned_test_os_rb, logreg_recall_tuned_test_os_rb, logreg_prec_tuned_test_os_rb, logreg_f1_tuned_test_os_rb]
    }
distance_tuned_os_matrix = pd.DataFrame(data = dictance_tuned_os, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

distance_tuned_sm = {
    "SVM Standard_SM_Train": [svm_acc_tuned_train_sm_std, svm_recall_tuned_train_sm_std, svm_prec_tuned_train_sm_std, svm_f1_tuned_train_sm_std],
    "SVM Standard_SM_Test" : [svm_acc_tuned_test_sm_std, svm_recall_tuned_test_sm_std, svm_prec_tuned_test_sm_std, svm_f1_tuned_test_sm_std],
    "SVM MinMax_SM_Train": [svm_acc_tuned_train_sm_mm, svm_recall_tuned_train_sm_mm, svm_prec_tuned_train_sm_mm, svm_f1_tuned_train_sm_mm],
    "SVM MinMax_SM_Test" : [svm_acc_tuned_test_sm_mm, svm_recall_tuned_test_sm_mm, svm_prec_tuned_test_sm_mm, svm_f1_tuned_test_sm_mm],
    "SVM Robust_SM_Train": [svm_acc_tuned_train_sm_rb, svm_recall_tuned_train_sm_rb, svm_prec_tuned_train_sm_rb, svm_f1_tuned_train_sm_rb],
    "SVM Robust_SM_Test" : [svm_acc_tuned_test_sm_rb, svm_recall_tuned_test_sm_rb, svm_prec_tuned_test_sm_rb, svm_f1_tuned_test_sm_rb],
    "Logreg Standard_SM_Train": [logreg_acc_tuned_train_sm_std, logreg_recall_tuned_train_sm_std, logreg_prec_tuned_train_sm_std, logreg_f1_tuned_train_sm_std],
    "Logreg Standard_SM_Test" : [logreg_acc_tuned_test_sm_std, logreg_recall_tuned_test_sm_std, logreg_prec_tuned_test_sm_std, logreg_f1_tuned_test_sm_std],
    "Logreg MinMax_SM_Train": [logreg_acc_tuned_train_sm_mm, logreg_recall_tuned_train_sm_mm, logreg_prec_tuned_train_sm_mm, logreg_f1_tuned_train_sm_mm],
    "Logreg MinMax_SM_Test" : [logreg_acc_tuned_test_sm_mm, logreg_recall_tuned_test_sm_mm, logreg_prec_tuned_test_sm_mm, logreg_f1_tuned_test_sm_mm],
    "Logreg Robust_SM_Train": [logreg_acc_tuned_train_sm_rb, logreg_recall_tuned_train_sm_rb, logreg_prec_tuned_train_sm_rb, logreg_f1_tuned_train_sm_rb],
    "Logreg Robust_SM_Test" : [logreg_acc_tuned_test_sm_rb, logreg_recall_tuned_test_sm_rb, logreg_prec_tuned_test_sm_rb, logreg_f1_tuned_test_sm_rb]
    }
distance_tuned_sm_matrix = pd.DataFrame(data = distance_tuned_sm, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

In [97]:
tuned_matrix

Unnamed: 0,SVM_OS_Train,SVM_OS_Test,SVM_SM_Train,SVM_SM_Test,Logreg_OS_Train,Logreg_OS_Test,Logreg_SM_Train,Logreg_SM_Test
Accuracy,0.999839,0.999447,0.531078,0.15644,0.74217,0.711443,0.909832,0.866777
Recall,1.0,1.0,1.0,1.0,0.783662,0.730769,0.836939,0.184615
Precision,0.999677,0.996169,0.516038,0.145577,0.723614,0.29595,0.979777,0.623377
F1 Score,0.999839,0.998081,0.680772,0.254154,0.752441,0.421286,0.902743,0.284866


In [98]:
distance_tuned_os_matrix

Unnamed: 0,SVM Standard_OS_Train,SVM Standard_OS_Test,SVM MinMax_OS_Train,SVM MinMax_OS_Test,SVM Robust_OS_Train,SVM Robust_OS_Test,Logreg Standard_OS_Train,Logreg Standard_OS_Test,Logreg MinMax_OS_Train,Logreg MinMax_OS_Test,Logreg Robust_OS_Train,Logreg Robust_OS_Test
Accuracy,1.0,0.964069,0.525509,0.144279,1.0,0.955777,0.742573,0.71089,0.742735,0.724157,0.694866,0.564953
Recall,1.0,0.934615,1.0,0.984615,1.0,0.934615,0.783662,0.746154,0.785115,0.703846,0.828705,0.819231
Precision,1.0,0.835052,0.513088,0.142222,1.0,0.794118,0.724153,0.298003,0.723768,0.302479,0.653719,0.223505
F1 Score,1.0,0.882033,0.6782,0.248544,1.0,0.858657,0.752733,0.425906,0.753194,0.423121,0.730884,0.351195


In [99]:
distance_tuned_sm_matrix

Unnamed: 0,SVM Standard_SM_Train,SVM Standard_SM_Test,SVM MinMax_SM_Train,SVM MinMax_SM_Test,SVM Robust_SM_Train,SVM Robust_SM_Test,Logreg Standard_SM_Train,Logreg Standard_SM_Test,Logreg MinMax_SM_Train,Logreg MinMax_SM_Test,Logreg Robust_SM_Train,Logreg Robust_SM_Test
Accuracy,0.796416,0.626313,0.525912,0.145384,0.712625,0.463792,0.908783,0.866777,0.909832,0.868436,0.909106,0.866777
Recall,0.992251,0.930769,1.0,0.992308,1.0,0.957692,0.835325,0.2,0.8371,0.176923,0.836293,0.180769
Precision,0.712993,0.268889,0.513301,0.143174,0.635022,0.206126,0.979182,0.611765,0.979596,0.657143,0.978836,0.626667
F1 Score,0.829756,0.417241,0.678386,0.250242,0.776775,0.339237,0.901551,0.301449,0.90276,0.278788,0.901968,0.280597


## Decision Tree

In [100]:
decision_tree = DecisionTreeClassifier()

In [101]:
param_dt =  {"criterion" : ['gini', 'entropy'],
             "max_depth": [None, np.arange(2,50)], 
             "min_samples_split": [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
             "min_samples_leaf": [1,2, 3, 4, 5, 6, 7, 8, 9, 10], 
             "max_features" : ['auto', 'sqrt', 'log2'],
             "min_impurity_decrease" : [0, 1, 2, 3, 4, 5]}

### Random Over Sampling

In [102]:
dt_os = RandomizedSearchCV(estimator = decision_tree, param_distributions = param_dt, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
dt_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 680 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   14.7s finished


RandomizedSearchCV(cv=3, estimator=DecisionTreeClassifier(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None,
                                                      array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_impurity_decrease': [0, 1, 2, 3, 4,
                                                                  5],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10],
                                        'min_samp

In [103]:
dt_tuned_os = dt_os.best_estimator_
pred_train_os = dt_tuned_os.predict(x_train_os)
pred_test_os = dt_tuned_os.predict(x_test)
dt_tuned_os

DecisionTreeClassifier(criterion='entropy', max_features='log2',
                       min_impurity_decrease=0)

In [104]:
dt_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
dt_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)
dt_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
dt_recall_tuned_test_os = recall_score(y_test, pred_test_os)
dt_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
dt_prec_tuned_test_os = precision_score(y_test, pred_test_os)
dt_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
dt_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [105]:
cm_dt_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_dt_tuned_os = pd.DataFrame(data=cm_dt_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_dt_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,158,102
Akt 0,83,1466


In [106]:
tp_dt_os = cm_dt_tuned_os['Pred 1'][0]
tn_dt_os = cm_dt_tuned_os['Pred 0'][1]
fp_dt_os = cm_dt_tuned_os['Pred 1'][1]
fn_dt_os = cm_dt_tuned_os['Pred 0'][0]

### Smote

In [107]:
dt_sm = RandomizedSearchCV(estimator = decision_tree, param_distributions = param_dt, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
dt_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 728 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   13.3s finished


RandomizedSearchCV(cv=3, estimator=DecisionTreeClassifier(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None,
                                                      array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_impurity_decrease': [0, 1, 2, 3, 4,
                                                                  5],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10],
                                        'min_samp

In [108]:
dt_tuned_sm = dt_sm.best_estimator_
pred_train_os = dt_tuned_sm.predict(x_train_os)
pred_test_os = dt_tuned_sm.predict(x_test)
dt_tuned_sm

DecisionTreeClassifier(max_features='auto', min_impurity_decrease=0,
                       min_samples_leaf=3)

In [109]:
dt_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
dt_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)
dt_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
dt_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)
dt_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
dt_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)
dt_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
dt_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [110]:
cm_dt_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_dt_tuned_sm = pd.DataFrame(data=cm_dt_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_dt_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,48,212
Akt 0,29,1520


In [111]:
tp_dt_sm = cm_dt_tuned_sm['Pred 1'][0]
tn_dt_sm = cm_dt_tuned_sm['Pred 0'][1]
fp_dt_sm = cm_dt_tuned_sm['Pred 1'][1]
fn_dt_sm = cm_dt_tuned_sm['Pred 0'][0]

## Random Forest

In [112]:
random_forest = RandomForestClassifier()

In [113]:
param_rf =  {"n_estimators":np.arange(100, 1000),
             "criterion" : ['gini', 'entropy'],
             "max_depth": [None, np.arange(2, 20)], 
             "min_samples_split": np.arange(2, 19),
             "min_samples_leaf": [1,2, 3, 4, 5], 
             "max_features" : ['auto', 'sqrt', 'log2'],
             "min_impurity_decrease" : [0, 1, 2, 3, 4, 5]}

### Random Over Sampling

In [114]:
rf_os = RandomizedSearchCV(estimator = random_forest, param_distributions = param_rf, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
rf_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 10.5min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None,
                                                      array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19])],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_impurity_decrease': [0, 1, 2, 3, 4,
                                                                  5],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'min_samples_split': array([ 2,  3,  4,  5,  6,...
       919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931,
       932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944,
       945, 946, 9

In [115]:
rf_tuned_os = rf_os.best_estimator_
pred_train_os = rf_tuned_os.predict(x_train_os)
pred_test_os = rf_tuned_os.predict(x_test)
rf_tuned_os

RandomForestClassifier(criterion='entropy', max_features='sqrt',
                       min_impurity_decrease=3, min_samples_leaf=4,
                       min_samples_split=6, n_estimators=449)

In [116]:
rf_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
rf_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)
rf_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
rf_recall_tuned_test_os = recall_score(y_test, pred_test_os)
rf_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
rf_prec_tuned_test_os = precision_score(y_test, pred_test_os)
rf_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
rf_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [117]:
cm_rf_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_rf_tuned_os = pd.DataFrame(data=cm_rf_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_rf_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,0,260
Akt 0,0,1549


In [118]:
tp_rf_os = cm_rf_tuned_os['Pred 1'][0]
tn_rf_os = cm_rf_tuned_os['Pred 0'][1]
fp_rf_os = cm_rf_tuned_os['Pred 1'][1]
fn_rf_os = cm_rf_tuned_os['Pred 0'][0]

### Smote

In [119]:
rf_sm = RandomizedSearchCV(estimator = random_forest, param_distributions = param_rf, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
rf_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 12.2min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None,
                                                      array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19])],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_impurity_decrease': [0, 1, 2, 3, 4,
                                                                  5],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'min_samples_split': array([ 2,  3,  4,  5,  6,...
       919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931,
       932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944,
       945, 946, 9

In [122]:
rf_tuned_sm = rf_sm.best_estimator_
pred_train_sm = rf_tuned_sm.predict(x_train_sm)
pred_test_sm = rf_tuned_sm.predict(x_test)
rf_tuned_sm

RandomForestClassifier(criterion='entropy', max_features='log2',
                       min_impurity_decrease=1, min_samples_split=15,
                       n_estimators=113)

In [123]:
rf_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
rf_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)
rf_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
rf_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)
rf_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
rf_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)
rf_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
rf_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [124]:
cm_rf_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_rf_tuned_sm = pd.DataFrame(data=cm_rf_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_rf_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,260,0
Akt 0,1549,0


In [125]:
tp_rf_sm = cm_rf_tuned_sm['Pred 1'][0]
tn_rf_sm = cm_rf_tuned_sm['Pred 0'][1]
fp_rf_sm = cm_rf_tuned_sm['Pred 1'][1]
fn_rf_sm = cm_rf_tuned_sm['Pred 0'][0]

## XGBoost

In [127]:
xgb = XGBClassifier()

In [128]:
param_xgb =  {"n_estimators" : np.arange(100, 200),
              "learning_rate" : np.arange(0.0001, 10),
              "max_depth": [None, np.arange(2,50)],
              "colsample_bytree" : [None, .1, .2, .3, .4, .5, .6, .7, .8],
              "subsample": [None, .1, .2, .3, .4, .5, .6, .7, .8],
              "min_samples_split": [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
              "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
              "min_impurity_decrease" : [0, 1, 2, 3, 4, 5]}

### Random Over Sampling

In [129]:
xgb_os = RandomizedSearchCV(estimator = xgb, param_distributions = param_xgb, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
xgb_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  4.9min finished


Parameters: { min_impurity_decrease, min_samples_leaf, min_samples_split } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
       139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151,
       152, 153, 154, 155, 156, 157, 

In [130]:
xgb_tuned_os = xgb_os.best_estimator_
pred_train_os = xgb_tuned_os.predict(x_train_os)
pred_test_os = xgb_tuned_os.predict(x_test)
xgb_tuned_os

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=1.0001, max_delta_step=0, max_depth=6,
              min_child_weight=1, min_impurity_decrease=3, min_samples_leaf=2,
              min_samples_split=50, missing=nan, monotone_constraints='()',
              n_estimators=183, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [131]:
xgb_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
xgb_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)
xgb_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
xgb_recall_tuned_test_os = recall_score(y_test, pred_test_os)
xgb_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
xgb_prec_tuned_test_os = precision_score(y_test, pred_test_os)
xgb_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
xgb_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [132]:
cm_xgb_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_xgb_tuned_os = pd.DataFrame(data=cm_xgb_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_xgb_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,140,120
Akt 0,27,1522


In [133]:
tp_xgb_os = cm_xgb_tuned_os['Pred 1'][0]
tn_xgb_os = cm_xgb_tuned_os['Pred 0'][1]
fp_xgb_os = cm_xgb_tuned_os['Pred 1'][1]
fn_xgb_os = cm_xgb_tuned_os['Pred 0'][0]

### Smote

In [134]:
xgb_sm = RandomizedSearchCV(estimator = xgb, param_distributions = param_xgb, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
xgb_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   48.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  6.0min finished


Parameters: { min_impurity_decrease, min_samples_leaf, min_samples_split } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
       139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151,
       152, 153, 154, 155, 156, 157, 

In [135]:
xgb_tuned_sm = xgb_sm.best_estimator_
pred_train_sm = xgb_tuned_sm.predict(x_train_sm)
pred_test_sm = xgb_tuned_sm.predict(x_test)
xgb_tuned_sm

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=5.0001, max_delta_step=0, max_depth=6,
              min_child_weight=1, min_impurity_decrease=2, min_samples_leaf=7,
              min_samples_split=30, missing=nan, monotone_constraints='()',
              n_estimators=179, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [136]:
xgb_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
xgb_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)
xgb_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
xgb_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)
xgb_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
xgb_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)
xgb_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
xgb_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [137]:
cm_xgb_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_xgb_tuned_sm = pd.DataFrame(data=cm_xgb_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_xgb_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,260,0
Akt 0,1549,0


In [138]:
tp_xgb_sm = cm_xgb_tuned_sm['Pred 1'][0]
tn_xgb_sm = cm_xgb_tuned_sm['Pred 0'][1]
fp_xgb_sm = cm_xgb_tuned_sm['Pred 1'][1]
fn_xgb_sm = cm_xgb_tuned_sm['Pred 0'][0]

## Evaluation Matrix For DT, RF, XGB

In [139]:
tuned_os = {
    "DT OS Train": [dt_acc_tuned_train_os, dt_recall_tuned_train_os, dt_prec_tuned_train_os, dt_f1_tuned_train_os],
    "DT OS Test" : [dt_acc_tuned_test_os, dt_recall_tuned_test_os, dt_prec_tuned_test_os, dt_f1_tuned_test_os],
    "RF OS Train": [rf_acc_tuned_train_os, rf_recall_tuned_train_os, rf_prec_tuned_train_os, rf_f1_tuned_train_os],
    "RF OS Test" : [rf_acc_tuned_test_os, rf_recall_tuned_test_os, rf_prec_tuned_test_os, rf_f1_tuned_test_os],
    "XGB OS Train": [xgb_acc_tuned_train_os, xgb_recall_tuned_train_os, xgb_prec_tuned_train_os, xgb_f1_tuned_train_os],
    "XGB OS Test" : [xgb_acc_tuned_test_os, xgb_recall_tuned_test_os, xgb_prec_tuned_test_os, xgb_f1_tuned_test_os]}

tuned_os_matrix = pd.DataFrame(data = tuned_os, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

tuned_sm = {
    "DT SM Train": [dt_acc_tuned_train_sm, dt_recall_tuned_train_sm, dt_prec_tuned_train_sm, dt_f1_tuned_train_sm],
    "DT SM Test" : [dt_acc_tuned_test_sm, dt_recall_tuned_test_sm, dt_prec_tuned_test_sm, dt_f1_tuned_test_sm],
    "RF SM Train": [rf_acc_tuned_train_sm, rf_recall_tuned_train_sm, rf_prec_tuned_train_sm, rf_f1_tuned_train_sm],
    "RF SM Test" : [rf_acc_tuned_test_sm, rf_recall_tuned_test_sm, rf_prec_tuned_test_sm, rf_f1_tuned_test_sm],
    "XGB SM Train": [xgb_acc_tuned_train_sm, xgb_recall_tuned_train_sm, xgb_prec_tuned_train_sm, xgb_f1_tuned_train_sm],
    "XGB SM Test" : [xgb_acc_tuned_test_sm, xgb_recall_tuned_test_sm, xgb_prec_tuned_test_sm, xgb_f1_tuned_test_sm]}

tuned_sm_matrix = pd.DataFrame(data = tuned_sm, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])


In [140]:
tuned_os_matrix

Unnamed: 0,DT OS Train,DT OS Test,RF OS Train,RF OS Test,XGB OS Train,XGB OS Test
Accuracy,1.0,0.897734,0.5,0.856274,1.0,0.91874
Recall,1.0,0.607692,0.0,0.0,1.0,0.538462
Precision,1.0,0.655602,0.0,0.0,1.0,0.838323
F1 Score,1.0,0.630739,0.0,0.0,1.0,0.655738


In [141]:
tuned_sm_matrix

Unnamed: 0,DT SM Train,DT SM Test,RF SM Train,RF SM Test,XGB SM Train,XGB SM Test
Accuracy,0.909832,0.866777,0.5,0.143726,0.5,0.143726
Recall,0.836939,0.184615,1.0,1.0,1.0,1.0
Precision,0.979777,0.623377,0.5,0.143726,0.5,0.143726
F1 Score,0.902743,0.284866,0.666667,0.251329,0.666667,0.251329


In [143]:
cm = {
    "True Positive" : [tp_svm_os, tp_svm_os_std, tp_svm_os_mm, tp_svm_os_rb, 
                       tp_svm_sm, tp_svm_sm_std, tp_svm_sm_mm, tp_svm_sm_rb,
                       tp_logreg_os, tp_logreg_os_std, tp_logreg_os_mm, tp_logreg_os_rb, 
                       tp_logreg_sm, tp_logreg_sm_std, tp_logreg_sm_mm, tp_logreg_sm_rb,
                       tp_dt_os, tp_dt_sm, tp_rf_os, tp_rf_sm, tp_xgb_os, tp_xgb_sm],
    
    "True Negative" : [tn_svm_os, tn_svm_os_std, tn_svm_os_mm, tn_svm_os_rb, 
                       tn_svm_sm, tn_svm_sm_std, tn_svm_sm_mm, tn_svm_sm_rb,
                       tn_logreg_os, tn_logreg_os_std, tn_logreg_os_mm, tn_logreg_os_rb, 
                       tn_logreg_sm, tn_logreg_sm_std, tn_logreg_sm_mm, tn_logreg_sm_rb,
                       tn_dt_os, tn_dt_sm, tn_rf_os, tn_rf_sm, tn_xgb_os, tn_xgb_sm],
    
    "False Positive": [fp_svm_os, fp_svm_os_std, fp_svm_os_mm, fp_svm_os_rb, 
                       fp_svm_sm, fp_svm_sm_std, fp_svm_sm_mm, fp_svm_sm_rb,
                       fp_logreg_os, fp_logreg_os_std, fp_logreg_os_mm, fp_logreg_os_rb, 
                       fp_logreg_sm, fp_logreg_sm_std, fp_logreg_sm_mm, fp_logreg_sm_rb,
                       fp_dt_os, fp_dt_sm, fp_rf_os, fp_rf_sm, fp_xgb_os, fp_xgb_sm],
    
    "False Negative": [fn_svm_os, fn_svm_os_std, fn_svm_os_mm, fn_svm_os_rb, 
                       fn_svm_sm, fn_svm_sm_std, fn_svm_sm_mm, fn_svm_sm_rb,
                       fn_logreg_os, fn_logreg_os_std, fn_logreg_os_mm, fn_logreg_os_rb, 
                       fn_logreg_sm, fn_logreg_sm_std, fn_logreg_sm_mm, fn_logreg_sm_rb,
                       fn_dt_os, fn_dt_sm, fn_rf_os, fn_rf_sm, fn_xgb_os, fn_xgb_sm]
}
    
cm_matrix = pd.DataFrame(data = cm, index = ['SVM OS', 'Logreg OS Standard', 'Logreg OS MinMax', 'Logreg OS Robust',
                                             'SVM SM', 'Logreg SM Standard', 'Logreg SM MinMax', 'Logreg SM Robust',
                                             'LogReg OS', 'Logreg OS Standard', 'Logreg OS MinMax', 'Logreg OS Robust',
                                             'LogReg SM', 'Logreg SM Standard', 'Logreg SM MinMax', 'Logreg SM Robust',
                                             'Decision Tree OS', 'Decision Tree SM', 
                                             'Random Forest OS', 'Random Forest SM',
                                             'XGBoost OS', 'XGBoost SM'])
cm_matrix.sort_values('False Negative')
#Random Forest With Random Over Sampling and No Scaling

Unnamed: 0,True Positive,True Negative,False Positive,False Negative
SVM OS,260,1548,1,0
Random Forest SM,260,0,1549,0
SVM SM,260,23,1526,0
XGBoost SM,260,0,1549,0
Logreg SM MinMax,258,5,1544,2
Logreg OS MinMax,256,5,1544,4
Logreg SM Robust,249,590,959,11
Logreg OS Standard,243,1501,48,17
Logreg OS Robust,243,1486,63,17
Logreg SM Standard,242,891,658,18
