In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.utils import resample
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

## Dataset

In [29]:
df_processed = pd.read_csv("data_ml.csv", index_col = 0)
df_processed.head()

Unnamed: 0_level_0,Customer Lifetime Value,Response,Coverage,Education,Gender,Income,Monthly Premium Auto,Number of Policies,Total Claim Amount,Vehicle Size,...,Sales Channel_Agent,Sales Channel_Branch,Sales Channel_Call Center,Sales Channel_Web,Vehicle Class_Four-Door Car,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BU79786,2763.519279,0,0,2,1,56274,69,1,384.811147,1,...,1,0,0,0,0,0,0,0,0,1
QZ44356,6979.535903,0,1,2,1,0,94,8,1131.464935,1,...,1,0,0,0,1,0,0,0,0,0
AI49188,12887.43165,0,2,2,1,48767,108,2,566.472247,1,...,1,0,0,0,0,0,0,0,0,1
WW63253,7645.861827,0,0,2,0,0,106,7,529.881344,1,...,0,0,1,0,0,0,0,1,0,0
HB64268,2813.692575,0,0,2,0,43836,73,1,138.130879,1,...,1,0,0,0,1,0,0,0,0,0


# Splitting Data

In [30]:
x = df_processed.drop(columns = ['Response'] , axis = 1)
y = df_processed['Response']

In [31]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, stratify= y, random_state = 42)

# Handling Imbalance Data

In [32]:
y_train.value_counts()

0    6194
1    1042
Name: Response, dtype: int64

#### Random Over Sampling

In [33]:
df_train = pd.concat([x_train, y_train], axis=1)
not_renewal = df_train[df_train['Response'] == 0]
renewal = df_train[df_train['Response'] == 1]

renewal_oversample = resample(renewal, replace=True, n_samples = len(not_renewal), random_state = 42)
df_OverSampled = pd.concat([not_renewal, renewal_oversample])
df_OverSampled['Response'].value_counts()

1    6194
0    6194
Name: Response, dtype: int64

In [34]:
x_train_os = df_OverSampled.drop(columns = ['Response'])
y_train_os = df_OverSampled['Response']

#### Smote

In [35]:
sm = SMOTE(random_state=42)
x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)

x = pd.DataFrame(data = x_train_sm, columns = x_train.columns)
y = pd.DataFrame(data = y_train_sm, columns = ['Response'])
df_smote = x.join(y)
# df_smote = pd.concat([x_train_sm, y_train_sm], axis = 1)
df_smote['Response'].value_counts()

1    6194
0    6194
Name: Response, dtype: int64

In [36]:
columns_continuous = ['Customer Lifetime Value', 'Income', 'Monthly Premium Auto', 'Number of Policies', 'Total Claim Amount']

In [37]:
std_scale = StandardScaler()
mm_scale = MinMaxScaler()
rb_scale = RobustScaler()

# Tuning Model

## SVM

In [38]:
svm = SVC(max_iter = 1000)

In [39]:
param_svm = {'C' : np.linspace(0.00001, 10, 1000),
             'kernel' : ['linear', 'rbf'],
             "gamma" : np.arange(0.0001, 100)}

#### Random Over Sampling without Scaling

In [40]:
svm_os = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
svm_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 19.0min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='f1', verbose=1)

In [41]:
svm_os_tuned = svm_os.best_estimator_
pred_train_os = svm_os_tuned.predict(x_train_os)
pred_test_os = svm_os_tuned.predict(x_test)
svm_os_tuned

SVC(C=9.659659999999999, gamma=1.0001, max_iter=1000)

In [42]:
svm_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
svm_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)
svm_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
svm_recall_tuned_test_os = recall_score(y_test, pred_test_os)
svm_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
svm_prec_tuned_test_os = precision_score(y_test, pred_test_os)
svm_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
svm_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [43]:
cm_svm_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_svm_tuned_os = pd.DataFrame(data=cm_svm_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,221,39
Akt 0,0,1549


In [44]:
print(classification_report(y_test, pred_test_os))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1549
           1       1.00      0.85      0.92       260

    accuracy                           0.98      1809
   macro avg       0.99      0.93      0.95      1809
weighted avg       0.98      0.98      0.98      1809



In [45]:
tp_svm_os = cm_svm_tuned_os['Pred 1'][0]
tn_svm_os = cm_svm_tuned_os['Pred 0'][1]
fp_svm_os = cm_svm_tuned_os['Pred 1'][1]
fn_svm_os = cm_svm_tuned_os['Pred 0'][0]

#### Smote without Scaling

In [46]:
svm_sm = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
svm_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 19.0min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='f1', verbose=1)

In [47]:
svm_sm_tuned = svm_sm.best_estimator_
pred_train_sm = svm_sm_tuned.predict(x_train_sm)
pred_test_sm = svm_sm_tuned.predict(x_test)
svm_sm_tuned

SVC(C=1.61162, gamma=0.0001, max_iter=1000)

In [48]:
svm_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
svm_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)
svm_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
svm_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)
svm_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
svm_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)
svm_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
svm_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [49]:
cm_svm_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_svm_tuned_sm = pd.DataFrame(data=cm_svm_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,260,0
Akt 0,29,1520


In [50]:
print(classification_report(y_test, pred_test_sm))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1549
           1       0.90      1.00      0.95       260

    accuracy                           0.98      1809
   macro avg       0.95      0.99      0.97      1809
weighted avg       0.99      0.98      0.98      1809



In [51]:
tp_svm_sm = cm_svm_tuned_sm['Pred 1'][0]
tn_svm_sm = cm_svm_tuned_sm['Pred 0'][1]
fp_svm_sm = cm_svm_tuned_sm['Pred 1'][1]
fn_svm_sm = cm_svm_tuned_sm['Pred 0'][0]

#### Random Over Sampling with Standard Scaler

In [52]:
svm_os_std = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
x_train_os[columns_continuous] = std_scale.fit_transform(x_train_os[columns_continuous])
svm_os_std.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   41.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 16.5min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='f1', verbose=1)

In [53]:
svm_os_std_tuned = svm_os_std.best_estimator_
pred_train_os_std = svm_os_std_tuned.predict(x_train_os)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_os_std = svm_os_std_tuned.predict(x_test)
svm_os_std_tuned

SVC(C=4.904909999999999, gamma=1.0001, max_iter=1000)

In [54]:
svm_acc_tuned_train_os_std = accuracy_score(y_train_os, pred_train_os_std)
svm_acc_tuned_test_os_std = accuracy_score(y_test, pred_test_os_std)
svm_recall_tuned_train_os_std = recall_score(y_train_os, pred_train_os_std)
svm_recall_tuned_test_os_std = recall_score(y_test, pred_test_os_std)
svm_prec_tuned_train_os_std = precision_score(y_train_os, pred_train_os_std)
svm_prec_tuned_test_os_std = precision_score(y_test, pred_test_os_std)
svm_f1_tuned_train_os_std = f1_score(y_train_os, pred_train_os_std)
svm_f1_tuned_test_os_std = f1_score(y_test, pred_test_os_std)

In [55]:
cm_svm_tuned_os_std = confusion_matrix(y_test, pred_test_os_std, labels=[1, 0])
cm_svm_tuned_os_std = pd.DataFrame(data=cm_svm_tuned_os_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,213,47
Akt 0,7,1542


In [56]:
print(classification_report(y_test, pred_test_os_std))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1549
           1       0.97      0.82      0.89       260

    accuracy                           0.97      1809
   macro avg       0.97      0.91      0.94      1809
weighted avg       0.97      0.97      0.97      1809



In [57]:
tp_svm_os_std = cm_svm_tuned_os_std['Pred 1'][0]
tn_svm_os_std = cm_svm_tuned_os_std['Pred 0'][1]
fp_svm_os_std = cm_svm_tuned_os_std['Pred 1'][1]
fn_svm_os_std = cm_svm_tuned_os_std['Pred 0'][0]

#### Random Over Sampling with MinMax Scaler

In [58]:
svm_os_mm = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
x_train_os[columns_continuous] = mm_scale.fit_transform(x_train_os[columns_continuous])
svm_os_mm.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 15.6min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 17.4min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='f1', verbose=1)

In [59]:
svm_os_mm_tuned = svm_os_mm.best_estimator_
pred_train_os_mm = svm_os_mm_tuned.predict(x_train_os)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_os_mm = svm_os_mm_tuned.predict(x_test)
svm_os_mm_tuned

SVC(C=0.96097, gamma=68.0001, max_iter=1000)

In [60]:
svm_acc_tuned_train_os_mm = accuracy_score(y_train_os, pred_train_os_mm)
svm_acc_tuned_test_os_mm = accuracy_score(y_test, pred_test_os_mm)
svm_recall_tuned_train_os_mm = recall_score(y_train_os, pred_train_os_mm)
svm_recall_tuned_test_os_mm = recall_score(y_test, pred_test_os_mm)
svm_prec_tuned_train_os_mm = precision_score(y_train_os, pred_train_os_mm)
svm_prec_tuned_test_os_mm = precision_score(y_test, pred_test_os_mm)
svm_f1_tuned_train_os_mm = f1_score(y_train_os, pred_train_os_mm)
svm_f1_tuned_test_os_mm = f1_score(y_test, pred_test_os_mm)

In [61]:
cm_svm_tuned_os_mm = confusion_matrix(y_test, pred_test_os_mm, labels=[1, 0])
cm_svm_tuned_os_mm = pd.DataFrame(data=cm_svm_tuned_os_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,57,203
Akt 0,0,1549


In [62]:
print(classification_report(y_test, pred_test_os_mm))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94      1549
           1       1.00      0.22      0.36       260

    accuracy                           0.89      1809
   macro avg       0.94      0.61      0.65      1809
weighted avg       0.90      0.89      0.86      1809



In [63]:
tp_svm_os_mm = cm_svm_tuned_os_mm['Pred 1'][0]
tn_svm_os_mm = cm_svm_tuned_os_mm['Pred 0'][1]
fp_svm_os_mm = cm_svm_tuned_os_mm['Pred 1'][1]
fn_svm_os_mm = cm_svm_tuned_os_mm['Pred 0'][0]

#### Random Over Sampling with Robust Scaler

In [64]:
svm_os_rb = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
x_train_os[columns_continuous] = rb_scale.fit_transform(x_train_os[columns_continuous])
svm_os_rb.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   49.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 17.8min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='f1', verbose=1)

In [65]:
svm_os_rb_tuned = svm_os_rb.best_estimator_
pred_train_os_rb = svm_os_rb_tuned.predict(x_train_os)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_os_rb = svm_os_rb_tuned.predict(x_test)
svm_os_rb_tuned

SVC(C=9.749749999999999, gamma=1.0001, max_iter=1000)

In [66]:
svm_acc_tuned_train_os_rb = accuracy_score(y_train_os, pred_train_os_rb)
svm_acc_tuned_test_os_rb = accuracy_score(y_test, pred_test_os_rb)
svm_recall_tuned_train_os_rb = recall_score(y_train_os, pred_train_os_rb)
svm_recall_tuned_test_os_rb = recall_score(y_test, pred_test_os_rb)
svm_prec_tuned_train_os_rb = precision_score(y_train_os, pred_train_os_rb)
svm_prec_tuned_test_os_rb = precision_score(y_test, pred_test_os_rb)
svm_f1_tuned_train_os_rb = f1_score(y_train_os, pred_train_os_rb)
svm_f1_tuned_test_os_rb = f1_score(y_test, pred_test_os_rb)

In [67]:
cm_svm_tuned_os_rb = confusion_matrix(y_test, pred_test_os_rb, labels=[1, 0])
cm_svm_tuned_os_rb = pd.DataFrame(data=cm_svm_tuned_os_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,210,50
Akt 0,10,1539


In [68]:
print(classification_report(y_test, pred_test_os_rb))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1549
           1       0.95      0.81      0.88       260

    accuracy                           0.97      1809
   macro avg       0.96      0.90      0.93      1809
weighted avg       0.97      0.97      0.97      1809



In [69]:
tp_svm_os_rb = cm_svm_tuned_os_rb['Pred 1'][0]
tn_svm_os_rb = cm_svm_tuned_os_rb['Pred 0'][1]
fp_svm_os_rb = cm_svm_tuned_os_rb['Pred 1'][1]
fn_svm_os_rb = cm_svm_tuned_os_rb['Pred 0'][0]

#### Smote with Standard Scaler

In [70]:
svm_sm_std = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
x_train_sm[columns_continuous] = std_scale.fit_transform(x_train_sm[columns_continuous])
svm_sm_std.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 17.7min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='f1', verbose=1)

In [71]:
svm_sm_std_tuned = svm_sm_std.best_estimator_
pred_train_sm_std = svm_sm_std_tuned.predict(x_train_sm)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_std = svm_sm_std_tuned.predict(x_test)
svm_sm_std_tuned

SVC(C=2.98299, gamma=1.0001, max_iter=1000)

In [72]:
svm_acc_tuned_train_sm_std = accuracy_score(y_train_sm, pred_train_sm_std)
svm_acc_tuned_test_sm_std = accuracy_score(y_test, pred_test_sm_std)
svm_recall_tuned_train_sm_std = recall_score(y_train_sm, pred_train_sm_std)
svm_recall_tuned_test_sm_std = recall_score(y_test, pred_test_sm_std)
svm_prec_tuned_train_sm_std = precision_score(y_train_sm, pred_train_sm_std)
svm_prec_tuned_test_sm_std = precision_score(y_test, pred_test_sm_std)
svm_f1_tuned_train_sm_std = f1_score(y_train_sm, pred_train_sm_std)
svm_f1_tuned_test_sm_std = f1_score(y_test, pred_test_sm_std)

In [73]:
cm_svm_tuned_sm_std = confusion_matrix(y_test, pred_test_sm_std, labels=[1, 0])
cm_svm_tuned_sm_std = pd.DataFrame(data=cm_svm_tuned_sm_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,227,33
Akt 0,15,1534


In [74]:
print(classification_report(y_test, pred_test_sm_std))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1549
           1       0.94      0.87      0.90       260

    accuracy                           0.97      1809
   macro avg       0.96      0.93      0.94      1809
weighted avg       0.97      0.97      0.97      1809



In [75]:
tp_svm_sm_std = cm_svm_tuned_sm_std['Pred 1'][0]
tn_svm_sm_std = cm_svm_tuned_sm_std['Pred 0'][1]
fp_svm_sm_std = cm_svm_tuned_sm_std['Pred 1'][1]
fn_svm_sm_std = cm_svm_tuned_sm_std['Pred 0'][0]

#### Smote with MinMax Scaler

In [76]:
svm_sm_mm = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
x_train_sm[columns_continuous] = mm_scale.fit_transform(x_train_sm[columns_continuous])
svm_sm_mm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   50.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 14.9min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 16.9min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='f1', verbose=1)

In [77]:
svm_sm_mm_tuned = svm_sm_mm.best_estimator_
pred_train_sm_mm = svm_sm_mm_tuned.predict(x_train_sm)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_mm = svm_sm_mm_tuned.predict(x_test)
svm_sm_mm_tuned

SVC(C=6.00601, gamma=3.0001, max_iter=1000)

In [78]:
svm_acc_tuned_train_sm_mm = accuracy_score(y_train_sm, pred_train_sm_mm)
svm_acc_tuned_test_sm_mm = accuracy_score(y_test, pred_test_sm_mm)
svm_recall_tuned_train_sm_mm = recall_score(y_train_sm, pred_train_sm_mm)
svm_recall_tuned_test_sm_mm = recall_score(y_test, pred_test_sm_mm)
svm_prec_tuned_train_sm_mm = precision_score(y_train_sm, pred_train_sm_mm)
svm_prec_tuned_test_sm_mm = precision_score(y_test, pred_test_sm_mm)
svm_f1_tuned_train_sm_mm = f1_score(y_train_sm, pred_train_sm_mm)
svm_f1_tuned_test_sm_mm = f1_score(y_test, pred_test_sm_mm)

In [79]:
cm_svm_tuned_sm_mm = confusion_matrix(y_test, pred_test_sm_mm, labels=[1, 0])
cm_svm_tuned_sm_mm = pd.DataFrame(data=cm_svm_tuned_sm_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,129,131
Akt 0,16,1533


In [80]:
print(classification_report(y_test, pred_test_sm_mm))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95      1549
           1       0.89      0.50      0.64       260

    accuracy                           0.92      1809
   macro avg       0.91      0.74      0.80      1809
weighted avg       0.92      0.92      0.91      1809



In [81]:
tp_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 1'][0]
tn_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 0'][1]
fp_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 1'][1]
fn_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 0'][0]

#### Smote with Robust Scaler

In [82]:
svm_sm_rb = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
x_train_sm[columns_continuous] = rb_scale.fit_transform(x_train_sm[columns_continuous])
svm_sm_rb.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 16.2min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='f1', verbose=1)

In [83]:
svm_sm_rb_tuned = svm_sm_rb.best_estimator_
pred_train_sm_rb = svm_sm_rb_tuned.predict(x_train_sm)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_rb = svm_sm_rb_tuned.predict(x_test)
svm_sm_rb_tuned

SVC(C=7.68769, gamma=1.0001, max_iter=1000)

In [84]:
svm_acc_tuned_train_sm_rb = accuracy_score(y_train_sm, pred_train_sm_rb)
svm_acc_tuned_test_sm_rb = accuracy_score(y_test, pred_test_sm_rb)
svm_recall_tuned_train_sm_rb = recall_score(y_train_sm, pred_train_sm_rb)
svm_recall_tuned_test_sm_rb = recall_score(y_test, pred_test_sm_rb)
svm_prec_tuned_train_sm_rb = precision_score(y_train_sm, pred_train_sm_rb)
svm_prec_tuned_test_sm_rb = precision_score(y_test, pred_test_sm_rb)
svm_f1_tuned_train_sm_rb = f1_score(y_train_sm, pred_train_sm_rb)
svm_f1_tuned_test_sm_rb = f1_score(y_test, pred_test_sm_rb)

In [85]:
cm_svm_tuned_sm_rb = confusion_matrix(y_test, pred_test_sm_rb, labels=[1, 0])
cm_svm_tuned_sm_rb = pd.DataFrame(data=cm_svm_tuned_sm_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,225,35
Akt 0,30,1519


In [86]:
print(classification_report(y_test, pred_test_sm_rb))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1549
           1       0.88      0.87      0.87       260

    accuracy                           0.96      1809
   macro avg       0.93      0.92      0.93      1809
weighted avg       0.96      0.96      0.96      1809



In [87]:
tp_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 1'][0]
tn_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 0'][1]
fp_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 1'][1]
fn_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 0'][0]

## Logistic Regression

In [88]:
logreg = LogisticRegression()

In [89]:
param_logreg = {'C' : np.linspace(0.00001, 10, 1000),
                'penalty' : ['l1', 'l2', 'elasticnet', None],
                'class_weight': [None, 'weight'],
                'fit_intercept' : [True, False]}

#### Random Over Sampling without Scaling

In [90]:
logreg_os = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
logreg_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 236 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 756 tasks      | elapsed:   39.0s
[Parallel(n_jobs=-1)]: Done 893 out of 900 | elapsed:   44.9s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   45.6s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [91]:
logreg_os_tuned = logreg_os.best_estimator_
pred_train_os = logreg_os_tuned.predict(x_train_os)
pred_test_os = logreg_os_tuned.predict(x_test)
logreg_os_tuned

LogisticRegression(C=0.3904, class_weight='weight', fit_intercept=False)

In [92]:
logreg_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
logreg_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)
logreg_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
logreg_recall_tuned_test_os = recall_score(y_test, pred_test_os)
logreg_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
logreg_prec_tuned_test_os = precision_score(y_test, pred_test_os)
logreg_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
logreg_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [93]:
cm_logreg_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_logreg_tuned_os = pd.DataFrame(data=cm_logreg_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,191,69
Akt 0,453,1096


In [94]:
print(classification_report(y_test, pred_test_os))

              precision    recall  f1-score   support

           0       0.94      0.71      0.81      1549
           1       0.30      0.73      0.42       260

    accuracy                           0.71      1809
   macro avg       0.62      0.72      0.62      1809
weighted avg       0.85      0.71      0.75      1809



In [95]:
tp_logreg_os = cm_logreg_tuned_os['Pred 1'][0]
tn_logreg_os = cm_logreg_tuned_os['Pred 0'][1]
fp_logreg_os = cm_logreg_tuned_os['Pred 1'][1]
fn_logreg_os = cm_logreg_tuned_os['Pred 0'][0]

#### Smote Sampling without Scaling

In [96]:
logreg_sm = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
logreg_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done 893 out of 900 | elapsed:   37.6s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   38.2s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [97]:
logreg_sm_tuned = logreg_sm.best_estimator_
pred_train_sm = logreg_sm_tuned.predict(x_train_sm)
pred_test_sm = logreg_sm_tuned.predict(x_test)
logreg_sm_tuned

LogisticRegression(C=7.38739)

In [98]:
logreg_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
logreg_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)
logreg_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
logreg_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)
logreg_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
logreg_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)
logreg_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
logreg_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [99]:
cm_logreg_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_logreg_tuned_sm = pd.DataFrame(data=cm_logreg_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,48,212
Akt 0,26,1523


In [100]:
print(classification_report(y_test, pred_test_sm))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1549
           1       0.65      0.18      0.29       260

    accuracy                           0.87      1809
   macro avg       0.76      0.58      0.61      1809
weighted avg       0.84      0.87      0.84      1809



In [101]:
tp_logreg_sm = cm_logreg_tuned_sm['Pred 1'][0]
tn_logreg_sm = cm_logreg_tuned_sm['Pred 0'][1]
fp_logreg_sm = cm_logreg_tuned_sm['Pred 1'][1]
fn_logreg_sm = cm_logreg_tuned_sm['Pred 0'][0]

#### Standard Scaling with Random Over Sampling

In [102]:
logreg_os_std = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
x_train_os[columns_continuous] = std_scale.fit_transform(x_train_os[columns_continuous])
logreg_os_std.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 360 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 860 tasks      | elapsed:   43.1s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   46.4s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [103]:
logreg_os_std_tuned = logreg_os_std.best_estimator_
pred_train_os_std = logreg_os_std_tuned.predict(x_train_os)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_os_std = logreg_os_std_tuned.predict(x_test)
logreg_os_std_tuned

LogisticRegression(C=0.25026, class_weight='weight', fit_intercept=False)

In [104]:
logreg_acc_tuned_train_os_std = accuracy_score(y_train_os, pred_train_os_std)
logreg_acc_tuned_test_os_std = accuracy_score(y_test, pred_test_os_std)
logreg_recall_tuned_train_os_std = recall_score(y_train_os, pred_train_os_std)
logreg_recall_tuned_test_os_std = recall_score(y_test, pred_test_os_std)
logreg_prec_tuned_train_os_std = precision_score(y_train_os, pred_train_os_std)
logreg_prec_tuned_test_os_std = precision_score(y_test, pred_test_os_std)
logreg_f1_tuned_train_os_std = f1_score(y_train_os, pred_train_os_std)
logreg_f1_tuned_test_os_std = f1_score(y_test, pred_test_os_std)

In [105]:
cm_logreg_tuned_os_std = confusion_matrix(y_test, pred_test_os_std, labels=[1, 0])
cm_logreg_tuned_os_std = pd.DataFrame(data=cm_logreg_tuned_os_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,194,66
Akt 0,459,1090


In [106]:
print(classification_report(y_test, pred_test_os_std))

              precision    recall  f1-score   support

           0       0.94      0.70      0.81      1549
           1       0.30      0.75      0.42       260

    accuracy                           0.71      1809
   macro avg       0.62      0.72      0.62      1809
weighted avg       0.85      0.71      0.75      1809



In [107]:
tp_logreg_os_std = cm_logreg_tuned_os_std['Pred 1'][0]
tn_logreg_os_std = cm_logreg_tuned_os_std['Pred 0'][1]
fp_logreg_os_std = cm_logreg_tuned_os_std['Pred 1'][1]
fn_logreg_os_std = cm_logreg_tuned_os_std['Pred 0'][0]

#### MinMax Scaling with Random Over Sampling

In [108]:
logreg_os_mm = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
x_train_os[columns_continuous] = mm_scale.fit_transform(x_train_os[columns_continuous])
logreg_os_mm.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:   41.9s
[Parallel(n_jobs=-1)]: Done 893 out of 900 | elapsed:   42.7s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   43.2s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [109]:
logreg_os_mm_tuned = logreg_os_mm.best_estimator_
pred_train_os_mm = logreg_os_mm_tuned.predict(x_train_os)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_os_mm = logreg_os_mm_tuned.predict(x_test)
logreg_os_mm_tuned

LogisticRegression(C=4.53454, class_weight='weight', fit_intercept=False)

In [110]:
logreg_acc_tuned_train_os_mm = accuracy_score(y_train_os, pred_train_os_mm)
logreg_acc_tuned_test_os_mm = accuracy_score(y_test, pred_test_os_mm)
logreg_recall_tuned_train_os_mm = recall_score(y_train_os, pred_train_os_mm)
logreg_recall_tuned_test_os_mm = recall_score(y_test, pred_test_os_mm)
logreg_prec_tuned_train_os_mm = precision_score(y_train_os, pred_train_os_mm)
logreg_prec_tuned_test_os_mm = precision_score(y_test, pred_test_os_mm)
logreg_f1_tuned_train_os_mm = f1_score(y_train_os, pred_train_os_mm)
logreg_f1_tuned_test_os_mm = f1_score(y_test, pred_test_os_mm)

In [111]:
cm_logreg_tuned_os_mm = confusion_matrix(y_test, pred_test_os_mm, labels=[1, 0])
cm_logreg_tuned_os_mm = pd.DataFrame(data=cm_logreg_tuned_os_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,183,77
Akt 0,421,1128


In [112]:
print(classification_report(y_test, pred_test_os_mm))

              precision    recall  f1-score   support

           0       0.94      0.73      0.82      1549
           1       0.30      0.70      0.42       260

    accuracy                           0.72      1809
   macro avg       0.62      0.72      0.62      1809
weighted avg       0.85      0.72      0.76      1809



In [113]:
tp_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 1'][0]
tn_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 0'][1]
fp_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 1'][1]
fn_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 0'][0]

#### Robust Scaling with Random Over Sampling

In [114]:
logreg_os_rb = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
x_train_os[columns_continuous] = rb_scale.fit_transform(x_train_os[columns_continuous])
logreg_os_rb.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:   44.1s
[Parallel(n_jobs=-1)]: Done 893 out of 900 | elapsed:   44.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   45.2s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [115]:
logreg_os_rb_tuned = logreg_os_rb.best_estimator_
pred_train_os_rb = logreg_os_rb_tuned.predict(x_train_os)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_os_rb = logreg_os_rb_tuned.predict(x_test)
logreg_os_rb_tuned

LogisticRegression(C=0.35036, class_weight='weight', fit_intercept=False)

In [116]:
logreg_acc_tuned_train_os_rb = accuracy_score(y_train_os, pred_train_os_rb)
logreg_acc_tuned_test_os_rb = accuracy_score(y_test, pred_test_os_rb)
logreg_recall_tuned_train_os_rb = recall_score(y_train_os, pred_train_os_rb)
logreg_recall_tuned_test_os_rb = recall_score(y_test, pred_test_os_rb)
logreg_prec_tuned_train_os_rb = precision_score(y_train_os, pred_train_os_rb)
logreg_prec_tuned_test_os_rb = precision_score(y_test, pred_test_os_rb)
logreg_f1_tuned_train_os_rb = f1_score(y_train_os, pred_train_os_rb)
logreg_f1_tuned_test_os_rb = f1_score(y_test, pred_test_os_rb)

In [117]:
cm_logreg_tuned_os_rb = confusion_matrix(y_test, pred_test_os_rb, labels=[1, 0])
cm_logreg_tuned_os_rb = pd.DataFrame(data=cm_logreg_tuned_os_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,191,69
Akt 0,453,1096


In [118]:
print(classification_report(y_test, pred_test_os_rb))

              precision    recall  f1-score   support

           0       0.94      0.71      0.81      1549
           1       0.30      0.73      0.42       260

    accuracy                           0.71      1809
   macro avg       0.62      0.72      0.62      1809
weighted avg       0.85      0.71      0.75      1809



In [119]:
tp_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 1'][0]
tn_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 0'][1]
fp_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 1'][1]
fn_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 0'][0]

#### Standard Scaling with Smote

In [120]:
logreg_sm_std = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
x_train_sm[columns_continuous] = std_scale.fit_transform(x_train_sm[columns_continuous])
logreg_sm_std.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 172 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 629 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   38.6s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [121]:
logreg_sm_std_tuned = logreg_sm_std.best_estimator_
pred_train_sm_std = logreg_sm_std_tuned.predict(x_train_sm)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_std = logreg_sm_std_tuned.predict(x_test)
logreg_sm_std_tuned

LogisticRegression(C=9.51952, class_weight='weight')

In [122]:
logreg_acc_tuned_train_sm_std = accuracy_score(y_train_sm, pred_train_sm_std)
logreg_acc_tuned_test_sm_std = accuracy_score(y_test, pred_test_sm_std)
logreg_recall_tuned_train_sm_std = recall_score(y_train_sm, pred_train_sm_std)
logreg_recall_tuned_test_sm_std = recall_score(y_test, pred_test_sm_std)
logreg_prec_tuned_train_sm_std = precision_score(y_train_sm, pred_train_sm_std)
logreg_prec_tuned_test_sm_std = precision_score(y_test, pred_test_sm_std)
logreg_f1_tuned_train_sm_std = f1_score(y_train_sm, pred_train_sm_std)
logreg_f1_tuned_test_sm_std = f1_score(y_test, pred_test_sm_std)

In [123]:
cm_logreg_tuned_sm_std = confusion_matrix(y_test, pred_test_sm_std, labels=[1, 0])
cm_logreg_tuned_sm_std = pd.DataFrame(data=cm_logreg_tuned_sm_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,51,209
Akt 0,33,1516


In [124]:
print(classification_report(y_test, pred_test_sm_std))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1549
           1       0.61      0.20      0.30       260

    accuracy                           0.87      1809
   macro avg       0.74      0.59      0.61      1809
weighted avg       0.84      0.87      0.84      1809



In [125]:
tp_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 1'][0]
tn_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 0'][1]
fp_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 1'][1]
fn_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 0'][0]

#### MinMax Scaling with Smote

In [126]:
logreg_sm_mm = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
x_train_sm[columns_continuous] = mm_scale.fit_transform(x_train_sm[columns_continuous])
logreg_sm_mm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 436 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   39.4s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [127]:
logreg_sm_mm_tuned = logreg_sm_mm.best_estimator_
pred_train_sm_mm = logreg_sm_mm_tuned.predict(x_train_sm)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_mm = logreg_sm_mm_tuned.predict(x_test)
logreg_sm_mm_tuned

LogisticRegression(C=5.8758799999999995, class_weight='weight')

In [128]:
logreg_acc_tuned_train_sm_mm = accuracy_score(y_train_sm, pred_train_sm_mm)
logreg_acc_tuned_test_sm_mm = accuracy_score(y_test, pred_test_sm_mm)
logreg_recall_tuned_train_sm_mm = recall_score(y_train_sm, pred_train_sm_mm)
logreg_recall_tuned_test_sm_mm = recall_score(y_test, pred_test_sm_mm)
logreg_prec_tuned_train_sm_mm = precision_score(y_train_sm, pred_train_sm_mm)
logreg_prec_tuned_test_sm_mm = precision_score(y_test, pred_test_sm_mm)
logreg_f1_tuned_train_sm_mm = f1_score(y_train_sm, pred_train_sm_mm)
logreg_f1_tuned_test_sm_mm = f1_score(y_test, pred_test_sm_mm)

In [129]:
cm_logreg_tuned_sm_mm = confusion_matrix(y_test, pred_test_sm_mm, labels=[1, 0])
cm_logreg_tuned_sm_mm = pd.DataFrame(data=cm_logreg_tuned_sm_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,45,215
Akt 0,25,1524


In [130]:
print(classification_report(y_test, pred_test_sm_mm))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1549
           1       0.64      0.17      0.27       260

    accuracy                           0.87      1809
   macro avg       0.76      0.58      0.60      1809
weighted avg       0.84      0.87      0.83      1809



In [131]:
tp_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 1'][0]
tn_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 0'][1]
fp_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 1'][1]
fn_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 0'][0]

#### Robust Scaling with Smote

In [132]:
logreg_sm_rb = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
x_train_sm[columns_continuous] = rb_scale.fit_transform(x_train_sm[columns_continuous])
logreg_sm_rb.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 516 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   41.3s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [133]:
logreg_sm_rb_tuned = logreg_sm_rb.best_estimator_
pred_train_sm_rb = logreg_sm_rb_tuned.predict(x_train_sm)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_rb = logreg_sm_rb_tuned.predict(x_test)
logreg_sm_rb_tuned

LogisticRegression(C=6.11612)

In [134]:
logreg_acc_tuned_train_sm_rb = accuracy_score(y_train_sm, pred_train_sm_rb)
logreg_acc_tuned_test_sm_rb = accuracy_score(y_test, pred_test_sm_rb)
logreg_recall_tuned_train_sm_rb = recall_score(y_train_sm, pred_train_sm_rb)
logreg_recall_tuned_test_sm_rb = recall_score(y_test, pred_test_sm_rb)
logreg_prec_tuned_train_sm_rb = precision_score(y_train_sm, pred_train_sm_rb)
logreg_prec_tuned_test_sm_rb = precision_score(y_test, pred_test_sm_rb)
logreg_f1_tuned_train_sm_rb = f1_score(y_train_sm, pred_train_sm_rb)
logreg_f1_tuned_test_sm_rb = f1_score(y_test, pred_test_sm_rb)

In [135]:
cm_logreg_tuned_sm_rb = confusion_matrix(y_test, pred_test_sm_rb, labels=[1, 0])
cm_logreg_tuned_sm_rb = pd.DataFrame(data=cm_logreg_tuned_sm_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,49,211
Akt 0,28,1521


In [136]:
print(classification_report(y_test, pred_test_sm_rb))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1549
           1       0.64      0.19      0.29       260

    accuracy                           0.87      1809
   macro avg       0.76      0.59      0.61      1809
weighted avg       0.84      0.87      0.84      1809



In [137]:
tp_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 1'][0]
tn_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 0'][1]
fp_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 1'][1]
fn_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 0'][0]

### Evaluation For Logistic Regression and Support Vector Classifier

In [138]:
distance_tuned = {
    "SVM_OS_Train": [svm_acc_tuned_train_os, svm_recall_tuned_train_os, svm_prec_tuned_train_os, svm_f1_tuned_train_os],
    "SVM_OS_Test" : [svm_acc_tuned_test_os, svm_recall_tuned_test_os, svm_prec_tuned_test_os, svm_f1_tuned_test_os],
    "SVM_SM_Train": [svm_acc_tuned_train_sm, svm_recall_tuned_train_sm, svm_prec_tuned_train_sm, svm_f1_tuned_train_sm],
    "SVM_SM_Test" : [svm_acc_tuned_test_sm, svm_recall_tuned_test_sm, svm_prec_tuned_test_sm, svm_f1_tuned_test_sm],
    "Logreg_OS_Train": [logreg_acc_tuned_train_os, logreg_recall_tuned_train_os, logreg_prec_tuned_train_os, logreg_f1_tuned_train_os],
    "Logreg_OS_Test" : [logreg_acc_tuned_test_os, logreg_recall_tuned_test_os, logreg_prec_tuned_test_os, logreg_f1_tuned_test_os],
    "Logreg_SM_Train": [logreg_acc_tuned_train_sm, logreg_recall_tuned_train_sm, logreg_prec_tuned_train_sm, logreg_f1_tuned_train_sm],
    "Logreg_SM_Test" : [logreg_acc_tuned_test_sm, logreg_recall_tuned_test_sm, logreg_prec_tuned_test_sm, logreg_f1_tuned_test_sm]
    }
tuned_matrix = pd.DataFrame(data = distance_tuned, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

dictance_tuned_os = {
    "SVM Standard_OS_Train": [svm_acc_tuned_train_os_std, svm_recall_tuned_train_os_std, svm_prec_tuned_train_os_std, svm_f1_tuned_train_os_std],
    "SVM Standard_OS_Test" : [svm_acc_tuned_test_os_std, svm_recall_tuned_test_os_std, svm_prec_tuned_test_os_std, svm_f1_tuned_test_os_std],
    "SVM MinMax_OS_Train": [svm_acc_tuned_train_os_mm, svm_recall_tuned_train_os_mm, svm_prec_tuned_train_os_mm, svm_f1_tuned_train_os_mm],
    "SVM MinMax_OS_Test" : [svm_acc_tuned_test_os_mm, svm_recall_tuned_test_os_mm, svm_prec_tuned_test_os_mm, svm_f1_tuned_test_os_mm],
    "SVM Robust_OS_Train": [svm_acc_tuned_train_os_rb, svm_recall_tuned_train_os_rb, svm_prec_tuned_train_os_rb, svm_f1_tuned_train_os_rb],
    "SVM Robust_OS_Test" : [svm_acc_tuned_test_os_rb, svm_recall_tuned_test_os_rb, svm_prec_tuned_test_os_rb, svm_f1_tuned_test_os_rb],
    "Logreg Standard_OS_Train": [logreg_acc_tuned_train_os_std, logreg_recall_tuned_train_os_std, logreg_prec_tuned_train_os_std, logreg_f1_tuned_train_os_std],
    "Logreg Standard_OS_Test" : [logreg_acc_tuned_test_os_std, logreg_recall_tuned_test_os_std, logreg_prec_tuned_test_os_std, logreg_f1_tuned_test_os_std],
    "Logreg MinMax_OS_Train": [logreg_acc_tuned_train_os_mm, logreg_recall_tuned_train_os_mm, logreg_prec_tuned_train_os_mm, logreg_f1_tuned_train_os_mm],
    "Logreg MinMax_OS_Test" : [logreg_acc_tuned_test_os_mm, logreg_recall_tuned_test_os_mm, logreg_prec_tuned_test_os_mm, logreg_f1_tuned_test_os_mm],
    "Logreg Robust_OS_Train": [logreg_acc_tuned_train_os_rb, logreg_recall_tuned_train_os_rb, logreg_prec_tuned_train_os_rb, logreg_f1_tuned_train_os_rb],
    "Logreg Robust_OS_Test" : [logreg_acc_tuned_test_os_rb, logreg_recall_tuned_test_os_rb, logreg_prec_tuned_test_os_rb, logreg_f1_tuned_test_os_rb]
    }
distance_tuned_os_matrix = pd.DataFrame(data = dictance_tuned_os, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

distance_tuned_sm = {
    "SVM Standard_SM_Train": [svm_acc_tuned_train_sm_std, svm_recall_tuned_train_sm_std, svm_prec_tuned_train_sm_std, svm_f1_tuned_train_sm_std],
    "SVM Standard_SM_Test" : [svm_acc_tuned_test_sm_std, svm_recall_tuned_test_sm_std, svm_prec_tuned_test_sm_std, svm_f1_tuned_test_sm_std],
    "SVM MinMax_SM_Train": [svm_acc_tuned_train_sm_mm, svm_recall_tuned_train_sm_mm, svm_prec_tuned_train_sm_mm, svm_f1_tuned_train_sm_mm],
    "SVM MinMax_SM_Test" : [svm_acc_tuned_test_sm_mm, svm_recall_tuned_test_sm_mm, svm_prec_tuned_test_sm_mm, svm_f1_tuned_test_sm_mm],
    "SVM Robust_SM_Train": [svm_acc_tuned_train_sm_rb, svm_recall_tuned_train_sm_rb, svm_prec_tuned_train_sm_rb, svm_f1_tuned_train_sm_rb],
    "SVM Robust_SM_Test" : [svm_acc_tuned_test_sm_rb, svm_recall_tuned_test_sm_rb, svm_prec_tuned_test_sm_rb, svm_f1_tuned_test_sm_rb],
    "Logreg Standard_SM_Train": [logreg_acc_tuned_train_sm_std, logreg_recall_tuned_train_sm_std, logreg_prec_tuned_train_sm_std, logreg_f1_tuned_train_sm_std],
    "Logreg Standard_SM_Test" : [logreg_acc_tuned_test_sm_std, logreg_recall_tuned_test_sm_std, logreg_prec_tuned_test_sm_std, logreg_f1_tuned_test_sm_std],
    "Logreg MinMax_SM_Train": [logreg_acc_tuned_train_sm_mm, logreg_recall_tuned_train_sm_mm, logreg_prec_tuned_train_sm_mm, logreg_f1_tuned_train_sm_mm],
    "Logreg MinMax_SM_Test" : [logreg_acc_tuned_test_sm_mm, logreg_recall_tuned_test_sm_mm, logreg_prec_tuned_test_sm_mm, logreg_f1_tuned_test_sm_mm],
    "Logreg Robust_SM_Train": [logreg_acc_tuned_train_sm_rb, logreg_recall_tuned_train_sm_rb, logreg_prec_tuned_train_sm_rb, logreg_f1_tuned_train_sm_rb],
    "Logreg Robust_SM_Test" : [logreg_acc_tuned_test_sm_rb, logreg_recall_tuned_test_sm_rb, logreg_prec_tuned_test_sm_rb, logreg_f1_tuned_test_sm_rb]
    }
distance_tuned_sm_matrix = pd.DataFrame(data = distance_tuned_sm, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

In [139]:
tuned_matrix

Unnamed: 0,SVM_OS_Train,SVM_OS_Test,SVM_SM_Train,SVM_SM_Test,Logreg_OS_Train,Logreg_OS_Test,Logreg_SM_Train,Logreg_SM_Test
Accuracy,1.0,0.978441,0.992735,0.983969,0.742896,0.711443,0.909106,0.868436
Recall,1.0,0.85,0.997255,1.0,0.78463,0.734615,0.835809,0.184615
Precision,1.0,1.0,0.98832,0.899654,0.724184,0.296584,0.979379,0.648649
F1 Score,1.0,0.918919,0.992768,0.947177,0.753196,0.422566,0.901916,0.287425


In [140]:
distance_tuned_os_matrix

Unnamed: 0,SVM Standard_OS_Train,SVM Standard_OS_Test,SVM MinMax_OS_Train,SVM MinMax_OS_Test,SVM Robust_OS_Train,SVM Robust_OS_Test,Logreg Standard_OS_Train,Logreg Standard_OS_Test,Logreg MinMax_OS_Train,Logreg MinMax_OS_Test,Logreg Robust_OS_Train,Logreg Robust_OS_Test
Accuracy,0.999919,0.970149,0.999919,0.887783,0.999919,0.966833,0.743058,0.709784,0.742896,0.72471,0.742896,0.711443
Recall,1.0,0.819231,1.0,0.219231,1.0,0.807692,0.78463,0.746154,0.785115,0.703846,0.78463,0.734615
Precision,0.999839,0.968182,0.999839,1.0,0.999839,0.954545,0.7244,0.29709,0.723984,0.30298,0.724184,0.296584
F1 Score,0.999919,0.8875,0.999919,0.359621,0.999919,0.875,0.753313,0.424973,0.753311,0.423611,0.753196,0.422566


In [141]:
distance_tuned_sm_matrix

Unnamed: 0,SVM Standard_SM_Train,SVM Standard_SM_Test,SVM MinMax_SM_Train,SVM MinMax_SM_Test,SVM Robust_SM_Train,SVM Robust_SM_Test,Logreg Standard_SM_Train,Logreg Standard_SM_Test,Logreg MinMax_SM_Train,Logreg MinMax_SM_Test,Logreg Robust_SM_Train,Logreg Robust_SM_Test
Accuracy,1.0,0.973466,0.998224,0.91874,1.0,0.964069,0.909832,0.866224,0.908783,0.86733,0.909025,0.867883
Recall,1.0,0.873077,0.996448,0.496154,1.0,0.865385,0.8371,0.196154,0.835163,0.173077,0.83597,0.188462
Precision,1.0,0.938017,1.0,0.889655,1.0,0.882353,0.979596,0.607143,0.979364,0.642857,0.979013,0.636364
F1 Score,1.0,0.904382,0.998221,0.637037,1.0,0.873786,0.90276,0.296512,0.901534,0.272727,0.901855,0.290801


## Decision Tree

In [142]:
decision_tree = DecisionTreeClassifier()

In [143]:
param_dt =  {"criterion" : ['gini', 'entropy'],
             "max_depth": [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50], 
             "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
             "min_samples_leaf":[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50], 
             "max_features" : ['auto', 'sqrt', 'log2']}

### Random Over Sampling

In [144]:
dt_os = RandomizedSearchCV(estimator = decision_tree, param_distributions = param_dt, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
dt_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   33.6s finished


RandomizedSearchCV(cv=3, estimator=DecisionTreeClassifier(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None, 2, 3, 4, 5, 6, 7, 8,
                                                      9, 10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19, 20, 21,
                                                      22, 23, 24, 25, 26, 27,
                                                      28, 29, 30, ...],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10, 11,
                                                             12, 13, 14, 15, 16,
                                                        

In [145]:
dt_tuned_os = dt_os.best_estimator_
pred_train_os = dt_tuned_os.predict(x_train_os)
pred_test_os = dt_tuned_os.predict(x_test)
dt_tuned_os

DecisionTreeClassifier(criterion='entropy', max_depth=45, max_features='sqrt',
                       min_samples_split=16)

In [146]:
dt_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
dt_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)
dt_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
dt_recall_tuned_test_os = recall_score(y_test, pred_test_os)
dt_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
dt_prec_tuned_test_os = precision_score(y_test, pred_test_os)
dt_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
dt_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [147]:
cm_dt_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_dt_tuned_os = pd.DataFrame(data=cm_dt_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_dt_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,160,100
Akt 0,89,1460


In [148]:
print(classification_report(y_test, pred_test_os))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      1549
           1       0.64      0.62      0.63       260

    accuracy                           0.90      1809
   macro avg       0.79      0.78      0.78      1809
weighted avg       0.89      0.90      0.89      1809



In [149]:
tp_dt_os = cm_dt_tuned_os['Pred 1'][0]
tn_dt_os = cm_dt_tuned_os['Pred 0'][1]
fp_dt_os = cm_dt_tuned_os['Pred 1'][1]
fn_dt_os = cm_dt_tuned_os['Pred 0'][0]

### Smote

In [150]:
dt_sm = RandomizedSearchCV(estimator = decision_tree, param_distributions = param_dt, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
dt_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done 893 out of 900 | elapsed:   26.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   26.5s finished


RandomizedSearchCV(cv=3, estimator=DecisionTreeClassifier(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None, 2, 3, 4, 5, 6, 7, 8,
                                                      9, 10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19, 20, 21,
                                                      22, 23, 24, 25, 26, 27,
                                                      28, 29, 30, ...],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10, 11,
                                                             12, 13, 14, 15, 16,
                                                        

In [151]:
dt_tuned_sm = dt_sm.best_estimator_
pred_train_sm = dt_tuned_sm.predict(x_train_sm)
pred_test_sm = dt_tuned_sm.predict(x_test)
dt_tuned_sm

DecisionTreeClassifier(criterion='entropy', max_depth=49, max_features='sqrt',
                       min_samples_leaf=2, min_samples_split=4)

In [152]:
dt_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
dt_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)
dt_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
dt_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)
dt_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
dt_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)
dt_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
dt_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [153]:
cm_dt_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_dt_tuned_sm = pd.DataFrame(data=cm_dt_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_dt_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,119,141
Akt 0,177,1372


In [154]:
print(classification_report(y_test, pred_test_sm))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      1549
           1       0.40      0.46      0.43       260

    accuracy                           0.82      1809
   macro avg       0.65      0.67      0.66      1809
weighted avg       0.83      0.82      0.83      1809



In [155]:
tp_dt_sm = cm_dt_tuned_sm['Pred 1'][0]
tn_dt_sm = cm_dt_tuned_sm['Pred 0'][1]
fp_dt_sm = cm_dt_tuned_sm['Pred 1'][1]
fn_dt_sm = cm_dt_tuned_sm['Pred 0'][0]

## Random Forest

In [156]:
random_forest = RandomForestClassifier()

In [157]:
param_rf =  {"n_estimators":np.arange(100, 2000),
             "criterion" : ['gini', 'entropy'],
             "max_depth": [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50], 
             "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
             "min_samples_leaf":[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50], 
             "max_features" : ['auto', 'sqrt', 'log2']}

### Random Over Sampling

In [158]:
rf_os = RandomizedSearchCV(estimator = random_forest, param_distributions = param_rf, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
rf_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 20.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 48.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 86.4min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 96.2min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None, 2, 3, 4, 5, 6, 7, 8,
                                                      9, 10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19, 20, 21,
                                                      22, 23, 24, 25, 26, 27,
                                                      28, 29, 30, ...],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10, 11,
                                                             12, 13, 14, 15, 16,
                                                        

In [159]:
rf_tuned_os = rf_os.best_estimator_
pred_train_os = rf_tuned_os.predict(x_train_os)
pred_test_os = rf_tuned_os.predict(x_test)
rf_tuned_os

RandomForestClassifier(max_depth=43, min_samples_split=6, n_estimators=933)

In [160]:
rf_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
rf_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)
rf_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
rf_recall_tuned_test_os = recall_score(y_test, pred_test_os)
rf_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
rf_prec_tuned_test_os = precision_score(y_test, pred_test_os)
rf_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
rf_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [161]:
cm_rf_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_rf_tuned_os = pd.DataFrame(data=cm_rf_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_rf_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,201,59
Akt 0,4,1545


In [162]:
print(classification_report(y_test, pred_test_os))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1549
           1       0.98      0.77      0.86       260

    accuracy                           0.97      1809
   macro avg       0.97      0.89      0.92      1809
weighted avg       0.97      0.97      0.96      1809



In [163]:
tp_rf_os = cm_rf_tuned_os['Pred 1'][0]
tn_rf_os = cm_rf_tuned_os['Pred 0'][1]
fp_rf_os = cm_rf_tuned_os['Pred 1'][1]
fn_rf_os = cm_rf_tuned_os['Pred 0'][0]

### Smote

In [164]:
rf_sm = RandomizedSearchCV(estimator = random_forest, param_distributions = param_rf, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'f1')
rf_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 20.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 47.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 89.9min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 103.9min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None, 2, 3, 4, 5, 6, 7, 8,
                                                      9, 10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19, 20, 21,
                                                      22, 23, 24, 25, 26, 27,
                                                      28, 29, 30, ...],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10, 11,
                                                             12, 13, 14, 15, 16,
                                                        

In [165]:
rf_tuned_sm = rf_sm.best_estimator_
pred_train_sm = rf_tuned_sm.predict(x_train_sm)
pred_test_sm = rf_tuned_sm.predict(x_test)
rf_tuned_sm

RandomForestClassifier(max_depth=19, max_features='log2', min_samples_split=7,
                       n_estimators=1893)

In [166]:
rf_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
rf_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)
rf_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
rf_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)
rf_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
rf_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)
rf_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
rf_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [167]:
cm_rf_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_rf_tuned_sm = pd.DataFrame(data=cm_rf_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_rf_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,87,173
Akt 0,16,1533


In [168]:
print(classification_report(y_test, pred_test_sm))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94      1549
           1       0.84      0.33      0.48       260

    accuracy                           0.90      1809
   macro avg       0.87      0.66      0.71      1809
weighted avg       0.89      0.90      0.88      1809



In [169]:
tp_rf_sm = cm_rf_tuned_sm['Pred 1'][0]
tn_rf_sm = cm_rf_tuned_sm['Pred 0'][1]
fp_rf_sm = cm_rf_tuned_sm['Pred 1'][1]
fn_rf_sm = cm_rf_tuned_sm['Pred 0'][0]

## Evaluation Matrix For Decision Tree and Random Forest

In [170]:
tuned_os = {
    "DT OS Train": [dt_acc_tuned_train_os, dt_recall_tuned_train_os, dt_prec_tuned_train_os, dt_f1_tuned_train_os],
    "DT OS Test" : [dt_acc_tuned_test_os, dt_recall_tuned_test_os, dt_prec_tuned_test_os, dt_f1_tuned_test_os],
    "RF OS Train": [rf_acc_tuned_train_os, rf_recall_tuned_train_os, rf_prec_tuned_train_os, rf_f1_tuned_train_os],
    "RF OS Test" : [rf_acc_tuned_test_os, rf_recall_tuned_test_os, rf_prec_tuned_test_os, rf_f1_tuned_test_os]}

tuned_os_matrix = pd.DataFrame(data = tuned_os, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

tuned_sm = {
    "DT SM Train": [dt_acc_tuned_train_sm, dt_recall_tuned_train_sm, dt_prec_tuned_train_sm, dt_f1_tuned_train_sm],
    "DT SM Test" : [dt_acc_tuned_test_sm, dt_recall_tuned_test_sm, dt_prec_tuned_test_sm, dt_f1_tuned_test_sm],
    "RF SM Train": [rf_acc_tuned_train_sm, rf_recall_tuned_train_sm, rf_prec_tuned_train_sm, rf_f1_tuned_train_sm],
    "RF SM Test" : [rf_acc_tuned_test_sm, rf_recall_tuned_test_sm, rf_prec_tuned_test_sm, rf_f1_tuned_test_sm]}

tuned_sm_matrix = pd.DataFrame(data = tuned_sm, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])


In [171]:
tuned_os_matrix

Unnamed: 0,DT OS Train,DT OS Test,RF OS Train,RF OS Test
Accuracy,0.988457,0.895522,1.0,0.965174
Recall,0.995157,0.615385,1.0,0.773077
Precision,0.981998,0.64257,1.0,0.980488
F1 Score,0.988533,0.628684,1.0,0.864516


In [172]:
tuned_sm_matrix

Unnamed: 0,DT SM Train,DT SM Test,RF SM Train,RF SM Test
Accuracy,0.964078,0.824212,0.994914,0.895522
Recall,0.956087,0.457692,0.989829,0.334615
Precision,0.971616,0.402027,1.0,0.84466
F1 Score,0.963789,0.428058,0.994888,0.479339


In [173]:
cm = {
    "True Positive" : [tp_svm_os, tp_svm_os_std, tp_svm_os_mm, tp_svm_os_rb, 
                       tp_svm_sm, tp_svm_sm_std, tp_svm_sm_mm, tp_svm_sm_rb,
                       tp_logreg_os, tp_logreg_os_std, tp_logreg_os_mm, tp_logreg_os_rb, 
                       tp_logreg_sm, tp_logreg_sm_std, tp_logreg_sm_mm, tp_logreg_sm_rb,
                       tp_dt_os, tp_dt_sm, tp_rf_os, tp_rf_sm],
    
    "True Negative" : [tn_svm_os, tn_svm_os_std, tn_svm_os_mm, tn_svm_os_rb, 
                       tn_svm_sm, tn_svm_sm_std, tn_svm_sm_mm, tn_svm_sm_rb,
                       tn_logreg_os, tn_logreg_os_std, tn_logreg_os_mm, tn_logreg_os_rb, 
                       tn_logreg_sm, tn_logreg_sm_std, tn_logreg_sm_mm, tn_logreg_sm_rb,
                       tn_dt_os, tn_dt_sm, tn_rf_os, tn_rf_sm],
    
    "False Positive": [fp_svm_os, fp_svm_os_std, fp_svm_os_mm, fp_svm_os_rb, 
                       fp_svm_sm, fp_svm_sm_std, fp_svm_sm_mm, fp_svm_sm_rb,
                       fp_logreg_os, fp_logreg_os_std, fp_logreg_os_mm, fp_logreg_os_rb, 
                       fp_logreg_sm, fp_logreg_sm_std, fp_logreg_sm_mm, fp_logreg_sm_rb,
                       fp_dt_os, fp_dt_sm, fp_rf_os, fp_rf_sm],
    
    "False Negative": [fn_svm_os, fn_svm_os_std, fn_svm_os_mm, fn_svm_os_rb, 
                       fn_svm_sm, fn_svm_sm_std, fn_svm_sm_mm, fn_svm_sm_rb,
                       fn_logreg_os, fn_logreg_os_std, fn_logreg_os_mm, fn_logreg_os_rb, 
                       fn_logreg_sm, fn_logreg_sm_std, fn_logreg_sm_mm, fn_logreg_sm_rb,
                       fn_dt_os, fn_dt_sm, fn_rf_os, fn_rf_sm]
}
    
cm_matrix = pd.DataFrame(data = cm, index = ['SVM OS', 'SVM OS Standard', 'SVM OS MinMax', 'SVM OS Robust',
                                             'SVM SM', 'SVM SM Standard', 'SVM SM MinMax', 'SVM SM Robust',
                                             'LogReg OS', 'Logreg OS Standard', 'Logreg OS MinMax', 'Logreg OS Robust',
                                             'LogReg SM', 'Logreg SM Standard', 'Logreg SM MinMax', 'Logreg SM Robust',
                                             'Decision Tree OS', 'Decision Tree SM', 'Random Forest OS', 'Random Forest SM'])
cm_matrix.sort_values('False Negative')

Unnamed: 0,True Positive,True Negative,False Positive,False Negative
SVM SM,260,1520,29,0
SVM SM Standard,227,1534,15,33
SVM SM Robust,225,1519,30,35
SVM OS,221,1549,0,39
SVM OS Standard,213,1542,7,47
SVM OS Robust,210,1539,10,50
Random Forest OS,201,1545,4,59
Logreg OS Standard,194,1090,459,66
LogReg OS,191,1096,453,69
Logreg OS Robust,191,1096,453,69


In [174]:
import joblib

In [178]:
import joblib
joblib.dump(rf_os, 'Support Vector Machine with Smote')

['Support Vector Machine with Smote']

In [179]:
x = rf_os.predict_proba([[3826, 0, 2, 1, 25000, 54, 2, 5200, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]])
x

array([[0.67817588, 0.32182412]])

In [177]:
y = rf_os.predict([[3826, 0, 2, 1, 25000, 54, 2, 5200, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]])
y

array([0], dtype=int64)