In [202]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.utils import resample
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

## Dataset

In [203]:
df_processed = pd.read_csv("data_ml.csv", index_col = 0)
df_processed.head()

Unnamed: 0_level_0,Customer Lifetime Value,Response,Coverage,Education,Gender,Income,Monthly Premium Auto,Number of Policies,Total Claim Amount,Vehicle Size,...,Sales Channel_Agent,Sales Channel_Branch,Sales Channel_Call Center,Sales Channel_Web,Vehicle Class_Four-Door Car,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BU79786,2763.519279,0,0,2,1,56274,69,1,384.811147,1,...,1,0,0,0,0,0,0,0,0,1
QZ44356,6979.535903,0,1,2,1,0,94,8,1131.464935,1,...,1,0,0,0,1,0,0,0,0,0
AI49188,12887.43165,0,2,2,1,48767,108,2,566.472247,1,...,1,0,0,0,0,0,0,0,0,1
WW63253,7645.861827,0,0,2,0,0,106,7,529.881344,1,...,0,0,1,0,0,0,0,1,0,0
HB64268,2813.692575,0,0,2,0,43836,73,1,138.130879,1,...,1,0,0,0,1,0,0,0,0,0


# Splitting Data

In [204]:
x = df_processed.drop(columns = ['Response'] , axis = 1)
y = df_processed['Response']

In [205]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, stratify= y, random_state = 42)

# Handling Imbalance Data

In [206]:
y_train.value_counts()

0    6194
1    1042
Name: Response, dtype: int64

#### Random Over Sampling

In [207]:
df_train = pd.concat([x_train, y_train], axis=1)
not_renewal = df_train[df_train['Response'] == 0]
renewal = df_train[df_train['Response'] == 1]

renewal_oversample = resample(renewal, replace=True, n_samples = len(not_renewal), random_state = 42)
df_OverSampled = pd.concat([not_renewal, renewal_oversample])
df_OverSampled['Response'].value_counts()

1    6194
0    6194
Name: Response, dtype: int64

In [208]:
x_train_os = df_OverSampled.drop(columns = ['Response'])
y_train_os = df_OverSampled['Response']

#### Smote

In [209]:
sm = SMOTE(random_state=42)
x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)

x = pd.DataFrame(data = x_train_sm, columns = x_train.columns)
y = pd.DataFrame(data = y_train_sm, columns = ['Response'])
df_smote = x.join(y)
# df_smote = pd.concat([x_train_sm, y_train_sm], axis = 1)
df_smote['Response'].value_counts()

1    6194
0    6194
Name: Response, dtype: int64

In [210]:
columns_continuous = ['Customer Lifetime Value', 'Income', 'Monthly Premium Auto', 'Number of Policies', 'Total Claim Amount']

In [211]:
std_scale = StandardScaler()
mm_scale = MinMaxScaler()
rb_scale = RobustScaler()

# Tuning Model

## SVM

In [233]:
svm = SVC(max_iter = 1000)

In [234]:
param_svm = {'C' : np.linspace(0.00001, 10, 1000),
             'kernel' : ['linear', 'rbf'],
             "gamma" : np.arange(0.0001, 100)}

#### Random Over Sampling without Scaling

In [235]:
svm_os = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
svm_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 12.4min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [241]:
svm_os_tuned = svm_os.best_estimator_
pred_train_os = svm_os_tuned.predict(x_train_os)
pred_test_os = svm_os_tuned.predict(x_test)
svm_os_tuned

SVC(C=5.2752799999999995, gamma=1.0001, max_iter=1000)

In [242]:
svm_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
svm_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)
svm_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
svm_recall_tuned_test_os = recall_score(y_test, pred_test_os)
svm_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
svm_prec_tuned_test_os = precision_score(y_test, pred_test_os)
svm_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
svm_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [243]:
cm_svm_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_svm_tuned_os = pd.DataFrame(data=cm_svm_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,213,47
Akt 0,7,1542


In [244]:
print(classification_report(y_test, pred_test_os))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1549
           1       0.97      0.82      0.89       260

    accuracy                           0.97      1809
   macro avg       0.97      0.91      0.94      1809
weighted avg       0.97      0.97      0.97      1809



In [245]:
tp_svm_os = cm_svm_tuned_os['Pred 1'][0]
tn_svm_os = cm_svm_tuned_os['Pred 0'][1]
fp_svm_os = cm_svm_tuned_os['Pred 1'][1]
fn_svm_os = cm_svm_tuned_os['Pred 0'][0]

#### Smote without Scaling

In [246]:
svm_sm = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
svm_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   39.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 12.6min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [247]:
svm_sm_tuned = svm_sm.best_estimator_
pred_train_sm = svm_sm_tuned.predict(x_train_sm)
pred_test_sm = svm_sm_tuned.predict(x_test)
svm_sm_tuned

SVC(C=0.13014, gamma=88.0001, max_iter=1000)

In [248]:
svm_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
svm_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)
svm_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
svm_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)
svm_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
svm_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)
svm_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
svm_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [249]:
cm_svm_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_svm_tuned_sm = pd.DataFrame(data=cm_svm_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,260,0
Akt 0,1549,0


In [250]:
print(classification_report(y_test, pred_test_sm))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1549
           1       0.14      1.00      0.25       260

    accuracy                           0.14      1809
   macro avg       0.07      0.50      0.13      1809
weighted avg       0.02      0.14      0.04      1809



In [251]:
tp_svm_sm = cm_svm_tuned_sm['Pred 1'][0]
tn_svm_sm = cm_svm_tuned_sm['Pred 0'][1]
fp_svm_sm = cm_svm_tuned_sm['Pred 1'][1]
fn_svm_sm = cm_svm_tuned_sm['Pred 0'][0]

#### Random Over Sampling with Standard Scaler

In [252]:
svm_os_std = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_os[columns_continuous] = std_scale.fit_transform(x_train_os[columns_continuous])
svm_os_std.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 12.3min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [253]:
svm_os_std_tuned = svm_os_std.best_estimator_
pred_train_os_std = svm_os_std_tuned.predict(x_train_os)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_os_std = svm_os_std_tuned.predict(x_test)
svm_os_std_tuned

SVC(C=7.41742, gamma=1.0001, max_iter=1000)

In [254]:
svm_acc_tuned_train_os_std = accuracy_score(y_train_os, pred_train_os_std)
svm_acc_tuned_test_os_std = accuracy_score(y_test, pred_test_os_std)
svm_recall_tuned_train_os_std = recall_score(y_train_os, pred_train_os_std)
svm_recall_tuned_test_os_std = recall_score(y_test, pred_test_os_std)
svm_prec_tuned_train_os_std = precision_score(y_train_os, pred_train_os_std)
svm_prec_tuned_test_os_std = precision_score(y_test, pred_test_os_std)
svm_f1_tuned_train_os_std = f1_score(y_train_os, pred_train_os_std)
svm_f1_tuned_test_os_std = f1_score(y_test, pred_test_os_std)

In [255]:
cm_svm_tuned_os_std = confusion_matrix(y_test, pred_test_os_std, labels=[1, 0])
cm_svm_tuned_os_std = pd.DataFrame(data=cm_svm_tuned_os_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,213,47
Akt 0,7,1542


In [256]:
print(classification_report(y_test, pred_test_os_std))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1549
           1       0.97      0.82      0.89       260

    accuracy                           0.97      1809
   macro avg       0.97      0.91      0.94      1809
weighted avg       0.97      0.97      0.97      1809



In [257]:
tp_svm_os_std = cm_svm_tuned_os_std['Pred 1'][0]
tn_svm_os_std = cm_svm_tuned_os_std['Pred 0'][1]
fp_svm_os_std = cm_svm_tuned_os_std['Pred 1'][1]
fn_svm_os_std = cm_svm_tuned_os_std['Pred 0'][0]

#### Random Over Sampling with MinMax Scaler

In [258]:
svm_os_mm = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_os[columns_continuous] = mm_scale.fit_transform(x_train_os[columns_continuous])
svm_os_mm.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   35.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 12.1min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [259]:
svm_os_mm_tuned = svm_os_mm.best_estimator_
pred_train_os_mm = svm_os_mm_tuned.predict(x_train_os)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_os_mm = svm_os_mm_tuned.predict(x_test)
svm_os_mm_tuned

SVC(C=9.04905, gamma=1.0001, max_iter=1000)

In [260]:
svm_acc_tuned_train_os_mm = accuracy_score(y_train_os, pred_train_os_mm)
svm_acc_tuned_test_os_mm = accuracy_score(y_test, pred_test_os_mm)
svm_recall_tuned_train_os_mm = recall_score(y_train_os, pred_train_os_mm)
svm_recall_tuned_test_os_mm = recall_score(y_test, pred_test_os_mm)
svm_prec_tuned_train_os_mm = precision_score(y_train_os, pred_train_os_mm)
svm_prec_tuned_test_os_mm = precision_score(y_test, pred_test_os_mm)
svm_f1_tuned_train_os_mm = f1_score(y_train_os, pred_train_os_mm)
svm_f1_tuned_test_os_mm = f1_score(y_test, pred_test_os_mm)

In [261]:
cm_svm_tuned_os_mm = confusion_matrix(y_test, pred_test_os_mm, labels=[1, 0])
cm_svm_tuned_os_mm = pd.DataFrame(data=cm_svm_tuned_os_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,188,72
Akt 0,35,1514


In [262]:
print(classification_report(y_test, pred_test_os_mm))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97      1549
           1       0.84      0.72      0.78       260

    accuracy                           0.94      1809
   macro avg       0.90      0.85      0.87      1809
weighted avg       0.94      0.94      0.94      1809



In [263]:
tp_svm_os_mm = cm_svm_tuned_os_mm['Pred 1'][0]
tn_svm_os_mm = cm_svm_tuned_os_mm['Pred 0'][1]
fp_svm_os_mm = cm_svm_tuned_os_mm['Pred 1'][1]
fn_svm_os_mm = cm_svm_tuned_os_mm['Pred 0'][0]

#### Random Over Sampling with Robust Scaler

In [264]:
svm_os_rb = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_os[columns_continuous] = rb_scale.fit_transform(x_train_os[columns_continuous])
svm_os_rb.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   39.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 14.2min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [265]:
svm_os_rb_tuned = svm_os_rb.best_estimator_
pred_train_os_rb = svm_os_rb_tuned.predict(x_train_os)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_os_rb = svm_os_rb_tuned.predict(x_test)
svm_os_rb_tuned

SVC(C=4.3743799999999995, gamma=1.0001, max_iter=1000)

In [266]:
svm_acc_tuned_train_os_rb = accuracy_score(y_train_os, pred_train_os_rb)
svm_acc_tuned_test_os_rb = accuracy_score(y_test, pred_test_os_rb)
svm_recall_tuned_train_os_rb = recall_score(y_train_os, pred_train_os_rb)
svm_recall_tuned_test_os_rb = recall_score(y_test, pred_test_os_rb)
svm_prec_tuned_train_os_rb = precision_score(y_train_os, pred_train_os_rb)
svm_prec_tuned_test_os_rb = precision_score(y_test, pred_test_os_rb)
svm_f1_tuned_train_os_rb = f1_score(y_train_os, pred_train_os_rb)
svm_f1_tuned_test_os_rb = f1_score(y_test, pred_test_os_rb)

In [267]:
cm_svm_tuned_os_rb = confusion_matrix(y_test, pred_test_os_rb, labels=[1, 0])
cm_svm_tuned_os_rb = pd.DataFrame(data=cm_svm_tuned_os_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,210,50
Akt 0,10,1539


In [268]:
print(classification_report(y_test, pred_test_os_rb))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1549
           1       0.95      0.81      0.88       260

    accuracy                           0.97      1809
   macro avg       0.96      0.90      0.93      1809
weighted avg       0.97      0.97      0.97      1809



In [269]:
tp_svm_os_rb = cm_svm_tuned_os_rb['Pred 1'][0]
tn_svm_os_rb = cm_svm_tuned_os_rb['Pred 0'][1]
fp_svm_os_rb = cm_svm_tuned_os_rb['Pred 1'][1]
fn_svm_os_rb = cm_svm_tuned_os_rb['Pred 0'][0]

#### Smote with Standard Scaler

In [270]:
svm_sm_std = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_sm[columns_continuous] = std_scale.fit_transform(x_train_sm[columns_continuous])
svm_sm_std.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 12.2min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [271]:
svm_sm_std_tuned = svm_sm_std.best_estimator_
pred_train_sm_std = svm_sm_std_tuned.predict(x_train_sm)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_std = svm_sm_std_tuned.predict(x_test)
svm_sm_std_tuned

SVC(C=0.72073, gamma=45.0001, max_iter=1000)

In [272]:
svm_acc_tuned_train_sm_std = accuracy_score(y_train_sm, pred_train_sm_std)
svm_acc_tuned_test_sm_std = accuracy_score(y_test, pred_test_sm_std)
svm_recall_tuned_train_sm_std = recall_score(y_train_sm, pred_train_sm_std)
svm_recall_tuned_test_sm_std = recall_score(y_test, pred_test_sm_std)
svm_prec_tuned_train_sm_std = precision_score(y_train_sm, pred_train_sm_std)
svm_prec_tuned_test_sm_std = precision_score(y_test, pred_test_sm_std)
svm_f1_tuned_train_sm_std = f1_score(y_train_sm, pred_train_sm_std)
svm_f1_tuned_test_sm_std = f1_score(y_test, pred_test_sm_std)

In [273]:
cm_svm_tuned_sm_std = confusion_matrix(y_test, pred_test_sm_std, labels=[1, 0])
cm_svm_tuned_sm_std = pd.DataFrame(data=cm_svm_tuned_sm_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,257,3
Akt 0,550,999


In [274]:
print(classification_report(y_test, pred_test_sm_std))

              precision    recall  f1-score   support

           0       1.00      0.64      0.78      1549
           1       0.32      0.99      0.48       260

    accuracy                           0.69      1809
   macro avg       0.66      0.82      0.63      1809
weighted avg       0.90      0.69      0.74      1809



In [275]:
tp_svm_sm_std = cm_svm_tuned_sm_std['Pred 1'][0]
tn_svm_sm_std = cm_svm_tuned_sm_std['Pred 0'][1]
fp_svm_sm_std = cm_svm_tuned_sm_std['Pred 1'][1]
fn_svm_sm_std = cm_svm_tuned_sm_std['Pred 0'][0]

#### Smote with MinMax Scaler

In [276]:
svm_sm_mm = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_sm[columns_continuous] = mm_scale.fit_transform(x_train_sm[columns_continuous])
svm_sm_mm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   38.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 12.5min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [277]:
svm_sm_mm_tuned = svm_sm_mm.best_estimator_
pred_train_sm_mm = svm_sm_mm_tuned.predict(x_train_sm)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_mm = svm_sm_mm_tuned.predict(x_test)
svm_sm_mm_tuned

SVC(C=1.3213300000000001, gamma=85.0001, max_iter=1000)

In [278]:
svm_acc_tuned_train_sm_mm = accuracy_score(y_train_sm, pred_train_sm_mm)
svm_acc_tuned_test_sm_mm = accuracy_score(y_test, pred_test_sm_mm)
svm_recall_tuned_train_sm_mm = recall_score(y_train_sm, pred_train_sm_mm)
svm_recall_tuned_test_sm_mm = recall_score(y_test, pred_test_sm_mm)
svm_prec_tuned_train_sm_mm = precision_score(y_train_sm, pred_train_sm_mm)
svm_prec_tuned_test_sm_mm = precision_score(y_test, pred_test_sm_mm)
svm_f1_tuned_train_sm_mm = f1_score(y_train_sm, pred_train_sm_mm)
svm_f1_tuned_test_sm_mm = f1_score(y_test, pred_test_sm_mm)

In [279]:
cm_svm_tuned_sm_mm = confusion_matrix(y_test, pred_test_sm_mm, labels=[1, 0])
cm_svm_tuned_sm_mm = pd.DataFrame(data=cm_svm_tuned_sm_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,260,0
Akt 0,1546,3


In [280]:
print(classification_report(y_test, pred_test_sm_mm))

              precision    recall  f1-score   support

           0       1.00      0.00      0.00      1549
           1       0.14      1.00      0.25       260

    accuracy                           0.15      1809
   macro avg       0.57      0.50      0.13      1809
weighted avg       0.88      0.15      0.04      1809



In [281]:
tp_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 1'][0]
tn_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 0'][1]
fp_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 1'][1]
fn_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 0'][0]

#### Smote with Robust Scaler

In [282]:
svm_sm_rb = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_sm[columns_continuous] = rb_scale.fit_transform(x_train_sm[columns_continuous])
svm_sm_rb.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   32.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 12.0min finished


RandomizedSearchCV(cv=3, estimator=SVC(max_iter=1000), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2.00...
       7.50001e+01, 7.60001e+01, 7.70001e+01, 7.80001e+01, 7.90001e+01,
       8.00001e+01, 8.10001e+01, 8.20001e+01, 8.30001e+01, 8.40001e+01,
       8.50001e+01, 8.60001e+01, 8.70001e+01, 8.80001e+01, 8.90001e+01,
       9.00001e+01, 9.10001e+01, 9.20001e+01, 9.30001e+01, 9.40001e+01,
       9.50001e+01, 9.60001e+01, 9.70001e+01, 9.80001e+01, 9.90001e+01]),
                                        'kernel': ['linear', 'rbf']},
                   scoring='recall', verbose=1)

In [283]:
svm_sm_rb_tuned = svm_sm_rb.best_estimator_
pred_train_sm_rb = svm_sm_rb_tuned.predict(x_train_sm)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_rb = svm_sm_rb_tuned.predict(x_test)
svm_sm_rb_tuned

SVC(C=0.22023, gamma=94.0001, max_iter=1000)

In [284]:
svm_acc_tuned_train_sm_rb = accuracy_score(y_train_sm, pred_train_sm_rb)
svm_acc_tuned_test_sm_rb = accuracy_score(y_test, pred_test_sm_rb)
svm_recall_tuned_train_sm_rb = recall_score(y_train_sm, pred_train_sm_rb)
svm_recall_tuned_test_sm_rb = recall_score(y_test, pred_test_sm_rb)
svm_prec_tuned_train_sm_rb = precision_score(y_train_sm, pred_train_sm_rb)
svm_prec_tuned_test_sm_rb = precision_score(y_test, pred_test_sm_rb)
svm_f1_tuned_train_sm_rb = f1_score(y_train_sm, pred_train_sm_rb)
svm_f1_tuned_test_sm_rb = f1_score(y_test, pred_test_sm_rb)

In [285]:
cm_svm_tuned_sm_rb = confusion_matrix(y_test, pred_test_sm_rb, labels=[1, 0])
cm_svm_tuned_sm_rb = pd.DataFrame(data=cm_svm_tuned_sm_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,255,5
Akt 0,666,883


In [286]:
print(classification_report(y_test, pred_test_sm_rb))

              precision    recall  f1-score   support

           0       0.99      0.57      0.72      1549
           1       0.28      0.98      0.43       260

    accuracy                           0.63      1809
   macro avg       0.64      0.78      0.58      1809
weighted avg       0.89      0.63      0.68      1809



In [287]:
tp_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 1'][0]
tn_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 0'][1]
fp_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 1'][1]
fn_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 0'][0]

## Logistic Regression

In [288]:
logreg = LogisticRegression()

In [289]:
param_logreg = {'C' : np.linspace(0.00001, 10, 1000),
                'penalty' : ['l1', 'l2', 'elasticnet', None],
                'class_weight': [None, 'weight'],
                'fit_intercept' : [True, False]}

#### Random Over Sampling without Scaling

In [290]:
logreg_os = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
logreg_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 828 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   36.8s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [291]:
logreg_os_tuned = logreg_os.best_estimator_
pred_train_os = logreg_os_tuned.predict(x_train_os)
pred_test_os = logreg_os_tuned.predict(x_test)
logreg_os_tuned

LogisticRegression(C=0.20021)

In [292]:
logreg_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
logreg_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)
logreg_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
logreg_recall_tuned_test_os = recall_score(y_test, pred_test_os)
logreg_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
logreg_prec_tuned_test_os = precision_score(y_test, pred_test_os)
logreg_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
logreg_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [293]:
cm_logreg_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_logreg_tuned_os = pd.DataFrame(data=cm_logreg_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,191,69
Akt 0,456,1093


In [294]:
print(classification_report(y_test, pred_test_os))

              precision    recall  f1-score   support

           0       0.94      0.71      0.81      1549
           1       0.30      0.73      0.42       260

    accuracy                           0.71      1809
   macro avg       0.62      0.72      0.61      1809
weighted avg       0.85      0.71      0.75      1809



In [295]:
tp_logreg_os = cm_logreg_tuned_os['Pred 1'][0]
tn_logreg_os = cm_logreg_tuned_os['Pred 0'][1]
fp_logreg_os = cm_logreg_tuned_os['Pred 1'][1]
fn_logreg_os = cm_logreg_tuned_os['Pred 0'][0]

#### Smote Sampling without Scaling

In [296]:
logreg_sm = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
logreg_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 755 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   27.8s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [297]:
logreg_sm_tuned = logreg_sm.best_estimator_
pred_train_sm = logreg_sm_tuned.predict(x_train_sm)
pred_test_sm = logreg_sm_tuned.predict(x_test)
logreg_sm_tuned

LogisticRegression(C=0.6406499999999999, class_weight='weight')

In [298]:
logreg_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
logreg_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)
logreg_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
logreg_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)
logreg_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
logreg_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)
logreg_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
logreg_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [299]:
cm_logreg_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_logreg_tuned_sm = pd.DataFrame(data=cm_logreg_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,49,211
Akt 0,34,1515


In [300]:
print(classification_report(y_test, pred_test_sm))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1549
           1       0.59      0.19      0.29       260

    accuracy                           0.86      1809
   macro avg       0.73      0.58      0.61      1809
weighted avg       0.84      0.86      0.83      1809



In [301]:
tp_logreg_sm = cm_logreg_tuned_sm['Pred 1'][0]
tn_logreg_sm = cm_logreg_tuned_sm['Pred 0'][1]
fp_logreg_sm = cm_logreg_tuned_sm['Pred 1'][1]
fn_logreg_sm = cm_logreg_tuned_sm['Pred 0'][0]

#### Standard Scaling with Random Over Sampling

In [302]:
logreg_os_std = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_os[columns_continuous] = std_scale.fit_transform(x_train_os[columns_continuous])
logreg_os_std.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   28.9s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [303]:
logreg_os_std_tuned = logreg_os_std.best_estimator_
pred_train_os_std = logreg_os_std_tuned.predict(x_train_os)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_os_std = logreg_os_std_tuned.predict(x_test)
logreg_os_std_tuned

LogisticRegression(C=0.13014, fit_intercept=False)

In [304]:
logreg_acc_tuned_train_os_std = accuracy_score(y_train_os, pred_train_os_std)
logreg_acc_tuned_test_os_std = accuracy_score(y_test, pred_test_os_std)
logreg_recall_tuned_train_os_std = recall_score(y_train_os, pred_train_os_std)
logreg_recall_tuned_test_os_std = recall_score(y_test, pred_test_os_std)
logreg_prec_tuned_train_os_std = precision_score(y_train_os, pred_train_os_std)
logreg_prec_tuned_test_os_std = precision_score(y_test, pred_test_os_std)
logreg_f1_tuned_train_os_std = f1_score(y_train_os, pred_train_os_std)
logreg_f1_tuned_test_os_std = f1_score(y_test, pred_test_os_std)

In [305]:
cm_logreg_tuned_os_std = confusion_matrix(y_test, pred_test_os_std, labels=[1, 0])
cm_logreg_tuned_os_std = pd.DataFrame(data=cm_logreg_tuned_os_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,194,66
Akt 0,459,1090


In [306]:
print(classification_report(y_test, pred_test_os_std))

              precision    recall  f1-score   support

           0       0.94      0.70      0.81      1549
           1       0.30      0.75      0.42       260

    accuracy                           0.71      1809
   macro avg       0.62      0.72      0.62      1809
weighted avg       0.85      0.71      0.75      1809



In [307]:
tp_logreg_os_std = cm_logreg_tuned_os_std['Pred 1'][0]
tn_logreg_os_std = cm_logreg_tuned_os_std['Pred 0'][1]
fp_logreg_os_std = cm_logreg_tuned_os_std['Pred 1'][1]
fn_logreg_os_std = cm_logreg_tuned_os_std['Pred 0'][0]

#### MinMax Scaling with Random Over Sampling

In [308]:
logreg_os_mm = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_os[columns_continuous] = mm_scale.fit_transform(x_train_os[columns_continuous])
logreg_os_mm.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   35.2s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [309]:
logreg_os_mm_tuned = logreg_os_mm.best_estimator_
pred_train_os_mm = logreg_os_mm_tuned.predict(x_train_os)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_os_mm = logreg_os_mm_tuned.predict(x_test)
logreg_os_mm_tuned

LogisticRegression(C=4.0740799999999995, fit_intercept=False)

In [310]:
logreg_acc_tuned_train_os_mm = accuracy_score(y_train_os, pred_train_os_mm)
logreg_acc_tuned_test_os_mm = accuracy_score(y_test, pred_test_os_mm)
logreg_recall_tuned_train_os_mm = recall_score(y_train_os, pred_train_os_mm)
logreg_recall_tuned_test_os_mm = recall_score(y_test, pred_test_os_mm)
logreg_prec_tuned_train_os_mm = precision_score(y_train_os, pred_train_os_mm)
logreg_prec_tuned_test_os_mm = precision_score(y_test, pred_test_os_mm)
logreg_f1_tuned_train_os_mm = f1_score(y_train_os, pred_train_os_mm)
logreg_f1_tuned_test_os_mm = f1_score(y_test, pred_test_os_mm)

In [311]:
cm_logreg_tuned_os_mm = confusion_matrix(y_test, pred_test_os_mm, labels=[1, 0])
cm_logreg_tuned_os_mm = pd.DataFrame(data=cm_logreg_tuned_os_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,183,77
Akt 0,423,1126


In [312]:
print(classification_report(y_test, pred_test_os_mm))

              precision    recall  f1-score   support

           0       0.94      0.73      0.82      1549
           1       0.30      0.70      0.42       260

    accuracy                           0.72      1809
   macro avg       0.62      0.72      0.62      1809
weighted avg       0.84      0.72      0.76      1809



In [313]:
tp_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 1'][0]
tn_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 0'][1]
fp_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 1'][1]
fn_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 0'][0]

#### Robust Scaling with Random Over Sampling

In [314]:
logreg_os_rb = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_os[columns_continuous] = rb_scale.fit_transform(x_train_os[columns_continuous])
logreg_os_rb.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 728 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   24.1s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [315]:
logreg_os_rb_tuned = logreg_os_rb.best_estimator_
pred_train_os_rb = logreg_os_rb_tuned.predict(x_train_os)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_os_rb = logreg_os_rb_tuned.predict(x_test)
logreg_os_rb_tuned

LogisticRegression(C=0.5505599999999999, fit_intercept=False)

In [316]:
logreg_acc_tuned_train_os_rb = accuracy_score(y_train_os, pred_train_os_rb)
logreg_acc_tuned_test_os_rb = accuracy_score(y_test, pred_test_os_rb)
logreg_recall_tuned_train_os_rb = recall_score(y_train_os, pred_train_os_rb)
logreg_recall_tuned_test_os_rb = recall_score(y_test, pred_test_os_rb)
logreg_prec_tuned_train_os_rb = precision_score(y_train_os, pred_train_os_rb)
logreg_prec_tuned_test_os_rb = precision_score(y_test, pred_test_os_rb)
logreg_f1_tuned_train_os_rb = f1_score(y_train_os, pred_train_os_rb)
logreg_f1_tuned_test_os_rb = f1_score(y_test, pred_test_os_rb)

In [317]:
cm_logreg_tuned_os_rb = confusion_matrix(y_test, pred_test_os_rb, labels=[1, 0])
cm_logreg_tuned_os_rb = pd.DataFrame(data=cm_logreg_tuned_os_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,190,70
Akt 0,452,1097


In [318]:
print(classification_report(y_test, pred_test_os_rb))

              precision    recall  f1-score   support

           0       0.94      0.71      0.81      1549
           1       0.30      0.73      0.42       260

    accuracy                           0.71      1809
   macro avg       0.62      0.72      0.61      1809
weighted avg       0.85      0.71      0.75      1809



In [319]:
tp_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 1'][0]
tn_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 0'][1]
fp_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 1'][1]
fn_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 0'][0]

#### Standard Scaling with Smote

In [320]:
logreg_sm_std = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_sm[columns_continuous] = std_scale.fit_transform(x_train_sm[columns_continuous])
logreg_sm_std.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 728 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done 893 out of 900 | elapsed:   25.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   25.4s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [321]:
logreg_sm_std_tuned = logreg_sm_std.best_estimator_
pred_train_sm_std = logreg_sm_std_tuned.predict(x_train_sm)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_std = logreg_sm_std_tuned.predict(x_test)
logreg_sm_std_tuned

LogisticRegression(C=5.3353399999999995, class_weight='weight')

In [322]:
logreg_acc_tuned_train_sm_std = accuracy_score(y_train_sm, pred_train_sm_std)
logreg_acc_tuned_test_sm_std = accuracy_score(y_test, pred_test_sm_std)
logreg_recall_tuned_train_sm_std = recall_score(y_train_sm, pred_train_sm_std)
logreg_recall_tuned_test_sm_std = recall_score(y_test, pred_test_sm_std)
logreg_prec_tuned_train_sm_std = precision_score(y_train_sm, pred_train_sm_std)
logreg_prec_tuned_test_sm_std = precision_score(y_test, pred_test_sm_std)
logreg_f1_tuned_train_sm_std = f1_score(y_train_sm, pred_train_sm_std)
logreg_f1_tuned_test_sm_std = f1_score(y_test, pred_test_sm_std)

In [323]:
cm_logreg_tuned_sm_std = confusion_matrix(y_test, pred_test_sm_std, labels=[1, 0])
cm_logreg_tuned_sm_std = pd.DataFrame(data=cm_logreg_tuned_sm_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,50,210
Akt 0,34,1515


In [324]:
print(classification_report(y_test, pred_test_sm_std))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1549
           1       0.60      0.19      0.29       260

    accuracy                           0.87      1809
   macro avg       0.74      0.59      0.61      1809
weighted avg       0.84      0.87      0.83      1809



In [325]:
tp_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 1'][0]
tn_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 0'][1]
fp_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 1'][1]
fn_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 0'][0]

#### MinMax Scaling with Smote

In [326]:
logreg_sm_mm = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_sm[columns_continuous] = mm_scale.fit_transform(x_train_sm[columns_continuous])
logreg_sm_mm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 728 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   26.7s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [327]:
logreg_sm_mm_tuned = logreg_sm_mm.best_estimator_
pred_train_sm_mm = logreg_sm_mm_tuned.predict(x_train_sm)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_mm = logreg_sm_mm_tuned.predict(x_test)
logreg_sm_mm_tuned

LogisticRegression(C=0.05006)

In [328]:
logreg_acc_tuned_train_sm_mm = accuracy_score(y_train_sm, pred_train_sm_mm)
logreg_acc_tuned_test_sm_mm = accuracy_score(y_test, pred_test_sm_mm)
logreg_recall_tuned_train_sm_mm = recall_score(y_train_sm, pred_train_sm_mm)
logreg_recall_tuned_test_sm_mm = recall_score(y_test, pred_test_sm_mm)
logreg_prec_tuned_train_sm_mm = precision_score(y_train_sm, pred_train_sm_mm)
logreg_prec_tuned_test_sm_mm = precision_score(y_test, pred_test_sm_mm)
logreg_f1_tuned_train_sm_mm = f1_score(y_train_sm, pred_train_sm_mm)
logreg_f1_tuned_test_sm_mm = f1_score(y_test, pred_test_sm_mm)

In [329]:
cm_logreg_tuned_sm_mm = confusion_matrix(y_test, pred_test_sm_mm, labels=[1, 0])
cm_logreg_tuned_sm_mm = pd.DataFrame(data=cm_logreg_tuned_sm_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,68,192
Akt 0,69,1480


In [330]:
print(classification_report(y_test, pred_test_sm_mm))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92      1549
           1       0.50      0.26      0.34       260

    accuracy                           0.86      1809
   macro avg       0.69      0.61      0.63      1809
weighted avg       0.83      0.86      0.84      1809



In [331]:
tp_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 1'][0]
tn_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 0'][1]
fp_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 1'][1]
fn_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 0'][0]

#### Robust Scaling with Smote

In [332]:
logreg_sm_rb = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
x_train_sm[columns_continuous] = rb_scale.fit_transform(x_train_sm[columns_continuous])
logreg_sm_rb.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 728 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   27.8s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=300, n_jobs=-1,
                   param_distributions={'C': array([1.00000e-05, 1.00200e-02, 2.00300e-02, 3.00400e-02, 4.00500e-02,
       5.00600e-02, 6.00700e-02, 7.00800e-02, 8.00900e-02, 9.01000e-02,
       1.00110e-01, 1.10120e-01, 1.20130e-01, 1.30140e-01, 1.40150e-01,
       1.50160e-01, 1.60170e-01, 1.70180e-01, 1.80190e-01, 1.90200e-01,
       2....
       9.80981e+00, 9.81982e+00, 9.82983e+00, 9.83984e+00, 9.84985e+00,
       9.85986e+00, 9.86987e+00, 9.87988e+00, 9.88989e+00, 9.89990e+00,
       9.90991e+00, 9.91992e+00, 9.92993e+00, 9.93994e+00, 9.94995e+00,
       9.95996e+00, 9.96997e+00, 9.97998e+00, 9.98999e+00, 1.00000e+01]),
                                        'class_weight': [None, 'weight'],
                                        'fit_intercept': [True, False],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None]},


In [333]:
logreg_sm_rb_tuned = logreg_sm_rb.best_estimator_
pred_train_sm_rb = logreg_sm_rb_tuned.predict(x_train_sm)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_rb = logreg_sm_rb_tuned.predict(x_test)
logreg_sm_rb_tuned

LogisticRegression(C=2.15216)

In [334]:
logreg_acc_tuned_train_sm_rb = accuracy_score(y_train_sm, pred_train_sm_rb)
logreg_acc_tuned_test_sm_rb = accuracy_score(y_test, pred_test_sm_rb)
logreg_recall_tuned_train_sm_rb = recall_score(y_train_sm, pred_train_sm_rb)
logreg_recall_tuned_test_sm_rb = recall_score(y_test, pred_test_sm_rb)
logreg_prec_tuned_train_sm_rb = precision_score(y_train_sm, pred_train_sm_rb)
logreg_prec_tuned_test_sm_rb = precision_score(y_test, pred_test_sm_rb)
logreg_f1_tuned_train_sm_rb = f1_score(y_train_sm, pred_train_sm_rb)
logreg_f1_tuned_test_sm_rb = f1_score(y_test, pred_test_sm_rb)

In [335]:
cm_logreg_tuned_sm_rb = confusion_matrix(y_test, pred_test_sm_rb, labels=[1, 0])
cm_logreg_tuned_sm_rb = pd.DataFrame(data=cm_logreg_tuned_sm_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,49,211
Akt 0,29,1520


In [336]:
print(classification_report(y_test, pred_test_sm_rb))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1549
           1       0.63      0.19      0.29       260

    accuracy                           0.87      1809
   macro avg       0.75      0.58      0.61      1809
weighted avg       0.84      0.87      0.84      1809



In [337]:
tp_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 1'][0]
tn_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 0'][1]
fp_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 1'][1]
fn_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 0'][0]

### Evaluation For Logistic Regression and Support Vector Classifier

In [338]:
distance_tuned = {
    "SVM_OS_Train": [svm_acc_tuned_train_os, svm_recall_tuned_train_os, svm_prec_tuned_train_os, svm_f1_tuned_train_os],
    "SVM_OS_Test" : [svm_acc_tuned_test_os, svm_recall_tuned_test_os, svm_prec_tuned_test_os, svm_f1_tuned_test_os],
    "SVM_SM_Train": [svm_acc_tuned_train_sm, svm_recall_tuned_train_sm, svm_prec_tuned_train_sm, svm_f1_tuned_train_sm],
    "SVM_SM_Test" : [svm_acc_tuned_test_sm, svm_recall_tuned_test_sm, svm_prec_tuned_test_sm, svm_f1_tuned_test_sm],
    "Logreg_OS_Train": [logreg_acc_tuned_train_os, logreg_recall_tuned_train_os, logreg_prec_tuned_train_os, logreg_f1_tuned_train_os],
    "Logreg_OS_Test" : [logreg_acc_tuned_test_os, logreg_recall_tuned_test_os, logreg_prec_tuned_test_os, logreg_f1_tuned_test_os],
    "Logreg_SM_Train": [logreg_acc_tuned_train_sm, logreg_recall_tuned_train_sm, logreg_prec_tuned_train_sm, logreg_f1_tuned_train_sm],
    "Logreg_SM_Test" : [logreg_acc_tuned_test_sm, logreg_recall_tuned_test_sm, logreg_prec_tuned_test_sm, logreg_f1_tuned_test_sm]
    }
tuned_matrix = pd.DataFrame(data = distance_tuned, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

dictance_tuned_os = {
    "SVM Standard_OS_Train": [svm_acc_tuned_train_os_std, svm_recall_tuned_train_os_std, svm_prec_tuned_train_os_std, svm_f1_tuned_train_os_std],
    "SVM Standard_OS_Test" : [svm_acc_tuned_test_os_std, svm_recall_tuned_test_os_std, svm_prec_tuned_test_os_std, svm_f1_tuned_test_os_std],
    "SVM MinMax_OS_Train": [svm_acc_tuned_train_os_mm, svm_recall_tuned_train_os_mm, svm_prec_tuned_train_os_mm, svm_f1_tuned_train_os_mm],
    "SVM MinMax_OS_Test" : [svm_acc_tuned_test_os_mm, svm_recall_tuned_test_os_mm, svm_prec_tuned_test_os_mm, svm_f1_tuned_test_os_mm],
    "SVM Robust_OS_Train": [svm_acc_tuned_train_os_rb, svm_recall_tuned_train_os_rb, svm_prec_tuned_train_os_rb, svm_f1_tuned_train_os_rb],
    "SVM Robust_OS_Test" : [svm_acc_tuned_test_os_rb, svm_recall_tuned_test_os_rb, svm_prec_tuned_test_os_rb, svm_f1_tuned_test_os_rb],
    "Logreg Standard_OS_Train": [logreg_acc_tuned_train_os_std, logreg_recall_tuned_train_os_std, logreg_prec_tuned_train_os_std, logreg_f1_tuned_train_os_std],
    "Logreg Standard_OS_Test" : [logreg_acc_tuned_test_os_std, logreg_recall_tuned_test_os_std, logreg_prec_tuned_test_os_std, logreg_f1_tuned_test_os_std],
    "Logreg MinMax_OS_Train": [logreg_acc_tuned_train_os_mm, logreg_recall_tuned_train_os_mm, logreg_prec_tuned_train_os_mm, logreg_f1_tuned_train_os_mm],
    "Logreg MinMax_OS_Test" : [logreg_acc_tuned_test_os_mm, logreg_recall_tuned_test_os_mm, logreg_prec_tuned_test_os_mm, logreg_f1_tuned_test_os_mm],
    "Logreg Robust_OS_Train": [logreg_acc_tuned_train_os_rb, logreg_recall_tuned_train_os_rb, logreg_prec_tuned_train_os_rb, logreg_f1_tuned_train_os_rb],
    "Logreg Robust_OS_Test" : [logreg_acc_tuned_test_os_rb, logreg_recall_tuned_test_os_rb, logreg_prec_tuned_test_os_rb, logreg_f1_tuned_test_os_rb]
    }
distance_tuned_os_matrix = pd.DataFrame(data = dictance_tuned_os, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

distance_tuned_sm = {
    "SVM Standard_SM_Train": [svm_acc_tuned_train_sm_std, svm_recall_tuned_train_sm_std, svm_prec_tuned_train_sm_std, svm_f1_tuned_train_sm_std],
    "SVM Standard_SM_Test" : [svm_acc_tuned_test_sm_std, svm_recall_tuned_test_sm_std, svm_prec_tuned_test_sm_std, svm_f1_tuned_test_sm_std],
    "SVM MinMax_SM_Train": [svm_acc_tuned_train_sm_mm, svm_recall_tuned_train_sm_mm, svm_prec_tuned_train_sm_mm, svm_f1_tuned_train_sm_mm],
    "SVM MinMax_SM_Test" : [svm_acc_tuned_test_sm_mm, svm_recall_tuned_test_sm_mm, svm_prec_tuned_test_sm_mm, svm_f1_tuned_test_sm_mm],
    "SVM Robust_SM_Train": [svm_acc_tuned_train_sm_rb, svm_recall_tuned_train_sm_rb, svm_prec_tuned_train_sm_rb, svm_f1_tuned_train_sm_rb],
    "SVM Robust_SM_Test" : [svm_acc_tuned_test_sm_rb, svm_recall_tuned_test_sm_rb, svm_prec_tuned_test_sm_rb, svm_f1_tuned_test_sm_rb],
    "Logreg Standard_SM_Train": [logreg_acc_tuned_train_sm_std, logreg_recall_tuned_train_sm_std, logreg_prec_tuned_train_sm_std, logreg_f1_tuned_train_sm_std],
    "Logreg Standard_SM_Test" : [logreg_acc_tuned_test_sm_std, logreg_recall_tuned_test_sm_std, logreg_prec_tuned_test_sm_std, logreg_f1_tuned_test_sm_std],
    "Logreg MinMax_SM_Train": [logreg_acc_tuned_train_sm_mm, logreg_recall_tuned_train_sm_mm, logreg_prec_tuned_train_sm_mm, logreg_f1_tuned_train_sm_mm],
    "Logreg MinMax_SM_Test" : [logreg_acc_tuned_test_sm_mm, logreg_recall_tuned_test_sm_mm, logreg_prec_tuned_test_sm_mm, logreg_f1_tuned_test_sm_mm],
    "Logreg Robust_SM_Train": [logreg_acc_tuned_train_sm_rb, logreg_recall_tuned_train_sm_rb, logreg_prec_tuned_train_sm_rb, logreg_f1_tuned_train_sm_rb],
    "Logreg Robust_SM_Test" : [logreg_acc_tuned_test_sm_rb, logreg_recall_tuned_test_sm_rb, logreg_prec_tuned_test_sm_rb, logreg_f1_tuned_test_sm_rb]
    }
distance_tuned_sm_matrix = pd.DataFrame(data = distance_tuned_sm, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

In [339]:
tuned_matrix

Unnamed: 0,SVM_OS_Train,SVM_OS_Test,SVM_SM_Train,SVM_SM_Test,Logreg_OS_Train,Logreg_OS_Test,Logreg_SM_Train,Logreg_SM_Test
Accuracy,0.999919,0.970149,0.580723,0.143726,0.741201,0.709784,0.908056,0.864566
Recall,1.0,0.819231,1.0,1.0,0.780917,0.734615,0.835325,0.188462
Precision,0.999839,0.968182,0.543906,0.143726,0.723452,0.295209,0.977517,0.590361
F1 Score,0.999919,0.8875,0.704584,0.251329,0.751087,0.421169,0.900844,0.285714


In [340]:
distance_tuned_os_matrix

Unnamed: 0,SVM Standard_OS_Train,SVM Standard_OS_Test,SVM MinMax_OS_Train,SVM MinMax_OS_Test,SVM Robust_OS_Train,SVM Robust_OS_Test,Logreg Standard_OS_Train,Logreg Standard_OS_Test,Logreg MinMax_OS_Train,Logreg MinMax_OS_Test,Logreg Robust_OS_Train,Logreg Robust_OS_Test
Accuracy,0.999919,0.970149,0.999516,0.940851,0.999919,0.966833,0.742816,0.709784,0.742816,0.723604,0.742896,0.711443
Recall,1.0,0.819231,1.0,0.723077,1.0,0.807692,0.78463,0.746154,0.785115,0.703846,0.78463,0.730769
Precision,0.999839,0.968182,0.999032,0.843049,0.999839,0.954545,0.724076,0.29709,0.723876,0.30198,0.724184,0.29595
F1 Score,0.999919,0.8875,0.999516,0.778468,0.999919,0.875,0.753138,0.424973,0.753253,0.422633,0.753196,0.421286


In [341]:
distance_tuned_sm_matrix

Unnamed: 0,SVM Standard_SM_Train,SVM Standard_SM_Test,SVM MinMax_SM_Train,SVM MinMax_SM_Test,SVM Robust_SM_Train,SVM Robust_SM_Test,Logreg Standard_SM_Train,Logreg Standard_SM_Test,Logreg MinMax_SM_Train,Logreg MinMax_SM_Test,Logreg Robust_SM_Train,Logreg Robust_SM_Test
Accuracy,0.85268,0.694306,0.582661,0.145384,0.824104,0.629077,0.909267,0.865119,0.901275,0.855721,0.908621,0.86733
Recall,0.999354,0.988462,1.0,1.0,0.999354,0.980769,0.836132,0.192308,0.839845,0.261538,0.835647,0.188462
Precision,0.772688,0.318463,0.545055,0.143965,0.739988,0.276873,0.979387,0.595238,0.957482,0.49635,0.97845,0.628205
F1 Score,0.871524,0.481724,0.705547,0.251694,0.850333,0.431837,0.902108,0.290698,0.894814,0.342569,0.901428,0.289941


## Decision Tree

In [342]:
decision_tree = DecisionTreeClassifier()

In [343]:
param_dt =  {"criterion" : ['gini', 'entropy'],
             "max_depth": [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50], 
             "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
             "min_samples_leaf":[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50], 
             "max_features" : ['auto', 'sqrt', 'log2']}

### Random Over Sampling

In [344]:
dt_os = RandomizedSearchCV(estimator = decision_tree, param_distributions = param_dt, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
dt_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 852 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   20.9s finished


RandomizedSearchCV(cv=3, estimator=DecisionTreeClassifier(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None, 2, 3, 4, 5, 6, 7, 8,
                                                      9, 10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19, 20, 21,
                                                      22, 23, 24, 25, 26, 27,
                                                      28, 29, 30, ...],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10, 11,
                                                             12, 13, 14, 15, 16,
                                                        

In [345]:
dt_tuned_os = dt_os.best_estimator_
pred_train_os = dt_tuned_os.predict(x_train_os)
pred_test_os = dt_tuned_os.predict(x_test)
dt_tuned_os

DecisionTreeClassifier(criterion='entropy', max_depth=31, max_features='auto',
                       min_samples_leaf=3, min_samples_split=6)

In [346]:
dt_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
dt_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)
dt_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
dt_recall_tuned_test_os = recall_score(y_test, pred_test_os)
dt_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
dt_prec_tuned_test_os = precision_score(y_test, pred_test_os)
dt_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
dt_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [347]:
cm_dt_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_dt_tuned_os = pd.DataFrame(data=cm_dt_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_dt_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,172,88
Akt 0,137,1412


In [348]:
print(classification_report(y_test, pred_test_os))

              precision    recall  f1-score   support

           0       0.94      0.91      0.93      1549
           1       0.56      0.66      0.60       260

    accuracy                           0.88      1809
   macro avg       0.75      0.79      0.77      1809
weighted avg       0.89      0.88      0.88      1809



In [349]:
tp_dt_os = cm_dt_tuned_os['Pred 1'][0]
tn_dt_os = cm_dt_tuned_os['Pred 0'][1]
fp_dt_os = cm_dt_tuned_os['Pred 1'][1]
fn_dt_os = cm_dt_tuned_os['Pred 0'][0]

### Smote

In [350]:
dt_sm = RandomizedSearchCV(estimator = decision_tree, param_distributions = param_dt, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
dt_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   18.4s finished


RandomizedSearchCV(cv=3, estimator=DecisionTreeClassifier(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None, 2, 3, 4, 5, 6, 7, 8,
                                                      9, 10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19, 20, 21,
                                                      22, 23, 24, 25, 26, 27,
                                                      28, 29, 30, ...],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10, 11,
                                                             12, 13, 14, 15, 16,
                                                        

In [351]:
dt_tuned_sm = dt_sm.best_estimator_
pred_train_sm = dt_tuned_sm.predict(x_train_sm)
pred_test_sm = dt_tuned_sm.predict(x_test)
dt_tuned_sm

DecisionTreeClassifier(criterion='entropy', max_depth=2, max_features='sqrt',
                       min_samples_leaf=25, min_samples_split=19)

In [352]:
dt_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
dt_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)
dt_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
dt_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)
dt_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
dt_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)
dt_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
dt_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [353]:
cm_dt_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_dt_tuned_sm = pd.DataFrame(data=cm_dt_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_dt_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,118,142
Akt 0,375,1174


In [354]:
print(classification_report(y_test, pred_test_sm))

              precision    recall  f1-score   support

           0       0.89      0.76      0.82      1549
           1       0.24      0.45      0.31       260

    accuracy                           0.71      1809
   macro avg       0.57      0.61      0.57      1809
weighted avg       0.80      0.71      0.75      1809



In [355]:
tp_dt_sm = cm_dt_tuned_sm['Pred 1'][0]
tn_dt_sm = cm_dt_tuned_sm['Pred 0'][1]
fp_dt_sm = cm_dt_tuned_sm['Pred 1'][1]
fn_dt_sm = cm_dt_tuned_sm['Pred 0'][0]

## Random Forest

In [395]:
random_forest = RandomForestClassifier()

In [396]:
param_rf =  {"n_estimators":np.arange(100, 2000),
             "criterion" : ['gini', 'entropy'],
             "max_depth": [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50], 
             "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
             "min_samples_leaf":[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50], 
             "max_features" : ['auto', 'sqrt', 'log2']}

### Random Over Sampling

In [397]:
rf_os = RandomizedSearchCV(estimator = random_forest, param_distributions = param_rf, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
rf_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 15.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 33.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 60.5min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 68.0min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None, 2, 3, 4, 5, 6, 7, 8,
                                                      9, 10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19, 20, 21,
                                                      22, 23, 24, 25, 26, 27,
                                                      28, 29, 30, ...],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10, 11,
                                                             12, 13, 14, 15, 16,
                                                        

In [398]:
rf_tuned_os = rf_os.best_estimator_
pred_train_os = rf_tuned_os.predict(x_train_os)
pred_test_os = rf_tuned_os.predict(x_test)
rf_tuned_os

RandomForestClassifier(criterion='entropy', max_features='log2',
                       min_samples_leaf=4, n_estimators=256)

In [399]:
rf_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
rf_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)
rf_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
rf_recall_tuned_test_os = recall_score(y_test, pred_test_os)
rf_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
rf_prec_tuned_test_os = precision_score(y_test, pred_test_os)
rf_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
rf_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [400]:
cm_rf_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_rf_tuned_os = pd.DataFrame(data=cm_rf_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_rf_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,224,36
Akt 0,22,1527


In [401]:
print(classification_report(y_test, pred_test_os))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1549
           1       0.91      0.86      0.89       260

    accuracy                           0.97      1809
   macro avg       0.94      0.92      0.93      1809
weighted avg       0.97      0.97      0.97      1809



In [402]:
tp_rf_os = cm_rf_tuned_os['Pred 1'][0]
tn_rf_os = cm_rf_tuned_os['Pred 0'][1]
fp_rf_os = cm_rf_tuned_os['Pred 1'][1]
fn_rf_os = cm_rf_tuned_os['Pred 0'][0]

### Smote

In [403]:
rf_sm = RandomizedSearchCV(estimator = random_forest, param_distributions = param_rf, cv = 3, n_jobs = -1 , n_iter=300, verbose = 1, scoring = 'recall')
rf_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 36.7min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 67.5min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 76.9min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None, 2, 3, 4, 5, 6, 7, 8,
                                                      9, 10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19, 20, 21,
                                                      22, 23, 24, 25, 26, 27,
                                                      28, 29, 30, ...],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10, 11,
                                                             12, 13, 14, 15, 16,
                                                        

In [404]:
rf_tuned_sm = rf_sm.best_estimator_
pred_train_sm = rf_tuned_sm.predict(x_train_sm)
pred_test_sm = rf_tuned_sm.predict(x_test)
rf_tuned_sm

RandomForestClassifier(criterion='entropy', max_depth=25, max_features='log2',
                       min_samples_leaf=3, min_samples_split=14,
                       n_estimators=1167)

In [405]:
rf_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
rf_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)
rf_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
rf_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)
rf_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
rf_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)
rf_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
rf_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [406]:
cm_rf_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_rf_tuned_sm = pd.DataFrame(data=cm_rf_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_rf_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,79,181
Akt 0,24,1525


In [407]:
print(classification_report(y_test, pred_test_sm))

              precision    recall  f1-score   support

           0       0.89      0.98      0.94      1549
           1       0.77      0.30      0.44       260

    accuracy                           0.89      1809
   macro avg       0.83      0.64      0.69      1809
weighted avg       0.88      0.89      0.86      1809



In [408]:
tp_rf_sm = cm_rf_tuned_sm['Pred 1'][0]
tn_rf_sm = cm_rf_tuned_sm['Pred 0'][1]
fp_rf_sm = cm_rf_tuned_sm['Pred 1'][1]
fn_rf_sm = cm_rf_tuned_sm['Pred 0'][0]

## Evaluation Matrix For Decision Tree and Random Forest

In [437]:
tuned_os = {
    "DT OS Train": [dt_acc_tuned_train_os, dt_recall_tuned_train_os, dt_prec_tuned_train_os, dt_f1_tuned_train_os],
    "DT OS Test" : [dt_acc_tuned_test_os, dt_recall_tuned_test_os, dt_prec_tuned_test_os, dt_f1_tuned_test_os],
    "RF OS Train": [rf_acc_tuned_train_os, rf_recall_tuned_train_os, rf_prec_tuned_train_os, rf_f1_tuned_train_os],
    "RF OS Test" : [rf_acc_tuned_test_os, rf_recall_tuned_test_os, rf_prec_tuned_test_os, rf_f1_tuned_test_os]}

tuned_os_matrix = pd.DataFrame(data = tuned_os, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

tuned_sm = {
    "DT SM Train": [dt_acc_tuned_train_sm, dt_recall_tuned_train_sm, dt_prec_tuned_train_sm, dt_f1_tuned_train_sm],
    "DT SM Test" : [dt_acc_tuned_test_sm, dt_recall_tuned_test_sm, dt_prec_tuned_test_sm, dt_f1_tuned_test_sm],
    "RF SM Train": [rf_acc_tuned_train_sm, rf_recall_tuned_train_sm, rf_prec_tuned_train_sm, rf_f1_tuned_train_sm],
    "RF SM Test" : [rf_acc_tuned_test_sm, rf_recall_tuned_test_sm, rf_prec_tuned_test_sm, rf_f1_tuned_test_sm]}

tuned_sm_matrix = pd.DataFrame(data = tuned_sm, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])


In [438]:
tuned_os_matrix

Unnamed: 0,DT OS Train,DT OS Test,RF OS Train,RF OS Test
Accuracy,0.976832,0.875622,0.99669,0.967938
Recall,0.998386,0.661538,0.999677,0.861538
Precision,0.957127,0.556634,0.993741,0.910569
F1 Score,0.977321,0.604569,0.9967,0.885375


In [439]:
tuned_sm_matrix

Unnamed: 0,DT SM Train,DT SM Test,RF SM Train,RF SM Test
Accuracy,0.602922,0.714207,0.979173,0.886678
Recall,0.433484,0.453846,0.96432,0.303846
Precision,0.655678,0.239351,0.993844,0.76699
F1 Score,0.521917,0.313413,0.978859,0.435262


In [441]:
cm = {
    "True Positive" : [tp_svm_os, tp_svm_os_std, tp_svm_os_mm, tp_svm_os_rb, 
                       tp_svm_sm, tp_svm_sm_std, tp_svm_sm_mm, tp_svm_sm_rb,
                       tp_logreg_os, tp_logreg_os_std, tp_logreg_os_mm, tp_logreg_os_rb, 
                       tp_logreg_sm, tp_logreg_sm_std, tp_logreg_sm_mm, tp_logreg_sm_rb,
                       tp_dt_os, tp_dt_sm, tp_rf_os, tp_rf_sm],
    
    "True Negative" : [tn_svm_os, tn_svm_os_std, tn_svm_os_mm, tn_svm_os_rb, 
                       tn_svm_sm, tn_svm_sm_std, tn_svm_sm_mm, tn_svm_sm_rb,
                       tn_logreg_os, tn_logreg_os_std, tn_logreg_os_mm, tn_logreg_os_rb, 
                       tn_logreg_sm, tn_logreg_sm_std, tn_logreg_sm_mm, tn_logreg_sm_rb,
                       tn_dt_os, tn_dt_sm, tn_rf_os, tn_rf_sm],
    
    "False Positive": [fp_svm_os, fp_svm_os_std, fp_svm_os_mm, fp_svm_os_rb, 
                       fp_svm_sm, fp_svm_sm_std, fp_svm_sm_mm, fp_svm_sm_rb,
                       fp_logreg_os, fp_logreg_os_std, fp_logreg_os_mm, fp_logreg_os_rb, 
                       fp_logreg_sm, fp_logreg_sm_std, fp_logreg_sm_mm, fp_logreg_sm_rb,
                       fp_dt_os, fp_dt_sm, fp_rf_os, fp_rf_sm],
    
    "False Negative": [fn_svm_os, fn_svm_os_std, fn_svm_os_mm, fn_svm_os_rb, 
                       fn_svm_sm, fn_svm_sm_std, fn_svm_sm_mm, fn_svm_sm_rb,
                       fn_logreg_os, fn_logreg_os_std, fn_logreg_os_mm, fn_logreg_os_rb, 
                       fn_logreg_sm, fn_logreg_sm_std, fn_logreg_sm_mm, fn_logreg_sm_rb,
                       fn_dt_os, fn_dt_sm, fn_rf_os, fn_rf_sm]
}
    
cm_matrix = pd.DataFrame(data = cm, index = ['SVM OS', 'SVM OS Standard', 'SVM OS MinMax', 'SVM OS Robust',
                                             'SVM SM', 'SVM SM Standard', 'SVM SM MinMax', 'SVM SM Robust',
                                             'LogReg OS', 'Logreg OS Standard', 'Logreg OS MinMax', 'Logreg OS Robust',
                                             'LogReg SM', 'Logreg SM Standard', 'Logreg SM MinMax', 'Logreg SM Robust',
                                             'Decision Tree OS', 'Decision Tree SM', 'Random Forest OS', 'Random Forest SM'])
cm_matrix.sort_values('False Negative')
#Random Forest With Random Over Sampling and No Scaling

Unnamed: 0,True Positive,True Negative,False Positive,False Negative
SVM SM,260,0,1549,0
SVM SM MinMax,260,3,1546,0
SVM SM Standard,257,999,550,3
SVM SM Robust,255,883,666,5
Random Forest OS,224,1527,22,36
SVM OS,213,1542,7,47
SVM OS Standard,213,1542,7,47
SVM OS Robust,210,1539,10,50
Logreg OS Standard,194,1090,459,66
LogReg OS,191,1093,456,69


In [393]:
import joblib

In [442]:
import joblib
joblib.dump(rf_os, 'Random Forest with Random Over Sampling')

['Random Forest with Random Over Sampling']

In [445]:
x = rf_os.predict_proba([[3826, 0, 2, 1, 25000, 54, 2, 5200, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]])
x

array([[0.37953631, 0.62046369]])

In [446]:
y = rf_os.predict([[3826, 0, 2, 1, 25000, 54, 2, 5200, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]])
y

array([1], dtype=int64)