In [90]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.utils import resample
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

## Dataset

In [91]:
df = pd.read_csv("data_clean.csv", index_col = 0)
df.head()

Unnamed: 0_level_0,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,Location Code,...,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size,Income_group,Claim_group,clv_group,premi_group,numpolicy_group
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BU79786,Washington,2763.519279,No,Basic,Bachelor,2011-02-24,Employed,F,56274,Suburban,...,Offer1,Agent,384.811147,Two-Door Car,Medsize,High Income,up to 400,up to 6.000,0 - 85 dollar,Low
QZ44356,Arizona,6979.535903,No,Extended,Bachelor,2011-01-31,Unemployed,F,0,Suburban,...,Offer3,Agent,1131.464935,Four-Door Car,Medsize,Low Income,up to 1500,up to 30.000,85 - 150 dollar,High
AI49188,Nevada,12887.43165,No,Premium,Bachelor,2011-02-19,Employed,F,48767,Suburban,...,Offer1,Agent,566.472247,Two-Door Car,Medsize,Medium Income,up to 1500,up to 30.000,85 - 150 dollar,Low
WW63253,California,7645.861827,No,Basic,Bachelor,2011-01-20,Unemployed,M,0,Suburban,...,Offer1,Call Center,529.881344,SUV,Medsize,Low Income,up to 1500,up to 30.000,85 - 150 dollar,High
HB64268,Washington,2813.692575,No,Basic,Bachelor,2011-02-03,Employed,M,43836,Rural,...,Offer1,Agent,138.130879,Four-Door Car,Medsize,Medium Income,up to 400,up to 6.000,0 - 85 dollar,Low


In [92]:
df['Response'].value_counts()

No     7743
Yes    1302
Name: Response, dtype: int64

In [93]:
df.drop(['Policy Type', 'Income_group', 'Claim_group', 'clv_group', 'premi_group', 'Effective To Date', 'Number of Open Complaints', 'Months Since Last Claim', 'Months Since Policy Inception', 'numpolicy_group'], axis = 1, inplace = True)

# Encoding

In [94]:
#Label Encoding
df['Gender'] = df['Gender'].map({'M' : 0, 'F' : 1})
df['Coverage'] = df['Coverage'].map({'Basic' : 0, 'Extended' : 1, 'Premium' : 2})
df['Education'] = df['Education'].map({'High School or Below' : 0, 'College' : 1, 'Bachelor' : 2, 'Master' : 3, 'Doctor' : 4})
df['Vehicle Size'] = df['Vehicle Size'].map({'Small' : 0, 'Medsize' : 1, 'Large' : 2})
df['Response'] = df['Response'].map({'No' : 0, 'Yes' : 1})

In [95]:
df.head()

Unnamed: 0_level_0,State,Customer Lifetime Value,Response,Coverage,Education,EmploymentStatus,Gender,Income,Location Code,Marital Status,Monthly Premium Auto,Number of Policies,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
BU79786,Washington,2763.519279,0,0,2,Employed,1,56274,Suburban,Married,69,1,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,1
QZ44356,Arizona,6979.535903,0,1,2,Unemployed,1,0,Suburban,Single,94,8,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,1
AI49188,Nevada,12887.43165,0,2,2,Employed,1,48767,Suburban,Married,108,2,Personal L3,Offer1,Agent,566.472247,Two-Door Car,1
WW63253,California,7645.861827,0,0,2,Unemployed,0,0,Suburban,Married,106,7,Corporate L2,Offer1,Call Center,529.881344,SUV,1
HB64268,Washington,2813.692575,0,0,2,Employed,0,43836,Rural,Single,73,1,Personal L1,Offer1,Agent,138.130879,Four-Door Car,1


In [96]:
to_encode = ['State', 'EmploymentStatus', 'Location Code', 'Marital Status', 'Policy', 'Renew Offer Type', 'Sales Channel', 'Vehicle Class']
df_processed = pd.get_dummies(df, prefix_sep="_", columns=to_encode)

In [97]:
df_processed

Unnamed: 0_level_0,Customer Lifetime Value,Response,Coverage,Education,Gender,Income,Monthly Premium Auto,Number of Policies,Total Claim Amount,Vehicle Size,...,Sales Channel_Agent,Sales Channel_Branch,Sales Channel_Call Center,Sales Channel_Web,Vehicle Class_Four-Door Car,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BU79786,2763.519279,0,0,2,1,56274,69,1,384.811147,1,...,1,0,0,0,0,0,0,0,0,1
QZ44356,6979.535903,0,1,2,1,0,94,8,1131.464935,1,...,1,0,0,0,1,0,0,0,0,0
AI49188,12887.431650,0,2,2,1,48767,108,2,566.472247,1,...,1,0,0,0,0,0,0,0,0,1
WW63253,7645.861827,0,0,2,0,0,106,7,529.881344,1,...,0,0,1,0,0,0,0,1,0,0
HB64268,2813.692575,0,0,2,0,43836,73,1,138.130879,1,...,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LA72316,23405.987980,0,0,2,0,71941,73,2,198.234764,1,...,0,0,0,1,1,0,0,0,0,0
PK87824,3096.511217,1,1,1,1,21604,79,1,379.200000,1,...,0,1,0,0,1,0,0,0,0,0
TD14365,8163.890428,0,1,2,0,0,85,2,790.784983,1,...,0,1,0,0,1,0,0,0,0,0
UP19263,7524.442436,0,1,1,0,21941,96,3,691.200000,2,...,0,1,0,0,1,0,0,0,0,0


# Correlation

In [98]:
cor = df_processed.corr()
cor['Response'].sort_values(ascending = False)[1:]

EmploymentStatus_Retired          0.296120
Renew Offer Type_Offer2           0.177755
Location Code_Suburban            0.115491
Marital Status_Divorced           0.113316
Sales Channel_Agent               0.110324
Vehicle Size                      0.054557
Renew Offer Type_Offer1           0.036612
Vehicle Class_Sports Car          0.028910
Vehicle Class_SUV                 0.024932
Education                         0.024286
Policy_Special L3                 0.024070
EmploymentStatus_Medical Leave    0.023834
EmploymentStatus_Disabled         0.021110
Total Claim Amount                0.013910
Income                            0.012729
Monthly Premium Auto              0.009987
Policy_Special L1                 0.009249
Policy_Corporate L3               0.008236
Vehicle Class_Luxury SUV          0.008182
Policy_Personal L1                0.006516
State_Oregon                      0.003027
State_California                  0.002574
Policy_Corporate L2               0.001922
Policy_Pers

In [99]:
df_processed.to_csv('data_ml.csv')

# Splitting Data

In [100]:
x = df_processed.drop(columns = ['Response'] , axis = 1)
y = df_processed['Response']

In [101]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, stratify= y, random_state = 42)

# Handling Imbalance Data

In [102]:
y_train.value_counts()

0    6194
1    1042
Name: Response, dtype: int64

#### Random Over Sampling

In [103]:
df_train = pd.concat([x_train, y_train], axis=1)
not_renewal = df_train[df_train['Response'] == 0]
renewal = df_train[df_train['Response'] == 1]

renewal_oversample = resample(renewal, replace=True, n_samples = len(not_renewal), random_state = 42)
df_OverSampled = pd.concat([not_renewal, renewal_oversample])
df_OverSampled['Response'].value_counts()

1    6194
0    6194
Name: Response, dtype: int64

In [104]:
x_train_os = df_OverSampled.drop(columns = ['Response'])
y_train_os = df_OverSampled['Response']

#### Smote

In [105]:
sm = SMOTE(random_state=42)
x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)
x = pd.DataFrame(data = x_train_sm, columns = x_train.columns)
y = pd.DataFrame(data = y_train_sm, columns = ['Response'])
df_smote = x.join(y)
# df_smote = pd.concat([x_train_sm, y_train_sm], axis = 1)
df_smote['Response'].value_counts()

1    6194
0    6194
Name: Response, dtype: int64

In [106]:
columns_continuous = ['Customer Lifetime Value', 'Income', 'Monthly Premium Auto', 'Number of Policies', 'Total Claim Amount']

# Base Model

## SVM

#### Random Over Sampling without Scaling

In [107]:
svm = SVC()
svm.fit(x_train_os, y_train_os)

SVC()

In [108]:
pred_train_os = svm.predict(x_train_os)
pred_test_os = svm.predict(x_test)

In [109]:
svm_acc_base_train_os = accuracy_score(y_train_os, pred_train_os)
svm_acc_base_test_os = accuracy_score(y_test, pred_test_os)
svm_recall_base_train_os = recall_score(y_train_os, pred_train_os)
svm_recall_base_test_os = recall_score(y_test, pred_test_os)
svm_prec_base_train_os = precision_score(y_train_os, pred_train_os)
svm_prec_base_test_os = precision_score(y_test, pred_test_os)
svm_f1_base_train_os = f1_score(y_train_os, pred_train_os)
svm_f1_base_test_os = f1_score(y_test, pred_test_os)

In [110]:
cm_svm_base_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_svm_base_os = pd.DataFrame(data=cm_svm_base_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_base_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,183,77
Akt 0,891,658


In [111]:
tp_svm_os = cm_svm_base_os['Pred 1'][0]
tn_svm_os = cm_svm_base_os['Pred 0'][1]
fp_svm_os = cm_svm_base_os['Pred 1'][1]
fn_svm_os = cm_svm_base_os['Pred 0'][0]

#### Smote without Scaling

In [112]:
svm.fit(x_train_sm, y_train_sm)

SVC()

In [113]:
pred_train_sm = svm.predict(x_train_sm)
pred_test_sm = svm.predict(x_test)

In [114]:
svm_acc_base_train_sm = accuracy_score(y_train_sm, pred_train_sm)
svm_acc_base_test_sm = accuracy_score(y_test, pred_test_sm)
svm_recall_base_train_sm = recall_score(y_train_sm, pred_train_sm)
svm_recall_base_test_sm = recall_score(y_test, pred_test_sm)
svm_prec_base_train_sm = precision_score(y_train_sm, pred_train_sm)
svm_prec_base_test_sm = precision_score(y_test, pred_test_sm)
svm_f1_base_train_sm = f1_score(y_train_sm, pred_train_sm)
svm_f1_base_test_sm = f1_score(y_test, pred_test_sm)

In [115]:
cm_svm_base_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_svm_base_sm = pd.DataFrame(data=cm_svm_base_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_base_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,182,78
Akt 0,881,668


In [116]:
tp_svm_sm = cm_svm_base_sm['Pred 1'][0]
tn_svm_sm = cm_svm_base_sm['Pred 0'][1]
fp_svm_sm = cm_svm_base_sm['Pred 1'][1]
fn_svm_sm = cm_svm_base_sm['Pred 0'][0]

#### Random Over Sampling with Standard Scaler

In [117]:
std_scale = StandardScaler()
x_train_os[columns_continuous] = std_scale.fit_transform(x_train_os[columns_continuous])
svm.fit(x_train_os, y_train_os)

pred_train_os_std = svm.predict(x_train_os)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_os_std = svm.predict(x_test)

In [118]:
svm_acc_base_train_os_std = accuracy_score(y_train_os, pred_train_os_std)
svm_acc_base_test_os_std = accuracy_score(y_test, pred_test_os_std)
svm_recall_base_train_os_std = recall_score(y_train_os, pred_train_os_std)
svm_recall_base_test_os_std = recall_score(y_test, pred_test_os_std)
svm_prec_base_train_os_std = precision_score(y_train_os, pred_train_os_std)
svm_prec_base_test_os_std = precision_score(y_test, pred_test_os_std)
svm_f1_base_train_os_std = f1_score(y_train_os, pred_train_os_std)
svm_f1_base_test_os_std = f1_score(y_test, pred_test_os_std)

In [119]:
cm_svm_base_os_std = confusion_matrix(y_test, pred_test_os_std, labels=[1, 0])
cm_svm_base_os_std = pd.DataFrame(data=cm_svm_base_os_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_base_os_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,240,20
Akt 0,205,1344


In [120]:
tp_svm_os_std = cm_svm_base_os_std['Pred 1'][0]
tn_svm_os_std = cm_svm_base_os_std['Pred 0'][1]
fp_svm_os_std = cm_svm_base_os_std['Pred 1'][1]
fn_svm_os_std = cm_svm_base_os_std['Pred 0'][0]

#### Random Over Sampling with MinMax Scaler

In [121]:
mm_scale = MinMaxScaler()
x_train_os[columns_continuous] = mm_scale.fit_transform(x_train_os[columns_continuous])
svm.fit(x_train_os, y_train_os)

pred_train_os_mm = svm.predict(x_train_os)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_os_mm = svm.predict(x_test)

In [122]:
svm_acc_base_train_os_mm = accuracy_score(y_train_os, pred_train_os_mm)
svm_acc_base_test_os_mm = accuracy_score(y_test, pred_test_os_mm)
svm_recall_base_train_os_mm = recall_score(y_train_os, pred_train_os_mm)
svm_recall_base_test_os_mm = recall_score(y_test, pred_test_os_mm)
svm_prec_base_train_os_mm = precision_score(y_train_os, pred_train_os_mm)
svm_prec_base_test_os_mm = precision_score(y_test, pred_test_os_mm)
svm_f1_base_train_os_mm = f1_score(y_train_os, pred_train_os_mm)
svm_f1_base_test_os_mm = f1_score(y_test, pred_test_os_mm)

In [123]:
cm_svm_base_os_mm = confusion_matrix(y_test, pred_test_os_mm, labels=[1, 0])
cm_svm_base_os_mm = pd.DataFrame(data=cm_svm_base_os_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_base_os_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,238,22
Akt 0,242,1307


In [124]:
tp_svm_os_mm = cm_svm_base_os_mm['Pred 1'][0]
tn_svm_os_mm = cm_svm_base_os_mm['Pred 0'][1]
fp_svm_os_mm = cm_svm_base_os_mm['Pred 1'][1]
fn_svm_os_mm = cm_svm_base_os_mm['Pred 0'][0]

#### Random Over Sampling with Robust Scaler

In [125]:
rb_scale = RobustScaler()
x_train_os[columns_continuous] = rb_scale.fit_transform(x_train_os[columns_continuous])
svm.fit(x_train_os, y_train_os)

pred_train_os_rb = svm.predict(x_train_os)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_os_rb = svm.predict(x_test)

In [126]:
svm_acc_base_train_os_rb = accuracy_score(y_train_os, pred_train_os_rb)
svm_acc_base_test_os_rb = accuracy_score(y_test, pred_test_os_rb)
svm_recall_base_train_os_rb = recall_score(y_train_os, pred_train_os_rb)
svm_recall_base_test_os_rb = recall_score(y_test, pred_test_os_rb)
svm_prec_base_train_os_rb = precision_score(y_train_os, pred_train_os_rb)
svm_prec_base_test_os_rb = precision_score(y_test, pred_test_os_rb)
svm_f1_base_train_os_rb = f1_score(y_train_os, pred_train_os_rb)
svm_f1_base_test_os_rb = f1_score(y_test, pred_test_os_rb)

In [127]:
cm_svm_base_os_rb = confusion_matrix(y_test, pred_test_os_rb, labels=[1, 0])
cm_svm_base_os_rb = pd.DataFrame(data=cm_svm_base_os_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_base_os_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,240,20
Akt 0,207,1342


In [128]:
tp_svm_os_rb = cm_svm_base_os_rb['Pred 1'][0]
tn_svm_os_rb = cm_svm_base_os_rb['Pred 0'][1]
fp_svm_os_rb = cm_svm_base_os_rb['Pred 1'][1]
fn_svm_os_rb = cm_svm_base_os_rb['Pred 0'][0]

#### Smote with Standard Scaler

In [129]:
x[columns_continuous] = std_scale.fit_transform(x[columns_continuous])
svm.fit(x_train_sm, y_train_sm)

pred_train_sm_std = svm.predict(x_train_sm)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_std = svm.predict(x_test)

In [130]:
svm_acc_base_train_sm_std = accuracy_score(y_train_sm, pred_train_sm_std)
svm_acc_base_test_sm_std = accuracy_score(y_test, pred_test_sm_std)
svm_recall_base_train_sm_std = recall_score(y_train_sm, pred_train_sm_std)
svm_recall_base_test_sm_std = recall_score(y_test, pred_test_sm_std)
svm_prec_base_train_sm_std = precision_score(y_train_sm, pred_train_sm_std)
svm_prec_base_test_sm_std = precision_score(y_test, pred_test_sm_std)
svm_f1_base_train_sm_std = f1_score(y_train_sm, pred_train_sm_std)
svm_f1_base_test_sm_std = f1_score(y_test, pred_test_sm_std)

In [131]:
cm_svm_base_sm_std = confusion_matrix(y_test, pred_test_sm_std, labels=[1, 0])
cm_svm_base_sm_std = pd.DataFrame(data=cm_svm_base_sm_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_base_sm_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,84,176
Akt 0,16,1533


In [132]:
tp_svm_sm_std = cm_svm_base_sm_std['Pred 1'][0]
tn_svm_sm_std = cm_svm_base_sm_std['Pred 0'][1]
fp_svm_sm_std = cm_svm_base_sm_std['Pred 1'][1]
fn_svm_sm_std = cm_svm_base_sm_std['Pred 0'][0]

#### Smote with MinMax Scaler

In [133]:
mm_scale = MinMaxScaler()
x[columns_continuous] = mm_scale.fit_transform(x[columns_continuous])
svm.fit(x_train_sm, y_train_sm)

pred_train_sm_mm = svm.predict(x_train_sm)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_mm = svm.predict(x_test)

In [134]:
svm_acc_base_train_sm_mm = accuracy_score(y_train_sm, pred_train_sm_mm)
svm_acc_base_test_sm_mm = accuracy_score(y_test, pred_test_sm_mm)
svm_recall_base_train_sm_mm = recall_score(y_train_sm, pred_train_sm_mm)
svm_recall_base_test_sm_mm = recall_score(y_test, pred_test_sm_mm)
svm_prec_base_train_sm_mm = precision_score(y_train_sm, pred_train_sm_mm)
svm_prec_base_test_sm_mm = precision_score(y_test, pred_test_sm_mm)
svm_f1_base_train_sm_mm = f1_score(y_train_sm, pred_train_sm_mm)
svm_f1_base_test_sm_mm = f1_score(y_test, pred_test_sm_mm)

In [135]:
cm_svm_base_sm_mm = confusion_matrix(y_test, pred_test_sm_mm, labels=[1, 0])
cm_svm_base_sm_mm = pd.DataFrame(data=cm_svm_base_sm_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_base_sm_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,50,210
Akt 0,22,1527


In [136]:
tp_svm_sm_mm = cm_svm_base_sm_mm['Pred 1'][0]
tn_svm_sm_mm = cm_svm_base_sm_mm['Pred 0'][1]
fp_svm_sm_mm = cm_svm_base_sm_mm['Pred 1'][1]
fn_svm_sm_mm = cm_svm_base_sm_mm['Pred 0'][0]

#### Smote with Robust Scaler

In [137]:
rb_scale = RobustScaler()
x[columns_continuous] = rb_scale.fit_transform(x[columns_continuous])
svm.fit(x_train_sm, y_train_sm)

pred_train_sm_rb = svm.predict(x_train_sm)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_rb = svm.predict(x_test)

In [138]:
svm_acc_base_train_sm_rb = accuracy_score(y_train_sm, pred_train_sm_rb)
svm_acc_base_test_sm_rb = accuracy_score(y_test, pred_test_sm_rb)
svm_recall_base_train_sm_rb = recall_score(y_train_sm, pred_train_sm_rb)
svm_recall_base_test_sm_rb = recall_score(y_test, pred_test_sm_rb)
svm_prec_base_train_sm_rb = precision_score(y_train_sm, pred_train_sm_rb)
svm_prec_base_test_sm_rb = precision_score(y_test, pred_test_sm_rb)
svm_f1_base_train_sm_rb = f1_score(y_train_sm, pred_train_sm_rb)
svm_f1_base_test_sm_rb = f1_score(y_test, pred_test_sm_rb)

In [139]:
cm_svm_base_sm_rb = confusion_matrix(y_test, pred_test_sm_rb, labels=[1, 0])
cm_svm_base_sm_rb = pd.DataFrame(data=cm_svm_base_sm_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_base_sm_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,69,191
Akt 0,12,1537


In [140]:
tp_svm_sm_rb = cm_svm_base_sm_rb['Pred 1'][0]
tn_svm_sm_rb = cm_svm_base_sm_rb['Pred 0'][1]
fp_svm_sm_rb = cm_svm_base_sm_rb['Pred 1'][1]
fn_svm_sm_rb = cm_svm_base_sm_rb['Pred 0'][0]

## Logistic Regression

#### Random Over Sampling without Scaling

In [141]:
logreg = LogisticRegression()
logreg.fit(x_train_os, y_train_os)
pred_train_os = logreg.predict(x_train_os)
pred_test_os = logreg.predict(x_test)

In [142]:
logreg_acc_base_train_os = accuracy_score(y_train_os, pred_train_os)
logreg_acc_base_test_os = accuracy_score(y_test, pred_test_os)
logreg_recall_base_train_os = recall_score(y_train_os, pred_train_os)
logreg_recall_base_test_os = recall_score(y_test, pred_test_os)
logreg_prec_base_train_os = precision_score(y_train_os, pred_train_os)
logreg_prec_base_test_os = precision_score(y_test, pred_test_os)
logreg_f1_base_train_os = f1_score(y_train_os, pred_train_os)
logreg_f1_base_test_os = f1_score(y_test, pred_test_os)

In [143]:
cm_logreg_base_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_logreg_base_os = pd.DataFrame(data=cm_logreg_base_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_base_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,191,69
Akt 0,454,1095


In [144]:
tp_logreg_os = cm_logreg_base_os['Pred 1'][0]
tn_logreg_os = cm_logreg_base_os['Pred 0'][1]
fp_logreg_os = cm_logreg_base_os['Pred 1'][1]
fn_logreg_os = cm_logreg_base_os['Pred 0'][0]

#### Smote Sampling without Scaling

In [145]:
logreg.fit(x_train_sm, y_train_sm)
pred_train_sm = logreg.predict(x_train_sm)
pred_test_sm = logreg.predict(x_test)

In [146]:
logreg_acc_base_train_sm = accuracy_score(y_train_sm, pred_train_sm)
logreg_acc_base_test_sm = accuracy_score(y_test, pred_test_sm)
logreg_recall_base_train_sm = recall_score(y_train_sm, pred_train_sm)
logreg_recall_base_test_sm = recall_score(y_test, pred_test_sm)
logreg_prec_base_train_sm = precision_score(y_train_sm, pred_train_sm)
logreg_prec_base_test_sm = precision_score(y_test, pred_test_sm)
logreg_f1_base_train_sm = f1_score(y_train_sm, pred_train_sm)
logreg_f1_base_test_sm = f1_score(y_test, pred_test_sm)

In [147]:
cm_logreg_base_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_logreg_base_sm = pd.DataFrame(data=cm_logreg_base_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_base_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,45,215
Akt 0,21,1528


In [148]:
tp_logreg_sm = cm_logreg_base_sm['Pred 1'][0]
tn_logreg_sm = cm_logreg_base_sm['Pred 0'][1]
fp_logreg_sm = cm_logreg_base_sm['Pred 1'][1]
fn_logreg_sm = cm_logreg_base_sm['Pred 0'][0]

#### Standard Scaling with Random Over Sampling

In [149]:
std_scale = StandardScaler()
x_train_os[columns_continuous] = std_scale.fit_transform(x_train_os[columns_continuous])
logreg.fit(x_train_os, y_train_os)

pred_train_os_std = logreg.predict(x_train_os)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_os_std = logreg.predict(x_test)

In [150]:
logreg_acc_base_train_os_std = accuracy_score(y_train_os, pred_train_os_std)
logreg_acc_base_test_os_std = accuracy_score(y_test, pred_test_os_std)
logreg_recall_base_train_os_std = recall_score(y_train_os, pred_train_os_std)
logreg_recall_base_test_os_std = recall_score(y_test, pred_test_os_std)
logreg_prec_base_train_os_std = precision_score(y_train_os, pred_train_os_std)
logreg_prec_base_test_os_std = precision_score(y_test, pred_test_os_std)
logreg_f1_base_train_os_std = f1_score(y_train_os, pred_train_os_std)
logreg_f1_base_test_os_std = f1_score(y_test, pred_test_os_std)

In [151]:
cm_logreg_base_os_std = confusion_matrix(y_test, pred_test_os_std, labels=[1, 0])
cm_logreg_base_os_std = pd.DataFrame(data=cm_logreg_base_os_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_base_os_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,194,66
Akt 0,459,1090


In [152]:
tp_logreg_os_std = cm_logreg_base_os_std['Pred 1'][0]
tn_logreg_os_std = cm_logreg_base_os_std['Pred 0'][1]
fp_logreg_os_std = cm_logreg_base_os_std['Pred 1'][1]
fn_logreg_os_std = cm_logreg_base_os_std['Pred 0'][0]

#### MinMax Scaling with Random Over Sampling

In [153]:
mm_scale = MinMaxScaler()
x_train_os[columns_continuous] = mm_scale.fit_transform(x_train_os[columns_continuous])
logreg.fit(x_train_os, y_train_os)

pred_train_os_mm = logreg.predict(x_train_os)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_os_mm = logreg.predict(x_test)

In [154]:
logreg_acc_base_train_os_mm = accuracy_score(y_train_os, pred_train_os_mm)
logreg_acc_base_test_os_mm = accuracy_score(y_test, pred_test_os_mm)
logreg_recall_base_train_os_mm = recall_score(y_train_os, pred_train_os_mm)
logreg_recall_base_test_os_mm = recall_score(y_test, pred_test_os_mm)
logreg_prec_base_train_os_mm = precision_score(y_train_os, pred_train_os_mm)
logreg_prec_base_test_os_mm = precision_score(y_test, pred_test_os_mm)
logreg_f1_base_train_os_mm = f1_score(y_train_os, pred_train_os_mm)
logreg_f1_base_test_os_mm = f1_score(y_test, pred_test_os_mm)

In [155]:
cm_logreg_base_os_mm = confusion_matrix(y_test, pred_test_os_mm, labels=[1, 0])
cm_logreg_base_os_mm = pd.DataFrame(data=cm_logreg_base_os_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_base_os_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,184,76
Akt 0,425,1124


In [156]:
tp_logreg_os_mm = cm_logreg_base_os_mm['Pred 1'][0]
tn_logreg_os_mm = cm_logreg_base_os_mm['Pred 0'][1]
fp_logreg_os_mm = cm_logreg_base_os_mm['Pred 1'][1]
fn_logreg_os_mm = cm_logreg_base_os_mm['Pred 0'][0]

#### Robust Scaling with Random Over Sampling

In [157]:
rb_scale = RobustScaler()
x_train_os[columns_continuous] = rb_scale.fit_transform(x_train_os[columns_continuous])
logreg.fit(x_train_os, y_train_os)

pred_train_os_rb = logreg.predict(x_train_os)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_os_rb = logreg.predict(x_test)

In [158]:
logreg_acc_base_train_os_rb = accuracy_score(y_train_os, pred_train_os_rb)
logreg_acc_base_test_os_rb = accuracy_score(y_test, pred_test_os_rb)
logreg_recall_base_train_os_rb = recall_score(y_train_os, pred_train_os_rb)
logreg_recall_base_test_os_rb = recall_score(y_test, pred_test_os_rb)
logreg_prec_base_train_os_rb = precision_score(y_train_os, pred_train_os_rb)
logreg_prec_base_test_os_rb = precision_score(y_test, pred_test_os_rb)
logreg_f1_base_train_os_rb = f1_score(y_train_os, pred_train_os_rb)
logreg_f1_base_test_os_rb = f1_score(y_test, pred_test_os_rb)

In [159]:
cm_logreg_base_os_rb = confusion_matrix(y_test, pred_test_os_rb, labels=[1, 0])
cm_logreg_base_os_rb = pd.DataFrame(data=cm_logreg_base_os_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_base_os_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,191,69
Akt 0,454,1095


In [160]:
tp_logreg_os_rb = cm_logreg_base_os_rb['Pred 1'][0]
tn_logreg_os_rb = cm_logreg_base_os_rb['Pred 0'][1]
fp_logreg_os_rb = cm_logreg_base_os_rb['Pred 1'][1]
fn_logreg_os_rb = cm_logreg_base_os_rb['Pred 0'][0]

#### Standard Scaling with Smote

In [161]:
x[columns_continuous] = std_scale.fit_transform(x[columns_continuous])
logreg.fit(x_train_sm, y_train_sm)

pred_train_sm_std = logreg.predict(x_train_sm)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_std = logreg.predict(x_test)

In [162]:
logreg_acc_base_train_sm_std = accuracy_score(y_train_sm, pred_train_sm_std)
logreg_acc_base_test_sm_std = accuracy_score(y_test, pred_test_sm_std)
logreg_recall_base_train_sm_std = recall_score(y_train_sm, pred_train_sm_std)
logreg_recall_base_test_sm_std = recall_score(y_test, pred_test_sm_std)
logreg_prec_base_train_sm_std = precision_score(y_train_sm, pred_train_sm_std)
logreg_prec_base_test_sm_std = precision_score(y_test, pred_test_sm_std)
logreg_f1_base_train_sm_std = f1_score(y_train_sm, pred_train_sm_std)
logreg_f1_base_test_sm_std = f1_score(y_test, pred_test_sm_std)

In [163]:
cm_logreg_base_sm_std = confusion_matrix(y_test, pred_test_sm_std, labels=[1, 0])
cm_logreg_base_sm_std = pd.DataFrame(data=cm_logreg_base_sm_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_base_sm_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,52,208
Akt 0,35,1514


In [164]:
tp_logreg_sm_std = cm_logreg_base_sm_std['Pred 1'][0]
tn_logreg_sm_std = cm_logreg_base_sm_std['Pred 0'][1]
fp_logreg_sm_std = cm_logreg_base_sm_std['Pred 1'][1]
fn_logreg_sm_std = cm_logreg_base_sm_std['Pred 0'][0]

#### MinMax Scaling with Smote

In [165]:
mm_scale = MinMaxScaler()
x[columns_continuous] = mm_scale.fit_transform(x[columns_continuous])
logreg.fit(x_train_sm, y_train_sm)

pred_train_sm_mm = logreg.predict(x_train_sm)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_mm = logreg.predict(x_test)

In [166]:
logreg_acc_base_train_sm_mm = accuracy_score(y_train_sm, pred_train_sm_mm)
logreg_acc_base_test_sm_mm = accuracy_score(y_test, pred_test_sm_mm)
logreg_recall_base_train_sm_mm = recall_score(y_train_sm, pred_train_sm_mm)
logreg_recall_base_test_sm_mm = recall_score(y_test, pred_test_sm_mm)
logreg_prec_base_train_sm_mm = precision_score(y_train_sm, pred_train_sm_mm)
logreg_prec_base_test_sm_mm = precision_score(y_test, pred_test_sm_mm)
logreg_f1_base_train_sm_mm = f1_score(y_train_sm, pred_train_sm_mm)
logreg_f1_base_test_sm_mm = f1_score(y_test, pred_test_sm_mm)

In [167]:
cm_logreg_base_sm_mm = confusion_matrix(y_test, pred_test_sm_mm, labels=[1, 0])
cm_logreg_base_sm_mm = pd.DataFrame(data=cm_logreg_base_sm_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_base_sm_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,54,206
Akt 0,40,1509


In [168]:
tp_logreg_sm_mm = cm_logreg_base_sm_mm['Pred 1'][0]
tn_logreg_sm_mm = cm_logreg_base_sm_mm['Pred 0'][1]
fp_logreg_sm_mm = cm_logreg_base_sm_mm['Pred 1'][1]
fn_logreg_sm_mm = cm_logreg_base_sm_mm['Pred 0'][0]

#### Robust Scaling with Smote

In [169]:
rb_scale = RobustScaler()
x[columns_continuous] = rb_scale.fit_transform(x[columns_continuous])
logreg.fit(x_train_sm, y_train_sm)

pred_train_sm_rb = logreg.predict(x_train_sm)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_rb = logreg.predict(x_test)

In [170]:
logreg_acc_base_train_sm_rb = accuracy_score(y_train_sm, pred_train_sm_rb)
logreg_acc_base_test_sm_rb = accuracy_score(y_test, pred_test_sm_rb)
logreg_recall_base_train_sm_rb = recall_score(y_train_sm, pred_train_sm_rb)
logreg_recall_base_test_sm_rb = recall_score(y_test, pred_test_sm_rb)
logreg_prec_base_train_sm_rb = precision_score(y_train_sm, pred_train_sm_rb)
logreg_prec_base_test_sm_rb = precision_score(y_test, pred_test_sm_rb)
logreg_f1_base_train_sm_rb = f1_score(y_train_sm, pred_train_sm_rb)
logreg_f1_base_test_sm_rb = f1_score(y_test, pred_test_sm_rb)

In [171]:
cm_logreg_base_sm_rb = confusion_matrix(y_test, pred_test_sm_rb, labels=[1, 0])
cm_logreg_base_sm_rb = pd.DataFrame(data=cm_logreg_base_sm_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_base_sm_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,45,215
Akt 0,21,1528


In [172]:
tp_logreg_sm_rb = cm_logreg_base_sm_rb['Pred 1'][0]
tn_logreg_sm_rb = cm_logreg_base_sm_rb['Pred 0'][1]
fp_logreg_sm_rb = cm_logreg_base_sm_rb['Pred 1'][1]
fn_logreg_sm_rb = cm_logreg_base_sm_rb['Pred 0'][0]

### Evaluation For SVM and Logistic Regression

In [173]:
distance_base = {
    "SVM_OS_Train": [svm_acc_base_train_os, svm_recall_base_train_os, svm_prec_base_train_os, svm_f1_base_train_os],
    "SVM_OS_Test" : [svm_acc_base_test_os, svm_recall_base_test_os, svm_prec_base_test_os, svm_f1_base_test_os],
    "SVM_SM_Train": [svm_acc_base_train_sm, svm_recall_base_train_sm, svm_prec_base_train_sm, svm_f1_base_train_sm],
    "SVM_SM_Test" : [svm_acc_base_test_sm, svm_recall_base_test_sm, svm_prec_base_test_sm, svm_f1_base_test_sm],
    "Logreg_OS_Train": [logreg_acc_base_train_os, logreg_recall_base_train_os, logreg_prec_base_train_os, logreg_f1_base_train_os],
    "Logreg_OS_Test" : [logreg_acc_base_test_os, logreg_recall_base_test_os, logreg_prec_base_test_os, logreg_f1_base_test_os],
    "Logreg_SM_Train": [logreg_acc_base_train_sm, logreg_recall_base_train_sm, logreg_prec_base_train_sm, logreg_f1_base_train_sm],
    "Logreg_SM_Test" : [logreg_acc_base_test_sm, logreg_recall_base_test_sm, logreg_prec_base_test_sm, logreg_f1_base_test_sm]
    }
base_matrix = pd.DataFrame(data = distance_base, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

dictance_base_os = {
    "SVM Standard_OS_Train": [svm_acc_base_train_os_std, svm_recall_base_train_os_std, svm_prec_base_train_os_std, svm_f1_base_train_os_std],
    "SVM Standard_OS_Test" : [svm_acc_base_test_os_std, svm_recall_base_test_os_std, svm_prec_base_test_os_std, svm_f1_base_test_os_std],
    "SVM MinMax_OS_Train": [svm_acc_base_train_os_mm, svm_recall_base_train_os_mm, svm_prec_base_train_os_mm, svm_f1_base_train_os_mm],
    "SVM MinMax_OS_Test" : [svm_acc_base_test_os_mm, svm_recall_base_test_os_mm, svm_prec_base_test_os_mm, svm_f1_base_test_os_mm],
    "SVM Robust_OS_Train": [svm_acc_base_train_os_rb, svm_recall_base_train_os_rb, svm_prec_base_train_os_rb, svm_f1_base_train_os_rb],
    "SVM Robust_OS_Test" : [svm_acc_base_test_os_rb, svm_recall_base_test_os_rb, svm_prec_base_test_os_rb, svm_f1_base_test_os_rb],
    "Logreg Standard_OS_Train": [logreg_acc_base_train_os_std, logreg_recall_base_train_os_std, logreg_prec_base_train_os_std, logreg_f1_base_train_os_std],
    "Logreg Standard_OS_Test" : [logreg_acc_base_test_os_std, logreg_recall_base_test_os_std, logreg_prec_base_test_os_std, logreg_f1_base_test_os_std],
    "Logreg MinMax_OS_Train": [logreg_acc_base_train_os_mm, logreg_recall_base_train_os_mm, logreg_prec_base_train_os_mm, logreg_f1_base_train_os_mm],
    "Logreg MinMax_OS_Test" : [logreg_acc_base_test_os_mm, logreg_recall_base_test_os_mm, logreg_prec_base_test_os_mm, logreg_f1_base_test_os_mm],
    "Logreg Robust_OS_Train": [logreg_acc_base_train_os_rb, logreg_recall_base_train_os_rb, logreg_prec_base_train_os_rb, logreg_f1_base_train_os_rb],
    "Logreg Robust_OS_Test" : [logreg_acc_base_test_os_rb, logreg_recall_base_test_os_rb, logreg_prec_base_test_os_rb, logreg_f1_base_test_os_rb]
    }
distance_base_os_matrix = pd.DataFrame(data = dictance_base_os, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

distance_base_sm = {
    "SVM Standard_SM_Train": [svm_acc_base_train_sm_std, svm_recall_base_train_sm_std, svm_prec_base_train_sm_std, svm_f1_base_train_sm_std],
    "SVM Standard_SM_Test" : [svm_acc_base_test_sm_std, svm_recall_base_test_sm_std, svm_prec_base_test_sm_std, svm_f1_base_test_sm_std],
    "SVM MinMax_SM_Train": [svm_acc_base_train_sm_mm, svm_recall_base_train_sm_mm, svm_prec_base_train_sm_mm, svm_f1_base_train_sm_mm],
    "SVM MinMax_SM_Test" : [svm_acc_base_test_sm_mm, svm_recall_base_test_sm_mm, svm_prec_base_test_sm_mm, svm_f1_base_test_sm_mm],
    "SVM Robust_SM_Train": [svm_acc_base_train_sm_rb, svm_recall_base_train_sm_rb, svm_prec_base_train_sm_rb, svm_f1_base_train_sm_rb],
    "SVM Robust_SM_Test" : [svm_acc_base_test_sm_rb, svm_recall_base_test_sm_rb, svm_prec_base_test_sm_rb, svm_f1_base_test_sm_rb],
    "Logreg Standard_SM_Train": [logreg_acc_base_train_sm_std, logreg_recall_base_train_sm_std, logreg_prec_base_train_sm_std, logreg_f1_base_train_sm_std],
    "Logreg Standard_SM_Test" : [logreg_acc_base_test_sm_std, logreg_recall_base_test_sm_std, logreg_prec_base_test_sm_std, logreg_f1_base_test_sm_std],
    "Logreg MinMax_SM_Train": [logreg_acc_base_train_sm_mm, logreg_recall_base_train_sm_mm, logreg_prec_base_train_sm_mm, logreg_f1_base_train_sm_mm],
    "Logreg MinMax_SM_Test" : [logreg_acc_base_test_sm_mm, logreg_recall_base_test_sm_mm, logreg_prec_base_test_sm_mm, logreg_f1_base_test_sm_mm],
    "Logreg Robust_SM_Train": [logreg_acc_base_train_sm_rb, logreg_recall_base_train_sm_rb, logreg_prec_base_train_sm_rb, logreg_f1_base_train_sm_rb],
    "Logreg Robust_SM_Test" : [logreg_acc_base_test_sm_rb, logreg_recall_base_test_sm_rb, logreg_prec_base_test_sm_rb, logreg_f1_base_test_sm_rb]
    }
distance_base_sm_matrix = pd.DataFrame(data = distance_base_sm, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

In [174]:
base_matrix

Unnamed: 0,SVM_OS_Train,SVM_OS_Test,SVM_SM_Train,SVM_SM_Test,Logreg_OS_Train,Logreg_OS_Test,Logreg_SM_Train,Logreg_SM_Test
Accuracy,0.57814,0.464898,0.581208,0.469873,0.742412,0.71089,0.908621,0.869541
Recall,0.734743,0.703846,0.731837,0.7,0.783662,0.734615,0.836132,0.173077
Precision,0.559503,0.170391,0.562407,0.171214,0.723937,0.296124,0.977908,0.681818
F1 Score,0.63526,0.274363,0.636032,0.275132,0.752616,0.422099,0.90148,0.276074


In [175]:
distance_base_os_matrix
#SVM Robust OS

Unnamed: 0,SVM Standard_OS_Train,SVM Standard_OS_Test,SVM MinMax_OS_Train,SVM MinMax_OS_Test,SVM Robust_OS_Train,SVM Robust_OS_Test,Logreg Standard_OS_Train,Logreg Standard_OS_Test,Logreg MinMax_OS_Train,Logreg MinMax_OS_Test,Logreg Robust_OS_Train,Logreg Robust_OS_Test
Accuracy,0.938327,0.875622,0.920972,0.854063,0.936794,0.874516,0.741928,0.709784,0.743219,0.723051,0.741766,0.71089
Recall,0.982241,0.923077,0.973846,0.915385,0.980949,0.923077,0.782531,0.746154,0.785438,0.707692,0.782531,0.734615
Precision,0.902939,0.539326,0.880713,0.495833,0.90135,0.536913,0.723757,0.29709,0.724282,0.302135,0.723541,0.296124
F1 Score,0.940922,0.680851,0.924941,0.643243,0.939467,0.678925,0.751998,0.424973,0.753621,0.423475,0.751881,0.422099


In [176]:
distance_base_sm_matrix

Unnamed: 0,SVM Standard_SM_Train,SVM Standard_SM_Test,SVM MinMax_SM_Train,SVM MinMax_SM_Test,SVM Robust_SM_Train,SVM Robust_SM_Test,Logreg Standard_SM_Train,Logreg Standard_SM_Test,Logreg MinMax_SM_Train,Logreg MinMax_SM_Test,Logreg Robust_SM_Train,Logreg Robust_SM_Test
Accuracy,0.951405,0.893864,0.951405,0.871752,0.951405,0.887783,0.908621,0.865672,0.908621,0.864013,0.908621,0.869541
Recall,0.91072,0.323077,0.91072,0.192308,0.91072,0.265385,0.836132,0.2,0.836132,0.207692,0.836132,0.173077
Precision,0.991388,0.84,0.991388,0.694444,0.991388,0.851852,0.977908,0.597701,0.977908,0.574468,0.977908,0.681818
F1 Score,0.949344,0.466667,0.949344,0.301205,0.949344,0.404692,0.90148,0.299712,0.90148,0.305085,0.90148,0.276074


## Decision Tree

### Random Over Sampling

In [177]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train_os, y_train_os)
pred_train_os = decision_tree.predict(x_train_os)
pred_test_os = decision_tree.predict(x_test)

In [178]:
dt_acc_base_train_os = accuracy_score(y_train_os, pred_train_os)
dt_acc_base_test_os = accuracy_score(y_test, pred_test_os)
dt_recall_base_train_os = recall_score(y_train_os, pred_train_os)
dt_recall_base_test_os = recall_score(y_test, pred_test_os)
dt_prec_base_train_os = precision_score(y_train_os, pred_train_os)
dt_prec_base_test_os = precision_score(y_test, pred_test_os)
dt_f1_base_train_os = f1_score(y_train_os, pred_train_os)
dt_f1_base_test_os = f1_score(y_test, pred_test_os)

In [179]:
cm_dt_base_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_dt_base_os = pd.DataFrame(data=cm_dt_base_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_dt_base_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,131,129
Akt 0,50,1499


In [180]:
tp_dt_os = cm_dt_base_os['Pred 1'][0]
tn_dt_os = cm_dt_base_os['Pred 0'][1]
fp_dt_os = cm_dt_base_os['Pred 1'][1]
fn_dt_os = cm_dt_base_os['Pred 0'][0]

### Smote

In [181]:
decision_tree.fit(x_train_sm, y_train_sm)
pred_train_sm = decision_tree.predict(x_train_sm)
pred_test_sm = decision_tree.predict(x_test)

In [182]:
dt_acc_base_train_sm = accuracy_score(y_train_sm, pred_train_sm)
dt_acc_base_test_sm = accuracy_score(y_test, pred_test_sm)
dt_recall_base_train_sm = recall_score(y_train_sm, pred_train_sm)
dt_recall_base_test_sm = recall_score(y_test, pred_test_sm)
dt_prec_base_train_sm = precision_score(y_train_sm, pred_train_sm)
dt_prec_base_test_sm = precision_score(y_test, pred_test_sm)
dt_f1_base_train_sm = f1_score(y_train_sm, pred_train_sm)
dt_f1_base_test_sm = f1_score(y_test, pred_test_sm)

In [183]:
cm_dt_base_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_dt_base_sm = pd.DataFrame(data=cm_dt_base_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_dt_base_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,61,199
Akt 0,182,1367


In [184]:
tp_dt_sm = cm_dt_base_sm['Pred 1'][0]
tn_dt_sm = cm_dt_base_sm['Pred 0'][1]
fp_dt_sm = cm_dt_base_sm['Pred 1'][1]
fn_dt_sm = cm_dt_base_sm['Pred 0'][0]

## Random Forest

### Random Over Sampling

In [185]:
random_forest = RandomForestClassifier()
random_forest.fit(x_train_os, y_train_os)
pred_train_os = random_forest.predict(x_train_os)
pred_test_os = random_forest.predict(x_test)

In [186]:
rf_acc_base_train_os = accuracy_score(y_train_os, pred_train_os)
rf_acc_base_test_os = accuracy_score(y_test, pred_test_os)
rf_recall_base_train_os = recall_score(y_train_os, pred_train_os)
rf_recall_base_test_os = recall_score(y_test, pred_test_os)
rf_prec_base_train_os = precision_score(y_train_os, pred_train_os)
rf_prec_base_test_os = precision_score(y_test, pred_test_os)
rf_f1_base_train_os = f1_score(y_train_os, pred_train_os)
rf_f1_base_test_os = f1_score(y_test, pred_test_os)

In [187]:
cm_rf_base_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_rf_base_os = pd.DataFrame(data=cm_rf_base_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_rf_base_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,203,57
Akt 0,7,1542


In [188]:
tp_rf_os = cm_rf_base_os['Pred 1'][0]
tn_rf_os = cm_rf_base_os['Pred 0'][1]
fp_rf_os = cm_rf_base_os['Pred 1'][1]
fn_rf_os = cm_rf_base_os['Pred 0'][0]

### Smote

In [189]:
random_forest.fit(x_train_sm, y_train_sm)
pred_train_sm = random_forest.predict(x_train_sm)
pred_test_sm = random_forest.predict(x_test)

In [190]:
rf_acc_base_train_sm = accuracy_score(y_train_sm, pred_train_sm)
rf_acc_base_test_sm = accuracy_score(y_test, pred_test_sm)
rf_recall_base_train_sm = recall_score(y_train_sm, pred_train_sm)
rf_recall_base_test_sm = recall_score(y_test, pred_test_sm)
rf_prec_base_train_sm = precision_score(y_train_sm, pred_train_sm)
rf_prec_base_test_sm = precision_score(y_test, pred_test_sm)
rf_f1_base_train_sm = f1_score(y_train_sm, pred_train_sm)
rf_f1_base_test_sm = f1_score(y_test, pred_test_sm)

In [191]:
cm_rf_base_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_rf_base_sm = pd.DataFrame(data=cm_rf_base_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_rf_base_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,57,203
Akt 0,17,1532


In [192]:
tp_rf_sm = cm_rf_base_sm['Pred 1'][0]
tn_rf_sm = cm_rf_base_sm['Pred 0'][1]
fp_rf_sm = cm_rf_base_sm['Pred 1'][1]
fn_rf_sm = cm_rf_base_sm['Pred 0'][0]

## XGBoost

### Random Over Sampling

In [193]:
xgb = XGBClassifier()
xgb.fit(x_train_os, y_train_os)
pred_train_os = xgb.predict(x_train_os)
pred_test_os = xgb.predict(x_test)

In [194]:
xgb_acc_base_train_os = accuracy_score(y_train_os, pred_train_os)
xgb_acc_base_test_os = accuracy_score(y_test, pred_test_os)
xgb_recall_base_train_os = recall_score(y_train_os, pred_train_os)
xgb_recall_base_test_os = recall_score(y_test, pred_test_os)
xgb_prec_base_train_os = precision_score(y_train_os, pred_train_os)
xgb_prec_base_test_os = precision_score(y_test, pred_test_os)
xgb_f1_base_train_os = f1_score(y_train_os, pred_train_os)
xgb_f1_base_test_os = f1_score(y_test, pred_test_os)

In [195]:
cm_xgb_base_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_xgb_base_os = pd.DataFrame(data=cm_xgb_base_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_xgb_base_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,153,107
Akt 0,32,1517


In [196]:
tp_xgb_os = cm_xgb_base_os['Pred 1'][0]
tn_xgb_os = cm_xgb_base_os['Pred 0'][1]
fp_xgb_os = cm_xgb_base_os['Pred 1'][1]
fn_xgb_os = cm_xgb_base_os['Pred 0'][0]

### Smote

In [197]:
xgb.fit(x_train_sm, y_train_sm)
pred_train_sm = xgb.predict(x_train_sm)
pred_test_sm = xgb.predict(x_test)

In [198]:
xgb_acc_base_train_sm = accuracy_score(y_train_sm, pred_train_sm)
xgb_acc_base_test_sm = accuracy_score(y_test, pred_test_sm)
xgb_recall_base_train_sm = recall_score(y_train_sm, pred_train_sm)
xgb_recall_base_test_sm = recall_score(y_test, pred_test_sm)
xgb_prec_base_train_sm = precision_score(y_train_sm, pred_train_sm)
xgb_prec_base_test_sm = precision_score(y_test, pred_test_sm)
xgb_f1_base_train_sm = f1_score(y_train_sm, pred_train_sm)
xgb_f1_base_test_sm = f1_score(y_test, pred_test_sm)

In [199]:
cm_xgb_base_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_xgb_base_sm = pd.DataFrame(data=cm_xgb_base_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_xgb_base_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,44,216
Akt 0,27,1522


In [200]:
tp_xgb_sm = cm_xgb_base_sm['Pred 1'][0]
tn_xgb_sm = cm_xgb_base_sm['Pred 0'][1]
fp_xgb_sm = cm_xgb_base_sm['Pred 1'][1]
fn_xgb_sm = cm_xgb_base_sm['Pred 0'][0]

## Evaluation Matrix For DT, RF, XGB

In [201]:
base_os = {
    "DT_OS_Train": [dt_acc_base_train_os, dt_recall_base_train_os, dt_prec_base_train_os, dt_f1_base_train_os],
    "DT_OS_Test" : [dt_acc_base_test_os, dt_recall_base_test_os, dt_prec_base_test_os, dt_f1_base_test_os],
    "RF_OS_Train": [rf_acc_base_train_os, rf_recall_base_train_os, rf_prec_base_train_os, rf_f1_base_train_os],
    "RF_OS_Test" : [rf_acc_base_test_os, rf_recall_base_test_os, rf_prec_base_test_os, rf_f1_base_test_os],
    "XGB_OS_Train": [xgb_acc_base_train_os, xgb_recall_base_train_os, xgb_prec_base_train_os, xgb_f1_base_train_os],
    "XGB_OS_Test" : [xgb_acc_base_test_os, xgb_recall_base_test_os, xgb_prec_base_test_os, xgb_f1_base_test_os]
    }
base_os_matrix = pd.DataFrame(data = base_os, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

base_sm = {
    "DT_SM_Train": [dt_acc_base_train_sm, dt_recall_base_train_sm, dt_prec_base_train_sm, dt_f1_base_train_sm],
    "DT_SM_Test" : [dt_acc_base_test_sm, dt_recall_base_test_sm, dt_prec_base_test_sm, dt_f1_base_test_sm],
    "RF_SM_Train": [rf_acc_base_train_sm, rf_recall_base_train_sm, rf_prec_base_train_sm, rf_f1_base_train_sm],
    "RF_SM_Test" : [rf_acc_base_test_sm, rf_recall_base_test_sm, rf_prec_base_test_sm, rf_f1_base_test_sm],
    "XGB_SM_Train": [xgb_acc_base_train_sm, xgb_recall_base_train_sm, xgb_prec_base_train_sm, xgb_f1_base_train_sm],
    "XGB_SM_Test" : [xgb_acc_base_test_sm, xgb_recall_base_test_sm, xgb_prec_base_test_sm, xgb_f1_base_test_sm]
    }
base_sm_matrix = pd.DataFrame(data = base_sm, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

In [202]:
summary = {
    "Accuracy" : [dt_acc_base_train_os, dt_acc_base_test_os, 
                  rf_acc_base_train_os, rf_acc_base_test_os,
                  xgb_acc_base_train_os, xgb_acc_base_test_os,
                  dt_acc_base_train_sm, dt_acc_base_test_sm,
                  rf_acc_base_train_sm, rf_acc_base_test_sm, 
                  xgb_acc_base_train_sm, xgb_acc_base_test_sm],
    
    "Precision" : [dt_prec_base_train_os, dt_prec_base_test_os,
                   rf_prec_base_train_os, rf_prec_base_test_os,
                   xgb_prec_base_train_os, xgb_prec_base_test_os,
                   dt_prec_base_train_sm, dt_prec_base_test_sm,
                   rf_prec_base_train_sm, rf_prec_base_test_sm,
                   xgb_prec_base_train_sm, xgb_prec_base_test_sm],
    
    "Recall": [dt_recall_base_train_os, dt_recall_base_test_os,
               rf_recall_base_train_os, rf_recall_base_test_os, 
               xgb_recall_base_train_os, xgb_recall_base_test_os,
               dt_recall_base_train_sm, dt_recall_base_test_sm,
               rf_recall_base_train_sm, rf_recall_base_test_sm, 
               xgb_recall_base_train_sm, xgb_recall_base_test_sm],
    
    "F1 Score": [dt_f1_base_train_os, dt_f1_base_test_os, 
                 rf_f1_base_train_os, rf_f1_base_test_os,
                 xgb_f1_base_train_os, xgb_f1_base_test_os,
                 dt_f1_base_train_sm, dt_f1_base_test_sm, 
                 rf_f1_base_train_sm, rf_f1_base_test_sm,
                 xgb_f1_base_train_sm, xgb_f1_base_test_sm]}
    
sum_matrix = pd.DataFrame(data = summary, index = ['Decision Tree Train OS', 'Decision Tree Test OS',
                                                   'Random Forest Train OS', 'Random Forest Test OS',
                                                   'XGBoost Train OS', 'XGBoost Test OS', 
                                                   'Decision Tree Train SM', 'Decision Tree Test SM',
                                                   'Random Forest Train SM', 'Random Forest Test SM',
                                                   'XGBoost Train SM', 'XGBoost Test SM'])
# sum_matrix.sort_values('F1 Score', ascending = False)
sum_matrix

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Decision Tree Train OS,1.0,1.0,1.0,1.0
Decision Tree Test OS,0.90105,0.723757,0.503846,0.594104
Random Forest Train OS,1.0,1.0,1.0,1.0
Random Forest Test OS,0.964621,0.966667,0.780769,0.86383
XGBoost Train OS,0.999112,0.998227,1.0,0.999113
XGBoost Test OS,0.923162,0.827027,0.588462,0.68764
Decision Tree Train SM,1.0,1.0,1.0,1.0
Decision Tree Test SM,0.789386,0.251029,0.234615,0.242545
Random Forest Train SM,1.0,1.0,1.0,1.0
Random Forest Test SM,0.878386,0.77027,0.219231,0.341317


In [203]:
base_os_matrix
#XGB with Random Over Sampling

Unnamed: 0,DT_OS_Train,DT_OS_Test,RF_OS_Train,RF_OS_Test,XGB_OS_Train,XGB_OS_Test
Accuracy,1.0,0.90105,1.0,0.964621,0.999112,0.923162
Recall,1.0,0.503846,1.0,0.780769,1.0,0.588462
Precision,1.0,0.723757,1.0,0.966667,0.998227,0.827027
F1 Score,1.0,0.594104,1.0,0.86383,0.999113,0.68764


In [204]:
base_sm_matrix

Unnamed: 0,DT_SM_Train,DT_SM_Test,RF_SM_Train,RF_SM_Test,XGB_SM_Train,XGB_SM_Test
Accuracy,1.0,0.789386,1.0,0.878386,0.999758,0.865672
Recall,1.0,0.234615,1.0,0.219231,0.999516,0.169231
Precision,1.0,0.251029,1.0,0.77027,1.0,0.619718
F1 Score,1.0,0.242545,1.0,0.341317,0.999758,0.265861


In [205]:
cm = {
    "True Positive" : [tp_svm_os, tp_svm_os_std, tp_svm_os_mm, tp_svm_os_rb, 
                       tp_svm_sm, tp_svm_sm_std, tp_svm_sm_mm, tp_svm_sm_rb,
                       tp_logreg_os, tp_logreg_os_std, tp_logreg_os_mm, tp_logreg_os_rb, 
                       tp_logreg_sm, tp_logreg_sm_std, tp_logreg_sm_mm, tp_logreg_sm_rb,
                       tp_dt_os, tp_dt_sm, tp_rf_os, tp_rf_sm, tp_xgb_os, tp_xgb_sm],
    
    "True Negative" : [tn_svm_os, tn_svm_os_std, tn_svm_os_mm, tn_svm_os_rb, 
                       tn_svm_sm, tn_svm_sm_std, tn_svm_sm_mm, tn_svm_sm_rb,
                       tn_logreg_os, tn_logreg_os_std, tn_logreg_os_mm, tn_logreg_os_rb, 
                       tn_logreg_sm, tn_logreg_sm_std, tn_logreg_sm_mm, tn_logreg_sm_rb,
                       tn_dt_os, tn_dt_sm, tn_rf_os, tn_rf_sm, tn_xgb_os, tn_xgb_sm],
    
    "False Positive": [fp_svm_os, fp_svm_os_std, fp_svm_os_mm, fp_svm_os_rb, 
                       fp_svm_sm, fp_svm_sm_std, fp_svm_sm_mm, fp_svm_sm_rb,
                       fp_logreg_os, fp_logreg_os_std, fp_logreg_os_mm, fp_logreg_os_rb, 
                       fp_logreg_sm, fp_logreg_sm_std, fp_logreg_sm_mm, fp_logreg_sm_rb,
                       fp_dt_os, fp_dt_sm, fp_rf_os, fp_rf_sm, fp_xgb_os, fp_xgb_sm],
    
    "False Negative": [fn_svm_os, fn_svm_os_std, fn_svm_os_mm, fn_svm_os_rb, 
                       fn_svm_sm, fn_svm_sm_std, fn_svm_sm_mm, fn_svm_sm_rb,
                       fn_logreg_os, fn_logreg_os_std, fn_logreg_os_mm, fn_logreg_os_rb, 
                       fn_logreg_sm, fn_logreg_sm_std, fn_logreg_sm_mm, fn_logreg_sm_rb,
                       fn_dt_os, fn_dt_sm, fn_rf_os, fn_rf_sm, fn_xgb_os, fn_xgb_sm]
}
    
cm_matrix = pd.DataFrame(data = cm, index = ['SVM OS', 'SVM OS Standard', 'SVM OS MinMax', 'SVM OS Robust',
                                             'SVM SM', 'SVM SM Standard', 'SVM SM MinMax', 'SVM SM Robust',
                                             'LogReg OS', 'Logreg OS Standard', 'Logreg OS MinMax', 'Logreg OS Robust',
                                             'LogReg SM', 'Logreg SM Standard', 'Logreg SM MinMax', 'Logreg SM Robust',
                                             'Decision Tree OS', 'Decision Tree SM', 
                                             'Random Forest OS', 'Random Forest SM',
                                             'XGBoost OS', 'XGBoost SM'])
cm_matrix.sort_values('False Negative')
#SVM with Random Over Sampling and Standard Scaling

Unnamed: 0,True Positive,True Negative,False Positive,False Negative
SVM OS Standard,240,1344,205,20
SVM OS Robust,240,1342,207,20
SVM OS MinMax,238,1307,242,22
Random Forest OS,203,1542,7,57
Logreg OS Standard,194,1090,459,66
Logreg OS Robust,191,1095,454,69
LogReg OS,191,1095,454,69
Logreg OS MinMax,184,1124,425,76
SVM OS,183,658,891,77
SVM SM,182,668,881,78
