In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import resample
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

In [28]:
df = pd.read_csv("train.csv")
df.set_index('id', inplace = True)
df.index.name = None
df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [29]:
#Encode
df['Gender'] = df['Gender'].map({'Male' : 0, 'Female' : 1})
df['Vehicle_Age'] = df['Vehicle_Age'].map({'< 1 Year' : 0, '1-2 Year' : 1, '> 2 Years' : 2})
df['Vehicle_Damage'] = df['Vehicle_Damage'].map({'No' : 0, 'Yes' : 1})
df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
1,0,44,1,28.0,0,2,1,40454.0,26.0,217,1
2,0,76,1,3.0,0,1,0,33536.0,26.0,183,0
3,0,47,1,28.0,0,2,1,38294.0,26.0,27,1
4,0,21,1,11.0,1,0,0,28619.0,152.0,203,0
5,1,29,1,41.0,1,0,0,27496.0,152.0,39,0


In [30]:
cor = df.corr()
cor['Response'].sort_values(ascending = False)[1:]

Vehicle_Damage          0.354400
Vehicle_Age             0.221874
Age                     0.111147
Annual_Premium          0.022575
Region_Code             0.010570
Driving_License         0.010155
Vintage                -0.001050
Gender                 -0.052440
Policy_Sales_Channel   -0.139042
Previously_Insured     -0.341170
Name: Response, dtype: float64

In [31]:
x = df.drop(columns = ['Response'] , axis = 1)
y = df['Response']

In [32]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, stratify= y, random_state = 42)

In [33]:
y_train.value_counts()

0    267519
1     37368
Name: Response, dtype: int64

### Random Over Sampling

In [34]:
df_train = pd.concat([x_train, y_train], axis=1)
not_renewal = df_train[df_train['Response'] == 0]
renewal = df_train[df_train['Response'] == 1]

renewal_oversample = resample(renewal, replace=True, n_samples = len(not_renewal), random_state = 42)
df_OverSampled = pd.concat([not_renewal, renewal_oversample])
df_OverSampled['Response'].value_counts()

1    267519
0    267519
Name: Response, dtype: int64

In [35]:
x_train_os = df_OverSampled.drop(columns = ['Response'])
y_train_os = df_OverSampled['Response']

### Smote

In [36]:
sm = SMOTE(random_state=42)
x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)
x = pd.DataFrame(data = x_train_sm, columns = x_train.columns)
y = pd.DataFrame(data = y_train_sm, columns = ['Response'])
df_smote = x.join(y)
# df_smote = pd.concat([x_train_sm, y_train_sm], axis = 1)
df_smote['Response'].value_counts()

1    267519
0    267519
Name: Response, dtype: int64

## Random Forest

In [37]:
random_forest = RandomForestClassifier()

### Random Forest Random Over Sampling

In [38]:
random_forest.fit(x_train_os, y_train_os)

RandomForestClassifier()

In [39]:
pred_train_os = random_forest.predict(x_train_os)
pred_test_os = random_forest.predict(x_test)

In [40]:
rf_acc_base_train_os = accuracy_score(y_train_os, pred_train_os)
rf_acc_base_test_os = accuracy_score(y_test, pred_test_os)
rf_recall_base_train_os = recall_score(y_train_os, pred_train_os)
rf_recall_base_test_os = recall_score(y_test, pred_test_os)
rf_prec_base_train_os = precision_score(y_train_os, pred_train_os)
rf_prec_base_test_os = precision_score(y_test, pred_test_os)
rf_f1_base_train_os = f1_score(y_train_os, pred_train_os)
rf_f1_base_test_os = f1_score(y_test, pred_test_os)

In [41]:
cm_rf_base_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_rf_base_os = pd.DataFrame(data=cm_rf_base_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_rf_base_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,2492,6850
Akt 0,4774,62106


In [42]:
print(classification_report(pred_test_os, y_test))

              precision    recall  f1-score   support

           0       0.93      0.90      0.91     68956
           1       0.27      0.34      0.30      7266

    accuracy                           0.85     76222
   macro avg       0.60      0.62      0.61     76222
weighted avg       0.87      0.85      0.86     76222



In [43]:
tp_rf_os = cm_rf_base_os['Pred 1'][0]
tn_rf_os = cm_rf_base_os['Pred 0'][1]
fp_rf_os = cm_rf_base_os['Pred 1'][1]
fn_rf_os = cm_rf_base_os['Pred 0'][0]

### Random Forest Smote

In [44]:
random_forest.fit(x_train_sm, y_train_sm)

RandomForestClassifier()

In [45]:
pred_train_sm = random_forest.predict(x_train_sm)
pred_test_sm = random_forest.predict(x_test)

In [46]:
rf_acc_base_train_sm = accuracy_score(y_train_sm, pred_train_sm)
rf_acc_base_test_sm = accuracy_score(y_test, pred_test_sm)
rf_recall_base_train_sm = recall_score(y_train_sm, pred_train_sm)
rf_recall_base_test_sm = recall_score(y_test, pred_test_sm)
rf_prec_base_train_sm = precision_score(y_train_sm, pred_train_sm)
rf_prec_base_test_sm = precision_score(y_test, pred_test_sm)
rf_f1_base_train_sm = f1_score(y_train_sm, pred_train_sm)
rf_f1_base_test_sm = f1_score(y_test, pred_test_sm)

In [47]:
cm_rf_base_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_rf_base_sm = pd.DataFrame(data=cm_rf_base_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_rf_base_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,3990,5352
Akt 0,8543,58337


In [48]:
print(classification_report(pred_test_sm, y_test))

              precision    recall  f1-score   support

           0       0.87      0.92      0.89     63689
           1       0.43      0.32      0.36     12533

    accuracy                           0.82     76222
   macro avg       0.65      0.62      0.63     76222
weighted avg       0.80      0.82      0.81     76222



In [49]:
tp_rf_sm = cm_rf_base_sm['Pred 1'][0]
tn_rf_sm = cm_rf_base_sm['Pred 0'][1]
fp_rf_sm = cm_rf_base_sm['Pred 1'][1]
fn_rf_sm = cm_rf_base_sm['Pred 0'][0]

## XGBoost

In [50]:
xgb = XGBClassifier()

### XGBoost Random Over Sampling

In [51]:
xgb.fit(x_train_os, y_train_os)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [52]:
pred_train_os = xgb.predict(x_train_os)
pred_test_os = xgb.predict(x_test)

In [53]:
xgb_acc_base_train_os = accuracy_score(y_train_os, pred_train_os)
xgb_acc_base_test_os = accuracy_score(y_test, pred_test_os)
xgb_recall_base_train_os = recall_score(y_train_os, pred_train_os)
xgb_recall_base_test_os = recall_score(y_test, pred_test_os)
xgb_prec_base_train_os = precision_score(y_train_os, pred_train_os)
xgb_prec_base_test_os = precision_score(y_test, pred_test_os)
xgb_f1_base_train_os = f1_score(y_train_os, pred_train_os)
xgb_f1_base_test_os = f1_score(y_test, pred_test_os)

In [54]:
cm_xgb_base_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])
cm_xgb_base_os = pd.DataFrame(data=cm_xgb_base_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_xgb_base_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,8447,895
Akt 0,20669,46211


In [55]:
print(classification_report(pred_test_os, y_test))

              precision    recall  f1-score   support

           0       0.69      0.98      0.81     47106
           1       0.90      0.29      0.44     29116

    accuracy                           0.72     76222
   macro avg       0.80      0.64      0.63     76222
weighted avg       0.77      0.72      0.67     76222



In [56]:
tp_xgb_os = cm_xgb_base_os['Pred 1'][0]
tn_xgb_os = cm_xgb_base_os['Pred 0'][1]
fp_xgb_os = cm_xgb_base_os['Pred 1'][1]
fn_xgb_os = cm_xgb_base_os['Pred 0'][0]

### XGBoost Smote

In [57]:
xgb.fit(x_train_sm, y_train_sm)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [58]:
pred_train_sm = xgb.predict(x_train_sm)
pred_test_sm = xgb.predict(x_test)

In [59]:
xgb_acc_base_train_sm = accuracy_score(y_train_sm, pred_train_sm)
xgb_acc_base_test_sm = accuracy_score(y_test, pred_test_sm)
xgb_recall_base_train_sm = recall_score(y_train_sm, pred_train_sm)
xgb_recall_base_test_sm = recall_score(y_test, pred_test_sm)
xgb_prec_base_train_sm = precision_score(y_train_sm, pred_train_sm)
xgb_prec_base_test_sm = precision_score(y_test, pred_test_sm)
xgb_f1_base_train_sm = f1_score(y_train_sm, pred_train_sm)
xgb_f1_base_test_sm = f1_score(y_test, pred_test_sm)

In [60]:
cm_xgb_base_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])
cm_xgb_base_sm = pd.DataFrame(data=cm_xgb_base_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_xgb_base_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,3622,5720
Akt 0,6791,60089


In [61]:
print(classification_report(pred_test_sm, y_test))

              precision    recall  f1-score   support

           0       0.90      0.91      0.91     65809
           1       0.39      0.35      0.37     10413

    accuracy                           0.84     76222
   macro avg       0.64      0.63      0.64     76222
weighted avg       0.83      0.84      0.83     76222



In [62]:
tp_xgb_sm = cm_xgb_base_sm['Pred 1'][0]
tn_xgb_sm = cm_xgb_base_sm['Pred 0'][1]
fp_xgb_sm = cm_xgb_base_sm['Pred 1'][1]
fn_xgb_sm = cm_xgb_base_sm['Pred 0'][0]

### Evaluation Matrix

In [63]:
base = {
    "RF_OS_Train": [rf_acc_base_train_os, rf_recall_base_train_os, rf_prec_base_train_os, rf_f1_base_train_os],
    "RF_OS_Test" : [rf_acc_base_test_os, rf_recall_base_test_os, rf_prec_base_test_os, rf_f1_base_test_os],
    "RF_SM_Train": [rf_acc_base_train_sm, rf_recall_base_train_sm, rf_prec_base_train_sm, rf_f1_base_train_sm],
    "RF_SM_Test" : [rf_acc_base_test_sm, rf_recall_base_test_sm, rf_prec_base_test_sm, rf_f1_base_test_sm],
    "XGB_OS_Train": [xgb_acc_base_train_os, xgb_recall_base_train_os, xgb_prec_base_train_os, xgb_f1_base_train_os],
    "XGB_OS_Test" : [xgb_acc_base_test_os, xgb_recall_base_test_os, xgb_prec_base_test_os, xgb_f1_base_test_os],
    "XGB_SM_Train": [xgb_acc_base_train_sm, xgb_recall_base_train_sm, xgb_prec_base_train_sm, xgb_f1_base_train_sm],
    "XGB_SM_Test" : [xgb_acc_base_test_sm, xgb_recall_base_test_sm, xgb_prec_base_test_sm, xgb_f1_base_test_sm]
    }
base_matrix = pd.DataFrame(data = base, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])
base_matrix

Unnamed: 0,RF_OS_Train,RF_OS_Test,RF_SM_Train,RF_SM_Test,XGB_OS_Train,XGB_OS_Test,XGB_SM_Train,XGB_SM_Test
Accuracy,0.999936,0.847498,0.999809,0.817704,0.819409,0.71709,0.896837,0.835861
Recall,1.0,0.266752,0.999914,0.427103,0.944228,0.904196,0.893353,0.387711
Precision,0.999873,0.342967,0.999705,0.31836,0.755601,0.290115,0.899622,0.347834
F1 Score,0.999936,0.300096,0.999809,0.3648,0.839449,0.439284,0.896477,0.366692


In [64]:
cm = {
    "True Positive" : [tp_rf_os, tp_rf_sm, tp_xgb_os, tp_xgb_sm],
    
    "True Negative" : [tn_rf_os, tn_rf_sm, tn_xgb_os, tn_xgb_sm],
    
    "False Positive": [fp_rf_os, fp_rf_sm, fp_xgb_os, fp_xgb_sm],
    
    "False Negative": [fn_rf_os, fn_rf_sm, fn_xgb_os, fn_xgb_sm]
}
    
cm_matrix = pd.DataFrame(data = cm, index = ['Random Forest OS', 'Random Forest SM',
                                             'XGBoost OS', 'XGBoost SM'])
cm_matrix.sort_values('False Negative')

Unnamed: 0,True Positive,True Negative,False Positive,False Negative
XGBoost OS,8447,46211,20669,895
Random Forest SM,3990,58337,8543,5352
XGBoost SM,3622,60089,6791,5720
Random Forest OS,2492,62106,4774,6850
