### Technique We will have to follow:
1. PCA - Principal Component Analysis

In [1]:
#### Import all necessity functions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.cluster.hierarchy as shc
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, silhouette_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from xgboost import XGBClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.ensemble import EasyEnsembleClassifier

In [2]:
###### import the dataset ######
df = pd.read_csv('aug_train.csv')
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,167647,Male,22,1,7.0,1,< 1 Year,No,2630.0,152.0,16,0
1,17163,Male,42,1,28.0,0,1-2 Year,Yes,43327.0,26.0,135,0
2,32023,Female,66,1,33.0,0,1-2 Year,Yes,35841.0,124.0,253,0
3,87447,Female,22,1,33.0,0,< 1 Year,No,27645.0,152.0,69,0
4,501933,Male,28,1,46.0,1,< 1 Year,No,29023.0,152.0,211,0


In [3]:
##### Find the value counts of TARGET column
df.loc[:, 'Response'].value_counts()
##### It is an imbalanced dataset

0    319553
1     62601
Name: Response, dtype: int64

In [4]:
df.loc[:, 'Vehicle_Age'] = df.loc[:, 'Vehicle_Age'].map({value_: index_ for index_, value_, in  enumerate(list(df.groupby(['Vehicle_Age'])['Response'].mean().sort_values(ascending = True).index))})
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,167647,Male,22,1,7.0,1,0,No,2630.0,152.0,16,0
1,17163,Male,42,1,28.0,0,1,Yes,43327.0,26.0,135,0
2,32023,Female,66,1,33.0,0,1,Yes,35841.0,124.0,253,0
3,87447,Female,22,1,33.0,0,0,No,27645.0,152.0,69,0
4,501933,Male,28,1,46.0,1,0,No,29023.0,152.0,211,0


In [5]:
df.loc[:, 'Vehicle_Damage'] = df.loc[:, 'Vehicle_Damage'].map({'No': 0, 'Yes': 1})
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,167647,Male,22,1,7.0,1,0,0,2630.0,152.0,16,0
1,17163,Male,42,1,28.0,0,1,1,43327.0,26.0,135,0
2,32023,Female,66,1,33.0,0,1,1,35841.0,124.0,253,0
3,87447,Female,22,1,33.0,0,0,0,27645.0,152.0,69,0
4,501933,Male,28,1,46.0,1,0,0,29023.0,152.0,211,0


In [6]:
##### delete the ID column
df.drop(['id'], axis = 1, inplace = True)
print("ID column is dropped.")

ID column is dropped.


In [7]:
##### Convert the Gender column 
df.loc[:, 'Gender'] = df.loc[:, 'Gender'].map({'Male': 0, 'Female': 1})
df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,22,1,7.0,1,0,0,2630.0,152.0,16,0
1,0,42,1,28.0,0,1,1,43327.0,26.0,135,0
2,1,66,1,33.0,0,1,1,35841.0,124.0,253,0
3,1,22,1,33.0,0,0,0,27645.0,152.0,69,0
4,0,28,1,46.0,1,0,0,29023.0,152.0,211,0


In [8]:
##### Implementing Principal Component Analysis #####
from sklearn.decomposition import PCA

In [9]:
y = df.iloc[:, -1]
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Response, dtype: int64

In [10]:
##### Scaling the dataset
scaler_ = StandardScaler()
df = pd.DataFrame(scaler_.fit_transform(df.iloc[:, :-1]), columns = df.columns[:-1])
df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,-0.92666,-1.086611,0.043537,-1.472312,1.021874,-1.069485,-0.961873,-1.645878,0.737941,-1.650319
1,-0.92666,0.226856,0.043537,0.120864,-0.978594,0.696557,1.039638,0.739424,-1.583081,-0.229169
2,1.079145,1.803016,0.043537,0.500191,-0.978594,0.696557,1.039638,0.30066,0.222158,1.180039
3,1.079145,-1.086611,0.043537,0.500191,-0.978594,-1.069485,-0.961873,-0.179718,0.737941,-1.017369
4,-0.92666,-0.692571,0.043537,1.486443,1.021874,-1.069485,-0.961873,-0.098952,0.737941,0.678457


In [11]:
df = pd.concat([df, y], axis = 1)

In [12]:
df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,-0.92666,-1.086611,0.043537,-1.472312,1.021874,-1.069485,-0.961873,-1.645878,0.737941,-1.650319,0
1,-0.92666,0.226856,0.043537,0.120864,-0.978594,0.696557,1.039638,0.739424,-1.583081,-0.229169,0
2,1.079145,1.803016,0.043537,0.500191,-0.978594,0.696557,1.039638,0.30066,0.222158,1.180039,0
3,1.079145,-1.086611,0.043537,0.500191,-0.978594,-1.069485,-0.961873,-0.179718,0.737941,-1.017369,0
4,-0.92666,-0.692571,0.043537,1.486443,1.021874,-1.069485,-0.961873,-0.098952,0.737941,0.678457,0


In [13]:
##### X and Y
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

##### split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [14]:
##### Call the PCA #####
pca_ = PCA( n_components = 7)
X_train = pca_.fit_transform(X_train)
X_test  = pca_.transform(X_test)

In [15]:
np.cumsum(pca_.explained_variance_ratio_)

array([0.29158575, 0.42308056, 0.52397594, 0.62400921, 0.72365016,
       0.82253765, 0.9161708 ])

In [16]:
y_train

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [17]:
# easy_ensemble_ = EasyEnsembleClassifier()
# easy_ensemble_.fit(X_train, y_train)
# predicted_ = easy_ensemble_.predict(X_test)
# print("accuracy is = ", accuracy_score(predicted_, y_test))
# print("recall is = ", recall_score(predicted_, y_test))
# print("precision is = ", precision_score(predicted_, y_test))
# print("f1 score is = ", f1_score(predicted_, y_test))

In [18]:
from imblearn.ensemble import BalancedBaggingClassifier
decision_tree_ = BalancedBaggingClassifier()
decision_tree_.fit(X_train, y_train)
predicted_ = decision_tree_.predict(X_test)
print("accuracy is = ", accuracy_score(predicted_, y_test))
print("recall is = ", recall_score(predicted_, y_test))
print("precision is = ", precision_score(predicted_, y_test))
print("f1 score is = ", f1_score(predicted_, y_test))

accuracy is =  0.7880712098877424
recall is =  0.42105724555097745
precision is =  0.7626230549380756
f1 score is =  0.5425585992657441


In [19]:
##### ADASYN upper sampling #####
# adasyn_ = ADASYN()
# X_train, y_train = adasyn_.fit_resample(X_train, y_train)

##### Border line #####
# border_line_ = BorderlineSMOTE(random_state = 42)
# X_train, y_train = border_line_.fit_resample(X_train, y_train)

##### SMOTE #####

# smote_ = SMOTE(random_state = 42)
# X_train, y_train = smote_.fit_resample(X_train, y_train)

smote_ = SVMSMOTE(random_state = 42)
X_train, y_train = smote_.fit_resample(X_train, y_train)

In [20]:
X_train.shape, y_train.shape

((447600, 7), (447600,))

In [21]:
decision_tree_ = DecisionTreeClassifier()
decision_tree_.fit(X_train, y_train)
predicted_ = decision_tree_.predict(X_test)
print("accuracy is = ", accuracy_score(predicted_, y_test))
print("recall is = ", recall_score(predicted_, y_test))
print("precision is = ", precision_score(predicted_, y_test))
print("f1 score is = ", f1_score(predicted_, y_test))

accuracy is =  0.7916822943469956
recall is =  0.40455328104074995
precision is =  0.5595956388271409
f1 score is =  0.4696084745386307


In [22]:
decision_tree_ = RandomForestClassifier(class_weight = 'balanced')
decision_tree_.fit(X_train, y_train)
predicted_ = decision_tree_.predict(X_test)
print("accuracy is = ", accuracy_score(predicted_, y_test))
print("recall is = ", recall_score(predicted_, y_test))
print("precision is = ", precision_score(predicted_, y_test))
print("f1 score is = ", f1_score(predicted_, y_test))

accuracy is =  0.8026812738231266
recall is =  0.4382985766302549
precision is =  0.7008044881973113
f1 score is =  0.5393043336591724


In [23]:
decision_tree_ = GradientBoostingClassifier()
decision_tree_.fit(X_train, y_train)
predicted_ = decision_tree_.predict(X_test)
print("accuracy is = ", accuracy_score(predicted_, y_test))
print("recall is = ", recall_score(predicted_, y_test))
print("precision is = ", precision_score(predicted_, y_test))
print("f1 score is = ", f1_score(predicted_, y_test))

accuracy is =  0.762889565361501
recall is =  0.4043829296424452
precision is =  0.9278077696623267
f1 score is =  0.5632671422145106


In [24]:
decision_tree_ = XGBClassifier()
decision_tree_.fit(X_train, y_train)
predicted_ = decision_tree_.predict(X_test)
print("accuracy is = ", accuracy_score(predicted_, y_test))
print("recall is = ", recall_score(predicted_, y_test))
print("precision is = ", precision_score(predicted_, y_test))
print("f1 score is = ", f1_score(predicted_, y_test))

accuracy is =  0.7760778738213822
recall is =  0.4170379436964504
precision is =  0.9016619032497089
f1 score is =  0.570299946438136


In [25]:
decision_tree_ = AdaBoostClassifier()
decision_tree_.fit(X_train, y_train)
predicted_ = decision_tree_.predict(X_test)
print("accuracy is = ", accuracy_score(predicted_, y_test))
print("recall is = ", recall_score(predicted_, y_test))
print("precision is = ", precision_score(predicted_, y_test))
print("f1 score is = ", f1_score(predicted_, y_test))

accuracy is =  0.7599326628695038
recall is =  0.4006585158066819
precision is =  0.9209802053561977
f1 score is =  0.5583955074207781
