# Import Library

In [2]:
import numpy as np
import pandas as pd

import time
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
import seaborn as sns

# Load Data

In [4]:
data = pd.read_csv('PredictionInsurance.csv')
data.head(1)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28,0,> 2 Years,Yes,40454,26,217,1


In [5]:
data.shape

(381109, 12)

In [6]:
data.isna().sum()

id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

In [7]:
data.duplicated().sum()

0

In [8]:
data.dtypes

id                       int64
Gender                  object
Age                      int64
Driving_License          int64
Region_Code              int64
Previously_Insured       int64
Vehicle_Age             object
Vehicle_Damage          object
Annual_Premium           int64
Policy_Sales_Channel     int64
Vintage                  int64
Response                 int64
dtype: object

In [9]:
data['Response'].value_counts()

Response
0    334399
1     46710
Name: count, dtype: int64

# EDA

In [10]:
data.groupby(['Previously_Insured','Response']).agg({'id':'count'}).unstack()

Unnamed: 0_level_0,id,id
Response,0,1
Previously_Insured,Unnamed: 1_level_2,Unnamed: 2_level_2
0,159929,46552
1,174470,158


In [11]:
data.groupby(['Gender','Response']).agg({'id':'count'}).unstack()

Unnamed: 0_level_0,id,id
Response,0,1
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2
Female,156835,18185
Male,177564,28525


In [12]:
data.groupby(['Driving_License','Response']).agg({'id':'count'}).unstack()

Unnamed: 0_level_0,id,id
Response,0,1
Driving_License,Unnamed: 1_level_2,Unnamed: 2_level_2
0,771,41
1,333628,46669


In [13]:
data = data[['Gender','Previously_Insured','Driving_License','Response']]
data.head()

Unnamed: 0,Gender,Previously_Insured,Driving_License,Response
0,Male,0,1,1
1,Male,0,1,0
2,Male,0,1,1
3,Male,1,1,0
4,Female,1,1,0


# Preprocessing

In [14]:
data['Gender'] = data['Gender'].map({'Male':0, 'Female':1})
data.head()

Unnamed: 0,Gender,Previously_Insured,Driving_License,Response
0,0,0,1,1
1,0,0,1,0
2,0,0,1,1
3,0,1,1,0
4,1,1,1,0


# Data Modeling

In [15]:
X = data.drop(['Response'], axis=1)
y = data['Response']

Implementing SMOTE for Imbalance Dataset

In [16]:
smote = SMOTE(sampling_strategy='minority')
X, y = smote.fit_resample(X, y)
y.value_counts()

Response
1    334399
0    334399
Name: count, dtype: int64

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
start = time.time()
model = DecisionTreeClassifier(random_state=0)

model.fit(X_train, y_train)
stop = time.time()
print(f"Training time {stop-start} Sekon")

Training time 0.4192168712615967 Sekon


In [19]:
accuracy = model.score(X_train, y_train)
print(accuracy)

0.7591386032393961


# Model Export

In [21]:
with open('modelDT','wb') as file:
    pickle.dump(model, file)

# Model Evaluation

In [20]:
y_predict = model.predict(X_test)
print(classification_report(y_predict, y_test))

              precision    recall  f1-score   support

           0       0.53      0.99      0.69     35417
           1       1.00      0.68      0.81     98343

    accuracy                           0.76    133760
   macro avg       0.76      0.83      0.75    133760
weighted avg       0.87      0.76      0.78    133760

