In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
import pickle

import warnings
# Silences every warning for the rest of the session, including deprecation
# notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.ensemble import GradientBoostingClassifier

In [4]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("Homework - Prediction Insurance.csv")

In [5]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28,0,> 2 Years,Yes,40454,26,217,1
1,2,Male,76,1,3,0,1-2 Year,No,33536,26,183,0
2,3,Male,47,1,28,0,> 2 Years,Yes,38294,26,27,1
3,4,Male,21,1,11,1,< 1 Year,No,28619,152,203,0
4,5,Female,29,1,41,1,< 1 Year,No,27496,152,39,0


In [6]:
# Column dtypes and non-null counts (used to spot missing values and string columns).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   id                    381109 non-null  int64 
 1   Gender                381109 non-null  object
 2   Age                   381109 non-null  int64 
 3   Driving_License       381109 non-null  int64 
 4   Region_Code           381109 non-null  int64 
 5   Previously_Insured    381109 non-null  int64 
 6   Vehicle_Age           381109 non-null  object
 7   Vehicle_Damage        381109 non-null  object
 8   Annual_Premium        381109 non-null  int64 
 9   Policy_Sales_Channel  381109 non-null  int64 
 10  Vintage               381109 non-null  int64 
 11  Response              381109 non-null  int64 
dtypes: int64(9), object(3)
memory usage: 34.9+ MB


In [7]:
# Summary statistics for a hand-picked subset of the integer columns
# (id, Region_Code and Response are left out of the selection).
df[['Age', 'Driving_License', 'Previously_Insured', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']].describe()

Unnamed: 0,Age,Driving_License,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage
count,381109.0,381109.0,381109.0,381109.0,381109.0,381109.0
mean,38.822584,0.997869,0.45821,30564.389581,112.034295,154.347397
std,15.511611,0.04611,0.498251,17213.155057,54.203995,83.671304
min,20.0,0.0,0.0,2630.0,1.0,10.0
25%,25.0,1.0,0.0,24405.0,29.0,82.0
50%,36.0,1.0,0.0,31669.0,133.0,154.0
75%,49.0,1.0,1.0,39400.0,152.0,227.0
max,85.0,1.0,1.0,540165.0,163.0,299.0


In [8]:
# Distinct labels of Gender, one of the columns that is one-hot encoded below.
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [9]:
# Distinct labels of Vehicle_Age, one of the columns that is one-hot encoded below.
df['Vehicle_Age'].unique()

array(['> 2 Years', '1-2 Year', '< 1 Year'], dtype=object)

In [10]:
# Distinct labels of Vehicle_Damage, one of the columns that is one-hot encoded below.
df['Vehicle_Damage'].unique()

array(['Yes', 'No'], dtype=object)

In [11]:
# Distinct Policy_Sales_Channel codes; stored as integers but later treated as
# a categorical column in the chi-square test.
df['Policy_Sales_Channel'].unique()

array([ 26, 152, 160, 124,  14,  13,  30, 156, 163, 157, 122,  19,  22,
        15, 154,  16,  52, 155,  11, 151, 125,  25,  61,   1,  86,  31,
       150,  23,  60,  21, 121,   3, 139,  12,  29,  55,   7,  47, 127,
       153,  78, 158,  89,  32,   8,  10, 120,  65,   4,  42,  83, 136,
        24,  18,  56,  48, 106,  54,  93, 116,  91,  45,   9, 145, 147,
        44, 109,  37, 140, 107, 128, 131, 114, 118, 159, 119, 105, 135,
        62, 138, 129,  88,  92, 111, 113,  73,  36,  28,  35,  59,  53,
       148, 133, 108,  64,  39,  94, 132,  46,  81, 103,  90,  51,  27,
       146,  63,  96,  40,  66, 100,  95, 123,  98,  75,  69, 130, 134,
        49,  97,  38,  17, 110,  80,  71, 117,  58,  20,  76, 104,  87,
        84, 137, 126,  68,  67, 101, 115,  57,  82,  79, 112,  99,  70,
         2,  34,  33,  74, 102, 149,  43,   6,  50, 144, 143,  41],
      dtype=int64)

In [12]:
# Distinct Region_Code values; stored as integers but later treated as a
# categorical column in the chi-square test.
df['Region_Code'].unique()

array([28,  3, 11, 41, 33,  6, 35, 50, 15, 45,  8, 36, 30, 26, 16, 47, 48,
       19, 39, 23, 37,  5, 17,  2,  7, 29, 46, 27, 25, 13, 18, 20, 49, 22,
       44,  0,  9, 31, 12, 34, 21, 10, 14, 38, 24, 40, 43, 32,  4, 51, 42,
        1, 52], dtype=int64)

In [13]:
# Count rows that are exact duplicates of an earlier row (True) versus not (False).
df.duplicated().value_counts()

False    381109
dtype: int64

# Preprocess

In [68]:
# Work on a copy so the raw frame `df` stays untouched by preprocessing.
df2 = df.copy()

In [69]:
# List the column names available before encoding.
df2.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [70]:
# String-valued columns that get one-hot encoded.
# NOTE(review): `cat` is not referenced by the visible cells below, which
# repeat this list literally — confirm whether it is still needed.
cat = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

In [71]:
# handle_unknown='ignore' keeps transform() from raising on categories that
# were not present when the encoder was fitted.
# NOTE(review): recent scikit-learn releases rename `sparse` to
# `sparse_output` — confirm against the installed version before upgrading.
oneHotEncoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [72]:
# Fit the encoder on the three categorical columns. The fitted encoder is bound
# to `encoded_data_fit`, the name used by the pickling and transform cells below.
encoded_data_fit = oneHotEncoder.fit(df2[['Gender', 'Vehicle_Age', 'Vehicle_Damage']])

In [73]:
# Persist the fitted encoder so prediction code can apply the same encoding.
filename = 'ohe_encoder.pkl'
# Context manager guarantees the file is flushed and closed after the dump.
with open(filename, 'wb') as encoder_file:
    pickle.dump(encoded_data_fit, encoder_file)

In [74]:
# Apply the fitted encoder to the categorical columns of df2.
encoded_data = encoded_data_fit.transform(df2[['Gender', 'Vehicle_Age', 'Vehicle_Damage']])

In [75]:
# Names of the generated one-hot columns, derived from the input column names.
encoded_columns = encoded_data_fit.get_feature_names_out(['Gender', 'Vehicle_Age', 'Vehicle_Damage'])

In [64]:
# Stand-alone frame holding only the one-hot columns.
# NOTE(review): `df_encoded` is not used by the visible cells that follow;
# the columns are attached to df2 directly in the next cell.
df_encoded = pd.DataFrame(encoded_data, columns=encoded_columns)

In [65]:
# Add the one-hot columns to df2 (assignment is positional, row by row).
df2[encoded_columns] = encoded_data

In [66]:
# Remove the row identifier and the original string columns now that their
# one-hot encoded versions are part of df2.
df2 = df2.drop(columns=['id', 'Gender', 'Vehicle_Age', 'Vehicle_Damage'])

# Feature Selection

In [37]:
def perform_anova(df, continuous_columns, label_column):
    """Run a one-way ANOVA of each continuous column against the label.

    For every name in ``continuous_columns`` an OLS model
    ``<column> ~ C(<label_column>)`` is fitted on ``df`` and the p-value of
    the label term is read from the type-II ANOVA table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the continuous columns and the label column.
    continuous_columns : iterable of str
        Column names to test. They are interpolated unquoted into the
        formula, so they must be valid formula identifiers.
    label_column : str
        Name of the grouping (label) column.

    Returns
    -------
    dict
        Mapping ``{column name: p-value}``.
    """
    # Imported locally so the function keeps working even if the notebook-level
    # name `sm` is rebound to something other than statsmodels.api
    # (for example a SMOTE sampler).
    from statsmodels.stats.anova import anova_lm

    anova_results = {}
    for column in continuous_columns:
        model = ols(f'{column} ~ C({label_column})', data=df).fit()
        anova_table = anova_lm(model, typ=2)
        # The first row of the table is the label term. `.iloc[0]` selects it
        # by position explicitly; plain `[0]` on this label-indexed Series
        # relies on a deprecated positional fallback.
        anova_results[column] = anova_table["PR(>F)"].iloc[0]
    return anova_results

In [44]:
continuous_columns = ['Age', 'Annual_Premium', 'Vintage']
label_column = 'Response'

# Perform ANOVA and get p-values.
# Runs on `df`, the raw frame that the chi-square cell also uses; the
# previously passed name `df_2` is never defined in this notebook and raises
# NameError on a fresh kernel.
anova_p_values = perform_anova(df, continuous_columns, label_column)

{'Age': 0.0, 'Annual_Premium': 3.72231514014103e-44, 'Vintage': 0.5167036522730712}


In [45]:
# Display the {column: p-value} mapping returned by perform_anova.
anova_p_values

{'Age': 0.0,
 'Annual_Premium': 3.72231514014103e-44,
 'Vintage': 0.5167036522730712}

In [61]:
def chi_square_test(df, categorical_columns, label_column, alpha=0.05):
    """Split categorical columns by chi-square independence from the label.

    For each column a contingency table against ``label_column`` is built and
    tested with ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the categorical columns and the label column.
    categorical_columns : iterable of str
        Column names to test.
    label_column : str
        Name of the label column.
    alpha : float, default 0.05
        Significance level. A column whose p-value is strictly greater than
        ``alpha`` is reported as insignificant.

    Returns
    -------
    tuple of (list, list)
        ``(insignificant_columns, significant_columns)``, each in the order
        the columns were given.
    """
    insignificant_columns = []
    significant_columns = []
    for column in categorical_columns:
        contingency_table = pd.crosstab(df[column], df[label_column])
        # Only the p-value is needed; statistic, dof and expected are discarded.
        _, p_value, _, _ = chi2_contingency(contingency_table)
        if p_value > alpha:
            insignificant_columns.append(column)
        else:
            significant_columns.append(column)
    return insignificant_columns, significant_columns

In [62]:
# Columns treated as categorical (including integer-coded ones) and the target.
categorical_columns = [
    'Gender',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Policy_Sales_Channel',
]
label_column = 'Response'

# Chi-square test on the raw frame: `not_sig` collects columns with
# p-value > 0.05, `sig` collects the rest.
not_sig, sig = chi_square_test(df, categorical_columns, label_column)

In [63]:
# Columns whose chi-square p-value exceeded the threshold.
not_sig

[]

In [64]:
# Columns whose chi-square p-value was at or below the threshold.
sig

['Gender',
 'Driving_License',
 'Region_Code',
 'Previously_Insured',
 'Vehicle_Age',
 'Vehicle_Damage',
 'Policy_Sales_Channel']

In [44]:
# Vintage is removed from the feature set: its ANOVA p-value against Response
# (about 0.52) is far above the usual 0.05 significance level.
df2 = df2.drop(columns=["Vintage"])

# Split

In [45]:
# Separate the target from the predictors; both are independent copies of df2's data.
data_y = df2['Response'].copy()
data_x = df2.drop(columns='Response').copy()

In [46]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.3,
    random_state=5,
)

In [47]:
# Class counts in the training split (shows how imbalanced the target is).
Counter(y_train)

Counter({0: 233959, 1: 32817})

# SMOTE

In [48]:
from imblearn.over_sampling import SMOTE

# The sampler is named `smote`, not `sm`: `sm` is the alias of
# `statsmodels.api` imported at the top of the notebook, and rebinding it here
# would break any later call that uses `sm.stats`.
# sampling_strategy=0.65 oversamples the minority class to 65% of the
# majority-class count. Only the training split is resampled.
smote = SMOTE(sampling_strategy=0.65, k_neighbors=5, random_state=5)
x_resample, y_resample = smote.fit_resample(x_train, y_train)
Counter(y_resample)

Counter({0: 233959, 1: 152073})

# Model

In [49]:
# Baseline: gradient boosting fitted on the original (imbalanced) training split.
gb_clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.3,
    max_depth=3,
    random_state=5,
)
gb_clf.fit(x_train, y_train)

# Evaluate on the held-out split. The printed labels are in Indonesian
# ("Akurasi" = accuracy); metrics are reported as percentages.
predictiongnb = gb_clf.predict(x_test)
print(confusion_matrix(y_test, predictiongnb))
print("Akurasi dari Gradient Boost adalah: %.2f" % (accuracy_score(y_test, predictiongnb)*100))
print("Recall dari Gradient Boost adalah:", recall_score(y_test, predictiongnb)*100)
print("Precision dari Gradient Boost adalah:", precision_score(y_test, predictiongnb)*100)
print("")

[[100194    246]
 [ 13653    240]]
Akurasi dari Gradient Boost adalah: 87.84
Recall dari Gradient Boost adalah: 1.7274886633556468
Precision dari Gradient Boost adalah: 49.382716049382715



In [50]:
# Compare accuracy on the training and held-out splits to check for
# overfitting, then print the per-class report for the held-out split.
prediction_train = gb_clf.predict(x_train)
training_acc = accuracy_score(y_train, prediction_train)

prediction_test = gb_clf.predict(x_test)
testing_acc = accuracy_score(y_test, prediction_test)

print("Training Accuracy: {}".format(training_acc))
print("Testing Accuracy: {}".format(testing_acc))
print(classification_report(y_test, prediction_test))

Training Accuracy: 0.8782011875131196
Testing Accuracy: 0.8784340479126761
              precision    recall  f1-score   support

           0       0.88      1.00      0.94    100440
           1       0.49      0.02      0.03     13893

    accuracy                           0.88    114333
   macro avg       0.69      0.51      0.48    114333
weighted avg       0.83      0.88      0.83    114333



## Use Resampled Data

In [51]:
# Refit the same estimator on the SMOTE-resampled training data; this replaces
# the model fitted on the imbalanced split.
gb_clf.fit(x_resample, y_resample)

# Evaluation still uses the untouched held-out split. The printed labels are
# in Indonesian ("Akurasi" = accuracy); metrics are reported as percentages.
predictiongnb = gb_clf.predict(x_test)
print(confusion_matrix(y_test, predictiongnb))
print("Akurasi dari Gradient Boost adalah: %.2f" % (accuracy_score(y_test, predictiongnb)*100))
print("Recall dari Gradient Boost adalah:", recall_score(y_test, predictiongnb)*100)
print("Precision dari Gradient Boost adalah:", precision_score(y_test, predictiongnb)*100)
print("")

[[86963 13477]
 [ 7015  6878]]
Akurasi dari Gradient Boost adalah: 82.08
Recall dari Gradient Boost adalah: 49.50694594400058
Precision dari Gradient Boost adalah: 33.790223532301646



In [52]:
# Training accuracy is measured on the resampled data the model was fitted on;
# testing accuracy and the per-class report use the original held-out split.
prediction_train = gb_clf.predict(x_resample)
training_acc = accuracy_score(y_resample, prediction_train)

prediction_test = gb_clf.predict(x_test)
testing_acc = accuracy_score(y_test, prediction_test)

print("Training Accuracy: {}".format(training_acc))
print("Testing Accuracy: {}".format(testing_acc))
print(classification_report(y_test, prediction_test))

Training Accuracy: 0.849082459485224
Testing Accuracy: 0.82076915676139
              precision    recall  f1-score   support

           0       0.93      0.87      0.89    100440
           1       0.34      0.50      0.40     13893

    accuracy                           0.82    114333
   macro avg       0.63      0.68      0.65    114333
weighted avg       0.85      0.82      0.83    114333



In [53]:
# Refit on the resampled data so the exported model is the SMOTE-trained one
# regardless of which fit cell ran last. `model_gb` is bound to whatever
# `fit` returns.
model_gb = gb_clf.fit(x_resample, y_resample)

# Dump Model

In [54]:
# Persist the trained classifier for use by the prediction function.
filename = 'model_class.pkl'
# Context manager guarantees the file is flushed and closed after the dump.
with open(filename, 'wb') as model_file:
    pickle.dump(model_gb, model_file)

# Create Prediction Automation

In [102]:
def to_predict(df, encoder_path='ohe_encoder.pkl', model_path='model_class.pkl'):
    """Score a raw insurance frame with the pickled encoder and classifier.

    The categorical columns are one-hot encoded with the saved encoder, the
    columns that are not model features are dropped, and the saved model's
    predictions are written to a ``prediction`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input with at least the ``Gender``, ``Vehicle_Age`` and
        ``Vehicle_Damage`` columns plus the model's numeric features.
        ``id``, ``Vintage``, ``Response`` and a pre-existing ``prediction``
        column are ignored if present.
    encoder_path : str, default 'ohe_encoder.pkl'
        Path of the pickled, fitted one-hot encoder.
    model_path : str, default 'model_class.pkl'
        Path of the pickled, fitted classifier.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, modified in place with a
        ``prediction`` column added (or overwritten).
    """
    categorical = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

    # SECURITY: pickle.load can execute arbitrary code. Only load artifact
    # files produced by this notebook or another trusted source.
    with open(encoder_path, 'rb') as encoder_file:
        onehotencoder = pickle.load(encoder_file)
    with open(model_path, 'rb') as model_file:
        model_class = pickle.load(model_file)

    # Build the feature matrix on a copy so only `prediction` is added to `df`.
    features = df.copy()
    encoded_data = onehotencoder.transform(features[categorical])
    encoded_columns = onehotencoder.get_feature_names_out(categorical)
    features[encoded_columns] = encoded_data

    # Dropping `prediction` (and `Response`) keeps a repeated call on the same
    # frame from feeding the previous output or the label to the model.
    features = features.drop(
        columns=['id', *categorical, 'Vintage', 'Response', 'prediction'],
        errors='ignore',
    )

    # When the model recorded its training column names, select exactly those
    # columns in that order so stray or reordered columns cannot break predict().
    expected_columns = getattr(model_class, 'feature_names_in_', None)
    if expected_columns is not None:
        features = features[list(expected_columns)]

    df['prediction'] = model_class.predict(features)
    return df

### Test the Prediction Function

In [96]:
# Reload the raw CSV; this rebinds `df`, replacing the frame loaded earlier.
df = pd.read_csv("Homework - Prediction Insurance.csv")

In [97]:
# Drop the target column to simulate unlabeled scoring data.
# Re-running this cell without re-running the load cell raises KeyError,
# because 'Response' is already gone.
df = df.drop('Response', axis=1)

In [103]:
# Score the frame; to_predict adds a `prediction` column to `df` itself and
# returns it, so the displayed frame and `df` are the same object.
to_predict(df)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,prediction
0,1,Male,44,1,28,0,> 2 Years,Yes,40454,26,217,1
1,2,Male,76,1,3,0,1-2 Year,No,33536,26,183,0
2,3,Male,47,1,28,0,> 2 Years,Yes,38294,26,27,1
3,4,Male,21,1,11,1,< 1 Year,No,28619,152,203,0
4,5,Female,29,1,41,1,< 1 Year,No,27496,152,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...
381104,381105,Male,74,1,26,1,1-2 Year,No,30170,26,88,0
381105,381106,Male,30,1,37,1,< 1 Year,No,40016,152,131,0
381106,381107,Male,21,1,30,1,< 1 Year,No,35118,160,161,0
381107,381108,Female,68,1,14,0,> 2 Years,Yes,44617,124,74,0
