In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e7/sample_submission.csv
/kaggle/input/playground-series-s4e7/train.csv
/kaggle/input/playground-series-s4e7/test.csv


In [None]:
# Read the competition training set and report its (rows, columns) shape.
train_csv_path = "/kaggle/input/playground-series-s4e7/train.csv"
train = pd.read_csv(train_csv_path)
print(train.shape)

# Exploratory Data Analysis

In [None]:
# Column-wise views of the training frame: numeric features vs. features treated as categorical.
numeric_cols = ['Age', 'Annual_Premium', 'Vintage']
categorical_cols = [
    'Gender', 'Driving_License', 'Region_Code', 'Previously_Insured',
    'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel',
]
train_num = train[numeric_cols]
train_cat = train[categorical_cols]

In [None]:
# Response counts per gender, plus each gender's own percentage split between
# the response classes. Males appear to respond more positively.
gender_pivot = train.pivot_table(index='Response', columns='Gender', values='id', aggfunc='count')
for gender_col, pct_col in (('Female', 'Female%'), ('Male', 'Male%')):
    gender_counts = gender_pivot[gender_col]
    gender_pivot[pct_col] = gender_counts / gender_counts.sum() * 100
gender_pivot

In [None]:
# Distribution of customer age, drawn as a 200-bin histogram.
ages = train['Age']
plt.hist(ages, bins=200);

In [None]:
# Same histogram on log-transformed age. This is a natural-log transform of the
# values (it compresses the long right tail), not a normalisation of the data.
log_ages = np.log(train['Age'])
plt.hist(log_ages, bins=200);

In [None]:
# Analyse and visualise %response by age. Response appears to peak around 30s,
# when people are more likely to own/drive a car.
# fill_value=0: an age with no rows for one response class gets a count of 0
# instead of NaN, so its Response% is a real number (0 or 100) rather than NaN.
age_pivot = pd.pivot_table(
    train, index='Response', columns='Age', values='id', aggfunc='count', fill_value=0
).T
age_pivot['Response%'] = age_pivot[1] / (age_pivot[0] + age_pivot[1]) * 100
age_ax = sns.barplot(x=age_pivot.index, y=age_pivot['Response%'])
age_ax.set_ylabel('Positive Response %')
plt.show()

In [None]:
# Response counts split by driving-licence status.
# Those with a licence are more likely to require insurance.
train.pivot_table(index='Response', columns='Driving_License', values='id', aggfunc='count')

In [None]:
# Plot each region code's share of the training rows.
# The denominator is len(train) rather than a hard-coded row count, so the
# percentages stay correct if the data is sampled, filtered or replaced.
region_share = train['Region_Code'].value_counts() / len(train) * 100
region_bar = sns.barplot(x=region_share.index, y=region_share)
region_bar.set_title('Region Code')
region_bar.set_ylabel('% of total')
plt.show()

In [None]:
# Positive response % per region code.
# fill_value=0: a region with no rows for one response class gets a count of 0
# instead of NaN, so its Response% is not silently dropped from the plot.
region_pivot = pd.pivot_table(
    train, index='Response', columns='Region_Code', values='id', aggfunc='count', fill_value=0
).T
region_pivot['Response%'] = region_pivot[1] / (region_pivot[0] + region_pivot[1]) * 100
region_ax = sns.barplot(x=region_pivot.index, y=region_pivot['Response%'])
region_ax.set_ylabel('Positive Response %')
plt.show()

In [None]:
# Response counts split by whether the customer was previously insured.
# Those who never bought insurance before are more likely to require insurance.
train.pivot_table(index='Response', columns='Previously_Insured', values='id', aggfunc='count')

In [None]:
# Response counts by vehicle age, with the columns arranged youngest to oldest.
# Respondents with older vehicles are more likely to require insurance.
vehicle_age_columns = ['< 1 Year', '1-2 Year', '> 2 Years']
vehicle_age_pivot = train.pivot_table(
    index='Response', columns='Vehicle_Age', values='id', aggfunc='count'
)[vehicle_age_columns]
vehicle_age_pivot

In [None]:
# Response counts split by vehicle-damage history.
# Those who have damaged vehicles are more likely to require insurance.
train.pivot_table(index='Response', columns='Vehicle_Damage', values='id', aggfunc='count')

In [None]:
# Share (% of all training rows) of each distinct Annual_Premium value.
# len(train) replaces the hard-coded row count so the result tracks the loaded data.
train['Annual_Premium'].value_counts() / len(train) * 100

In [None]:
# Distribution of annual premiums, drawn as a 500-bin histogram.
premiums = train['Annual_Premium']
plt.hist(premiums, bins=500);

In [None]:
# Show the outlier rows whose Annual_Premium is above 150,000.
high_premium_mask = train['Annual_Premium'] > 150000
train[high_premium_mask]

In [None]:
# Histogram of the natural log of Annual_Premium (500 bins).
log_premiums = np.log(train['Annual_Premium'])
plt.hist(log_premiums, bins=500);

In [None]:
# Share (% of all training rows) of each policy sales channel.
# len(train) replaces the hard-coded row count so the result tracks the loaded data.
train['Policy_Sales_Channel'].value_counts() / len(train) * 100

In [None]:
# Plot each policy sales channel's share of the training rows.
# The counts are computed once and divided by len(train) instead of a
# hard-coded row count, so the percentages always match the loaded data.
channel_share = train['Policy_Sales_Channel'].value_counts() / len(train) * 100
channel_bar = sns.barplot(x=channel_share.index, y=channel_share)
channel_bar.set_title('Policy_Sales_Channel')
channel_bar.set_ylabel('% of total')
plt.show()

In [None]:
# Positive response % per policy sales channel.
# fill_value=0: a channel with no rows for one response class gets a count of 0
# instead of NaN, so its Response% is a real number (0 or 100) rather than NaN.
channel_pivot = pd.pivot_table(
    train, index='Response', columns='Policy_Sales_Channel', values='id', aggfunc='count', fill_value=0
).T
channel_pivot['Response%'] = channel_pivot[1] / (channel_pivot[0] + channel_pivot[1]) * 100
channel_ax = sns.barplot(x=channel_pivot.index, y=channel_pivot['Response%'])
channel_ax.set_ylabel('Positive Response %')
plt.show()

In [None]:
# Rank sales channels by row count (value_counts sorts descending) and show
# the codes of the most frequent ones. Only the last expression of a cell is
# displayed, so the intermediate bare expressions were no-ops and are dropped.
TOP_N_CHANNELS = 16  # how many of the most frequent channels to keep
channels = train['Policy_Sales_Channel'].value_counts()
main_channels = channels.head(TOP_N_CHANNELS)
main_channels.index

In [None]:
# Distribution of customer vintage, drawn as a 500-bin histogram.
vintage_values = train['Vintage']
plt.hist(vintage_values, bins=500);

In [None]:
# Check for any relationship between Vintage and the positive response rate.
# fill_value=0: a vintage value with no rows for one response class gets a
# count of 0 instead of NaN, so its Response% is still plotted.
vintage_pivot = pd.pivot_table(
    train, index='Response', columns='Vintage', values='id', aggfunc='count', fill_value=0
).T
vintage_pivot['Response%'] = vintage_pivot[1] / (vintage_pivot[0] + vintage_pivot[1]) * 100
vintage_ax = sns.scatterplot(x=vintage_pivot.index, y=vintage_pivot['Response%'])
vintage_ax.set_ylabel('Positive Response %')  # fixed the 'Postive' typo in the axis label
plt.show()

# Insights gained and observations made
**1. Gender**
* Males have a slightly higher rate of positive response (Males: 13.97% vs Females: 10.33%). 

**2. Age**
* The age distribution is right-skewed.
* Positive response rate is highest between age 30 and 72 (middle-aged customers). This suggests that this age group should be the target focus of the insurance company.

**3. Driving License**
* Most of the data is obtained from those possessing a driving license.
* Additionally, the positive response rate is much higher with customers possessing a driving license, which is to be expected.

**4. Region Code**
* The distribution of region codes indicates that a large share of the data comes from a single region (Region 28 represents 30% of the data).

**5. Previously Insured**
* Those who never bought insurance before are significantly more likely to require insurance.

**6. Vehicle Age**
* Most of the data is obtained from customers whose vehicles are 2 years old or less.
* It can be observed that the older the customer's vehicle, the more likely the customer is to require insurance.

**7. Vehicle Damage**
* It can be observed that customers with damaged vehicles are more likely to require insurance.

**8. Annual Premium**
* Annual Premium distribution displays right-skewing of the data, with 18.36% of customers paying 2,630 a year.
* Suggests that most customers pay low premiums, while a tiny proportion of outlier customers pay >100,000 worth of annual premiums.

**9. Policy Sales Channel**
* Distribution shows that most of the data has been obtained from a handful of sales channels (Channel 152, 26 and 124 make up 77% of the data).

**10. Vintage**
* Distribution by vintage appears relatively even, which shows that the customers have been with the insurance company for various lengths of time.
* Vintage appears to show almost no correlation with Response.

# Model

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

train_copy = train.copy()

# Vehicle_Age has a real order. OrdinalEncoder sorts categories lexicographically
# by default, which would give '1-2 Year' -> 0, '< 1 Year' -> 1, '> 2 Years' -> 2.
# Spelling the order out makes the encoded value increase with vehicle age.
vehicle_age_order = ['< 1 Year', '1-2 Year', '> 2 Years']

# ColumnTransformer applies ordinal encoding to the categorical columns and
# standard scaling to the numeric ones; every other column is passed through
# unchanged. Categories not seen during fit are encoded as -1.
preprocessor = ColumnTransformer(transformers=[
    ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
     ['Gender', 'Region_Code', 'Vehicle_Damage', 'Policy_Sales_Channel']),
    ('veh_age', OrdinalEncoder(categories=[vehicle_age_order],
                               handle_unknown='use_encoded_value', unknown_value=-1),
     ['Vehicle_Age']),
    ('num', StandardScaler(), ['Age', 'Annual_Premium'])
], remainder='passthrough')

# Single-step pipeline; the same fitted object is reused to transform the test set.
pipeline = Pipeline(steps=[
    ('pre', preprocessor)
])

# Target and feature matrix. 'id' is an identifier, not a feature.
y = train_copy['Response']
X = train_copy.drop(columns=['Response', 'id'])
# NOTE(review): the preprocessor is fitted on all labelled rows before any
# train/validation split, so scaler statistics include validation rows.
X_preprocessed = pipeline.fit_transform(X)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import roc_auc_score

# Hold out 20% of the labelled rows; the grid search only ever sees the other 80%.
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Using Decision Tree and XGBoost classifier models. Random Forest took too long
# to train as it does not scale well with large datasets like this.
models = {
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Hyperparameter grids searched by GridSearchCV for each model.
param_grids = {
    'DecisionTree': {
        'max_depth': [10, 15, 20],
        'min_samples_split': [2, 5, 10]
    },
    'XGBoost': {
        'max_depth': [10, 15, 20],
        'min_child_weight': [10, 15, 20],
        'gamma': [2, 4, 6]
    }
}

# 2-fold cross-validation
cv = KFold(n_splits=2, shuffle=True, random_state=42)

# Tune each model with ROC AUC as the selection metric, then score the refitted
# best estimator on the held-out 20% so the reported number is not only the
# (optimistically selected) cross-validation score.
grids = {}
for model_name, model in models.items():
    grids[model_name] = GridSearchCV(estimator=model,
                                    param_grid=param_grids[model_name],
                                    cv=cv,
                                    scoring='roc_auc',
                                    n_jobs=-1,
                                    verbose=2)
    grids[model_name].fit(X_train, y_train)
    best_params = grids[model_name].best_params_
    best_score = grids[model_name].best_score_

    # GridSearchCV refits the best configuration on X_train (refit=True by default),
    # so predict_proba here uses that refitted estimator.
    holdout_proba = grids[model_name].predict_proba(X_test)[:, 1]
    holdout_auc = roc_auc_score(y_test, holdout_proba)

    print(f'Best parameters for {model_name}: {best_params}')
    # The search is scored with ROC AUC, so label it as such (it is not accuracy).
    print(f'Best CV ROC AUC for {model_name}: {best_score}')
    print(f'Hold-out ROC AUC for {model_name}: {holdout_auc}\n')

# Submission

In [None]:
# Load the test set and move its 'id' column out into test_id, leaving only
# the feature columns in `test`.
test = pd.read_csv("/kaggle/input/playground-series-s4e7/test.csv")
test_id = test.pop('id')

In [None]:
# Apply the already-fitted preprocessing pipeline to the test features, then
# print the train and test matrix shapes.
test_preprocessed = pipeline.transform(test)
for feature_matrix in (X_preprocessed, test_preprocessed):
    print(feature_matrix.shape)

In [None]:
# Prediction and submission using the better-performing model (XGBoost).
# The hyperparameters come from the grid search result instead of being typed
# in by hand, so they cannot drift from what the search actually selected.
best_xgb_params = grids['XGBoost'].best_params_
# Refit on every labelled row (not just the 80% training split) with a fixed
# seed: the hold-out split is only needed for evaluation, not for the final model.
model = XGBClassifier(random_state=42, **best_xgb_params).fit(X_preprocessed, y)
# Column 1 of predict_proba is the predicted probability of Response == 1.
y_pred = model.predict_proba(test_preprocessed)[:, 1]
output = pd.DataFrame({'id': test_id, 'Response': y_pred})
output.to_csv('submission.csv', index=False)