<a href="https://colab.research.google.com/github/binhluong84/Machine-Learning/blob/main/Health_Insurance_Interest_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [None]:
# Mount Google Drive at /gdrive so later cells can read files stored there.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/gdrive')


In [None]:
# Single place to change if the competition files live in a different Drive folder.
DATA_DIR = '/gdrive/MyDrive/Health Insurance Cross Sell Prediction'

# The files are read straight from their zip archives.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv.zip')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv.zip')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv.zip')

In [None]:
# Preview the training frame as loaded from train.csv.zip.
train_df

In [None]:
# Preview the test frame as loaded from test.csv.zip.
test_df

In [None]:
# Preview the sample submission file to see the expected output layout.
sample_submission

In [None]:
# Print a per-column summary (dtype and non-null count) of the training frame.
train_df.info()

In [None]:
# Number of missing values in each training column.
train_df.isna().sum()

In [None]:
# Number of missing values in each test column.
test_df.isna().sum()

In [None]:
# Re-display the training frame; it has not been modified since it was loaded.
train_df

In [None]:
def get_uniques(df, columns):
    """Collect the distinct values found in each of the requested columns.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column labels that exist in ``df``.

    Returns:
        A dict keyed by column label whose values are lists of that
        column's unique entries.
    """
    uniques = {}
    for name in columns:
        uniques[name] = list(df[name].unique())
    return uniques

In [None]:
# Columns whose distinct labels we want to see before choosing an encoding.
categorical_features = [
    'Gender',
    'Vehicle_Age',
    'Vehicle_Damage',
]

get_uniques(df=train_df, columns=categorical_features)

In [None]:
# Columns intended for 0/1 encoding.
# NOTE(review): neither list below is referenced by the cells visible in this
# notebook; the encoding calls name their columns directly.
binary_features = ['Gender', 'Vehicle_Damage']

# Columns intended for rank (ordinal) encoding.
ordinal_features = ['Vehicle_Age']

In [None]:
def binary_encode(df, column, positive_label):
    """Return a copy of ``df`` with ``column`` recoded as 1/0.

    Args:
        df: Source DataFrame; it is left unmodified.
        column: Label of the column to recode.
        positive_label: Value that is mapped to 1. Every value that does not
            compare equal to it is mapped to 0.

    Returns:
        A new DataFrame in which ``column`` holds int64 ones and zeros.
    """
    df = df.copy()
    # One vectorised comparison instead of a Python lambda per row. The explicit
    # cast also keeps the column integer-typed when the frame has no rows.
    df[column] = (df[column] == positive_label).astype('int64')
    return df

In [None]:
# Training frame before any encoding has been applied.
train_df

In [None]:
# Positive class for each two-valued column; all other values become 0.
positive_labels = {'Gender': 'Male', 'Vehicle_Damage': 'Yes'}

for column, positive_label in positive_labels.items():
    # binary_encode maps anything that is not the positive label to 0, so
    # applying it to a column that already holds 0/1 would silently zero it
    # out. Skipping numeric columns makes re-running this cell harmless.
    if not pd.api.types.is_numeric_dtype(train_df[column]):
        train_df = binary_encode(train_df, column, positive_label)
    if not pd.api.types.is_numeric_dtype(test_df[column]):
        test_df = binary_encode(test_df, column, positive_label)

In [None]:
def ordinal_encode(df, column, ordering):
    """Return a copy of ``df`` with ``column`` replaced by each value's rank.

    Args:
        df: Source DataFrame; it is left unmodified.
        column: Label of the column to recode.
        ordering: Sequence of the allowed values, lowest rank first. A value's
            position in this sequence becomes its integer code.

    Returns:
        A new DataFrame in which ``column`` holds int64 ranks.

    Raises:
        ValueError: If ``column`` contains a value that is not in ``ordering``.
    """
    df = df.copy()
    # Build the lookup once rather than scanning the list for every row.
    # Iterating in reverse lets the first occurrence of a repeated label win,
    # which is what list.index would return.
    rank_of = {label: rank for rank, label in reversed(list(enumerate(ordering)))}
    ranks = df[column].map(rank_of)

    # Series.map leaves unmapped entries as NaN; surface them as an error
    # instead of letting NaN codes flow into the model.
    unmapped = ranks.isna()
    if unmapped.any():
        unexpected = df.loc[unmapped, column].unique().tolist()
        raise ValueError(
            f"column {column!r} contains values not in ordering: {unexpected}"
        )

    df[column] = ranks.astype('int64')
    return df

In [None]:
# Vehicle_Age labels in rank order; a label's position in this list becomes
# its integer code.
age_ordering = ['< 1 Year', '1-2 Year', '> 2 Years']

train_df, test_df = [
    ordinal_encode(frame, 'Vehicle_Age', age_ordering)
    for frame in (train_df, test_df)
]

In [None]:
# Training frame after Gender, Vehicle_Damage and Vehicle_Age were encoded as integers.
train_df

In [None]:
# Test frame after the same three columns were encoded as integers.
test_df

In [None]:
# Keep the test ids for the submission file before the column is removed.
test_ids = test_df['id'].tolist()

# The row identifier is not used as a model feature, so drop it from both frames.
train_df = train_df.drop(columns='id')
test_df = test_df.drop(columns='id')

In [None]:
# Separate the feature columns from the 'Response' target.
X = train_df.drop(columns='Response')
y = train_df['Response']

In [None]:
# Box plot of every feature column on a log-scaled y axis, to compare their ranges.
X.plot(kind='box', figsize=(20, 10), logy=True)

In [None]:
scaler = MinMaxScaler()

# Learn each feature's min/max from the training features only.
X = scaler.fit_transform(X)

# Apply those same bounds to the competition test set. Re-fitting on the test
# data would put its features on a different scale from the one the model is
# trained on.
test_df = scaler.transform(test_df)

In [None]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the split
# repeatable across kernel restarts, and stratifying on y keeps the share of
# positive responses the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42, stratify=y
)

In [None]:
# (rows, columns) of the scaled feature matrix.
X.shape

In [None]:
# Shape of the target; its length should match the row count of X.
y.shape

In [None]:
# Fraction of rows whose target is 1 (the mean of a 0/1 column).
y.mean()

In [None]:
# Seed TensorFlow's global generator so repeated runs are more comparable.
tf.random.set_seed(42)

# Size the input layer from the data instead of hard-coding the feature count.
n_features = X_train.shape[1]

# Two hidden ReLU layers feeding a single sigmoid unit that outputs the
# probability of a positive response.
inputs = tf.keras.Input(shape=(n_features,))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)


model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC(name='auc')]
)


batch_size = 64
epochs = 25

# validation_split sets aside 20% of the training rows for validation during fit.
# ReduceLROnPlateau is used with its default settings.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[tf.keras.callbacks.ReduceLROnPlateau()],
    verbose=0
)

In [None]:
# Plot training and validation loss per epoch from the recorded fit history.
loss_curves = ['loss', 'val_loss']
axis_labels = {'index': 'Epoch', 'value': 'Loss'}

fig = px.line(
    history.history,
    y=loss_curves,
    labels=axis_labels,
    title='Training History',
)
fig.show()

In [None]:
# Score the model on the 30% hold-out split; reports the loss and the compiled 'auc' metric.
model.evaluate(X_test, y_test)

In [None]:
# Show the sample submission again as the layout the final file should follow.
sample_submission

In [None]:
# Run the model on the scaled competition test set; the single sigmoid unit
# yields one probability per row.
preds = model.predict(test_df)

In [None]:
# Threshold the model's single output column at 0.5 and flatten it to a plain
# list of 0/1 Python ints.
# np.int was only an alias for the builtin int and has been removed from recent
# NumPy releases, so cast through astype(int) instead.
# np.asarray lets the cell be re-run after preds has already become a list.
preds = (np.asarray(preds) >= 0.5).astype(int).ravel().tolist()

In [None]:
# Build the two-column frame directly from the saved ids and the 0/1 predictions.
# Unlike concatenating two Series, this raises if the lengths disagree instead
# of padding the shorter column with NaN.
submission = pd.DataFrame({'id': test_ids, 'Response': preds})

In [None]:
# Inspect the assembled submission frame.
submission

In [None]:
# True when the submission has the same (rows, columns) shape as the sample file.
sample_submission.shape == submission.shape

In [None]:
# index=False keeps the row index out of the file, so the CSV contains only
# the 'id' and 'Response' columns.
submission.to_csv('./submission.csv', index=False)