# Medical Insurance Cost - Neural Network Regression

This project is a first step toward regression analysis on a larger, real-world dataset. Its purpose is to build out what an actual deep-learning regression project might look like when using real data.

We are going to look at the Medical Cost dataset from Kaggle:
* https://www.kaggle.com/datasets/mirichoi0218/insurance

## Setup and Define Helpers

1. Import all required modules
2. Define all helper methods used

### Imports

In [None]:
import os
import random
import sys

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.utils import plot_model

from src import utils

### Plot Helper Functions

In [None]:
def plot_actual_versus_predicted_data(y_true, y_predict):
    """Scatter-plot the true target values against the predicted values.

    A perfect model would place every point on the dashed ``y = x``
    reference line (slope 1), so the closer the point cloud follows that
    line, the better the predictions.

    Args:
        y_true: Ground-truth target values, drawn on the x-axis.
        y_predict: Predicted values, one per entry of ``y_true``, drawn on
            the y-axis.
    """
    plt.figure(figsize=(10, 7))

    plt.title('Actual Value vs. Predicted Value')
    plt.scatter(y_true, y_predict)

    # Reference line for perfect predictions (predicted == actual); without
    # it there is nothing on the figure to compare the slope against.
    plt.axline((0, 0), slope=1, color='gray', linestyle='--')

    # Label the axes so the orientation of the scatter is unambiguous.
    plt.xlabel('Actual Value')
    plt.ylabel('Predicted Value')

    
def plot_history(history):
    """Plot every series recorded in ``history.history`` against the epoch.

    Args:
        history: Object exposing a ``history`` mapping of per-epoch values,
            such as the value returned by ``model.fit``.
    """
    per_epoch_values = pd.DataFrame(history.history)
    per_epoch_values.plot()

    plt.xlabel('Epochs')
    plt.ylabel('Loss')

### Metrics Helpers

In [None]:
def mae(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Both inputs go through ``tf.squeeze`` first so that size-1 dimensions
    are dropped before the two are compared.
    """
    targets = tf.squeeze(y_true)
    predictions = tf.squeeze(y_pred)
    return tf.keras.metrics.mean_absolute_error(targets, predictions)


def mse(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Both inputs go through ``tf.squeeze`` first so that size-1 dimensions
    are dropped before the two are compared.
    """
    targets = tf.squeeze(y_true)
    predictions = tf.squeeze(y_pred)
    return tf.keras.metrics.mean_squared_error(targets, predictions)


def visualize_model(model):
    """Draw a diagram of ``model`` showing layer names and tensor shapes.

    Returns:
        Whatever ``plot_model`` returns, so a notebook cell can display it.
    """
    diagram_options = {'show_shapes': True, 'show_layer_names': True}
    return plot_model(model, **diagram_options)

## Step-0: Get & Analyze the Data

Need to look at the data to see what needs to be done to build out a model.

In [None]:
# Load the insurance dataset straight from the raw CSV hosted on GitHub
csv_dataset_url = (
    'https://raw.githubusercontent.com/stedy/'
    'Machine-Learning-with-R-datasets/master/insurance.csv'
)
insurance = pd.read_csv(filepath_or_buffer=csv_dataset_url)

In [None]:
# Preview the first rows to see which columns and value types are present
insurance.head()

In [None]:
# Count how many rows fall under each distinct value of the 'children' column
insurance['children'].value_counts()

In [None]:
# Distribution of the 'age' column as a histogram
insurance['age'].plot.hist()

In [None]:
# Distribution of the 'bmi' column as a histogram
insurance['bmi'].plot.hist()

## Step-1: Preprocess the Data

1. Encode the string (categorical) variables as one-hot vectors.
2. Scale the numeric features (normalizing, standardizing, etc.).

In [None]:
# Build the preprocessing step: a feature scaler for the numeric columns and
# a one-hot encoder for the string-valued columns
numeric_columns = ['age', 'bmi', 'children']       # rescaled into the 0-1 range
categorical_columns = ['sex', 'smoker', 'region']  # expanded into one-hot columns

column_transformer = make_column_transformer(
    (MinMaxScaler(), numeric_columns),
    (OneHotEncoder(handle_unknown='ignore'), categorical_columns),
)

## Step-2: Splitting the Training & Test Data

In [None]:
# Separate the regression target ('charges') from the feature columns
y_column_name = 'charges'
y_df = insurance[y_column_name]
X_df = insurance.drop(columns=[y_column_name])

In [None]:
# Hold out 20% of the rows as a test set (fixed seed for a repeatable split).
# NOTE: No explicit conversion to tensors is done here; the split outputs are
# handed to the model as-is in the later cells.
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y_df, test_size=0.2, random_state=42)

# Fit the column transformer on the training rows only, then apply the same
# fitted transformation to both splits
X_train_normal = column_transformer.fit_transform(X_train)
X_test_normal = column_transformer.transform(X_test)

In [None]:
# Inspect the first transformed training row to see what the model receives
X_train_normal[0]

## Step-3: Creating and Visualizing the Model

I am going to create, compile, and fit a simple model as my first Trial to get a Baseline on where to start.

### Trial-1: No Hidden Layers, epochs=10

#### Creating, Compiling, & Fitting Model

In [None]:
# The input layer width equals the number of columns the transformer produced
number_of_input_variables = X_train_normal.shape[-1]

# 1. Creating Model: the inputs feed a single output unit, no hidden layers
model_1 = tf.keras.models.Sequential()
model_1.add(tf.keras.layers.Input(shape=(number_of_input_variables,), name='InputLayer'))
model_1.add(tf.keras.layers.Dense(1, name='OutputLayer'))

# 2. Compiling Model: MAE is both the training loss and the reported metric
model_1.compile(
    loss=tf.keras.losses.mae,
    optimizer=tf.keras.optimizers.legacy.SGD(),
    metrics=['mae'],
)

# 3. Fitting Model
history_1 = model_1.fit(X_train_normal, y_train, epochs=10)

#### Predicting Model

In [None]:
# Predict on the held-out (transformed) test features with the Trial-1 model
y_pred_1 = model_1.predict(X_test_normal)

#### Evaluating Model

In [None]:
# Print the layer-by-layer summary of the Trial-1 model
model_1.summary()

In [None]:
# Draw the Trial-1 architecture diagram (layer names and shapes)
visualize_model(model_1)

In [None]:
# Test-set error of the Trial-1 model.
# `mse_1` must come from `mse`; it was previously computed with `mae`, so
# both values shown were the mean absolute error.
mae_1 = mae(y_test, y_pred_1)
mse_1 = mse(y_test, y_pred_1)

mae_1, mse_1

In [None]:
# Scatter the true test targets against the Trial-1 predictions
plot_actual_versus_predicted_data(y_test, y_pred_1)

In [None]:
# Plot the per-epoch values recorded while fitting the Trial-1 model
plot_history(history_1)

### Trial-2: Hidden layer with 100 Neurons & Adam Optimizer

#### Creating, Compiling, & Fitting Model

In [None]:
# The input layer width equals the number of columns the transformer produced
number_of_input_variables = X_train_normal.shape[-1]

# 1. Creating Model: one hidden layer of 100 units in front of the output unit
# NOTE(review): no activation is passed to the hidden layer, so the Keras
# default applies - confirm that this is intended.
model_2 = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(number_of_input_variables,), name='InputLayer'),
    tf.keras.layers.Dense(100, name='HiddenLayer-1'),
    tf.keras.layers.Dense(1, name='OutputLayer')
])

# 2. Compiling Model
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
model_2.compile(loss=tf.keras.losses.mae,
                optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.01),
                metrics=['mae'])

# 3. Fitting Model
history_2 = model_2.fit(X_train_normal, y_train, epochs=10)

#### Predicting Model

In [None]:
# Predict on the held-out (transformed) test features with the Trial-2 model
y_pred_2 = model_2.predict(X_test_normal)

#### Evaluating Model

In [None]:
# Print the layer-by-layer summary of the Trial-2 model
model_2.summary()

In [None]:
# Draw the Trial-2 architecture diagram (layer names and shapes)
visualize_model(model_2)

In [None]:
# Test-set error of the Trial-2 model.
# `mse_2` must come from `mse`; it was previously computed with `mae`, so
# both values shown were the mean absolute error.
mae_2 = mae(y_test, y_pred_2)
mse_2 = mse(y_test, y_pred_2)

mae_2, mse_2

In [None]:
# Scatter the true test targets against the Trial-2 predictions
plot_actual_versus_predicted_data(y_test, y_pred_2)

In [None]:
# Plot the per-epoch values recorded while fitting the Trial-2 model
plot_history(history_2)

### Trial-3: 2 Hidden layers with 100 Neurons & Adam Optimizer w/ 0.05 lr & Epochs=100

#### Creating, Compiling, & Fitting Model

In [None]:
# The input layer width equals the number of columns the transformer produced
number_of_input_variables = X_train_normal.shape[-1]

# 1. Creating Model: two hidden layers of 100 units in front of the output unit
# NOTE(review): no activation is passed to the hidden layers, so the Keras
# default applies - confirm that this is intended.
model_3 = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(number_of_input_variables,), name='InputLayer'),
    tf.keras.layers.Dense(100, name='HiddenLayer-1'),
    tf.keras.layers.Dense(100, name='HiddenLayer-2'),
    tf.keras.layers.Dense(1, name='OutputLayer')
])

# 2. Compiling Model
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
model_3.compile(loss=tf.keras.losses.mae,
                optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.05),
                metrics=['mae'])

# 3. Fitting Model
history_3 = model_3.fit(X_train_normal, y_train, epochs=100)

#### Predicting Model

In [None]:
# Predict on the held-out (transformed) test features with the Trial-3 model
y_pred_3 = model_3.predict(X_test_normal)

#### Evaluating Model

In [None]:
# Print the layer-by-layer summary of the Trial-3 model
model_3.summary()

In [None]:
# Draw the Trial-3 architecture diagram (layer names and shapes)
visualize_model(model_3)

In [None]:
# Test-set error of the Trial-3 model.
# `mse_3` must come from `mse`; it was previously computed with `mae`, so
# both values shown were the mean absolute error.
mae_3 = mae(y_test, y_pred_3)
mse_3 = mse(y_test, y_pred_3)

mae_3, mse_3

In [None]:
# Scatter the true test targets against the Trial-3 predictions
plot_actual_versus_predicted_data(y_test, y_pred_3)

In [None]:
# Plot the per-epoch values recorded while fitting the Trial-3 model
plot_history(history_3)

### Trial-4: 2 Hidden layers with 100 Neurons & Adam Optimizer w/ 0.05 lr & Epochs=100 with Early Stop

#### Creating, Compiling, and Fitting

In [None]:
# 1. Create Model
# The input width is derived from the transformed training data rather than
# hard-coded, so the model keeps working if the preprocessing produces a
# different number of columns.
model_4 = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train_normal.shape[-1],)),
    tf.keras.layers.Dense(100),
    tf.keras.layers.Dense(100),
    tf.keras.layers.Dense(1)
])


# 2. Compile Model
model_4.compile(loss=tf.keras.losses.mae,
                optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.05),
                )