# Predicting the Cost of Medical Insurance for Individuals

This notebook uses the Medical Cost dataset from Kaggle (https://www.kaggle.com/datasets/mirichoi0218/insurance); the CSV is loaded from a copy hosted on GitHub.

In [None]:
#import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [None]:
# Read the insurance CSV straight from its raw GitHub URL into a DataFrame
INSURANCE_CSV_URL = "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv"
insurance = pd.read_csv(INSURANCE_CSV_URL)

In [None]:
# Preview the first rows of the raw dataset (features plus the `charges` target)
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Summary statistics of the numeric columns, transposed so that each column becomes a row
insurance.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


# Turn our categorical variables into numbers using `get_dummies()` from pandas

In [None]:
# One-hot encode the categorical columns (sex, smoker, region).
# dtype=int pins the dummy columns to 0/1 integers: pandas >= 2.0 emits booleans by
# default, and a frame mixing bool and numeric columns can fail the later conversion
# to a tensor when it is passed to Keras.
insurance_one_hot = pd.get_dummies(insurance, dtype=int)
insurance_one_hot.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


In [None]:
# Separate the target (`charges`) from the features (every other column)
y = insurance_one_hot["charges"]
X = insurance_one_hot.drop(columns="charges")

In [None]:
# Preview the feature matrix (the `charges` column has been dropped)
X.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,1,0,0,1,0,0,0,1
1,18,33.77,1,0,1,1,0,0,0,1,0
2,28,33.0,3,0,1,1,0,0,0,1,0
3,33,22.705,0,0,1,1,0,0,1,0,0
4,32,28.88,0,0,1,1,0,0,1,0,0


In [None]:
# Preview the target values
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [None]:
# Sanity check: features and target should have the same number of rows
X.shape, y.shape

((1338, 11), (1338,))

# Split our data into train and test sets using scikit-learn's `train_test_split()`

In [None]:
# `train_test_split` is imported in the dependencies cell at the top of the notebook.
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Shapes of the train/test features and targets produced by the split
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 11), (268, 11), (1070,), (268,))

# Build and fit a model 

In [None]:
# Fix TensorFlow's global random seed for reproducibility
tf.random.set_seed(42)

# Baseline network: two stacked single-unit Dense layers (no activation specified)
insurance_model = tf.keras.Sequential(
    layers=[
        tf.keras.layers.Dense(1),
        tf.keras.layers.Dense(1),
    ]
)

# Optimise mean absolute error with stochastic gradient descent
insurance_model.compile(
    optimizer=tf.keras.optimizers.SGD(),
    loss='mae',
)

# Train silently for 100 epochs on the one-hot encoded training split
insurance_model.fit(x=X_train, y=y_train, epochs=100, verbose=0)

<keras.callbacks.History at 0x7f77384e8cd0>

In [None]:
# Collect each model's test-set loss (MAE) under its name so the models can be compared later
test_results = {
    'insurance_model': insurance_model.evaluate(X_test, y_test),
}



The MAE is quite large. Let's try a bigger model, and let's normalize the data first using the following tools from Scikit-Learn:
- `make_column_transformer`: used to build a multi-step data preprocessing function for the following transformations
 - `MinMaxScaler`: normalizes the numerical columns
 - `OneHotEncoder`: one-hot encodes the non-numerical columns

In [None]:
# `make_column_transformer`, `MinMaxScaler` and `OneHotEncoder` are imported in the
# dependencies cell at the top of the notebook.

# Column transformer: min-max scale the numeric features and one-hot encode the
# categorical ones (handle_unknown="ignore" tolerates categories not seen during fit)
ct=make_column_transformer(
    (MinMaxScaler(), ['age','bmi','children']),
    (OneHotEncoder(handle_unknown="ignore"),['sex','smoker', 'region'])
)

# Create X & y from the raw (not yet encoded) frame; the transformer does the encoding
X = insurance.drop("charges", axis=1)
y = insurance["charges"]

# Build our train and test sets (same test_size and random_state as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the column transformer on the training data only
# (fitting it on the test set as well would leak test information into preprocessing)
ct.fit(X_train)

# Transform train and test sets with the fitted normalization and one-hot encoding
X_train_norm=ct.transform(X_train)
X_test_norm=ct.transform(X_test)

In [None]:
# X_train_norm is a plain array indexed by position, so its row 0 is the first row of
# the shuffled training split. Select that same row by position (iloc), not by index label 0.
X_train.iloc[0]


age                19
sex            female
bmi              27.9
children            0
smoker            yes
region      southwest
Name: 0, dtype: object

In [None]:
# First row (position 0) of the transformed training matrix: scaled numeric values followed by one-hot columns
X_train_norm[0]

array([0.60869565, 0.10734463, 0.4       , 1.        , 0.        ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.        ])

In [None]:
X_train.shape, X_train_norm.shape #the normalized X_train has more columns because each categorical column is expanded into one-hot encoded columns

((1070, 6), (1070, 11))

In [None]:
# Build a larger model and train it on the normalized data
tf.random.set_seed(42)

# Architecture: Dense(100) -> Dense(10) -> Dense(1), no activation specified
insurance_model_1 = tf.keras.Sequential(
    layers=[
        tf.keras.layers.Dense(100),
        tf.keras.layers.Dense(10),
        tf.keras.layers.Dense(1),
    ]
)

# Optimise mean absolute error with Adam
insurance_model_1.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='mae',
)

# Train silently for 200 epochs on the transformed training features
insurance_model_1.fit(x=X_train_norm, y=y_train, epochs=200, verbose=0)

<keras.callbacks.History at 0x7f773840c090>

In [None]:
# Test-set loss (MAE) of the model trained on the normalized features
insurance_model_1_mae = insurance_model_1.evaluate(x=X_test_norm, y=y_test)



In [None]:

# Reuse the MAE already computed in the previous cell instead of evaluating the model a second time
test_results['insurance_model_1'] = insurance_model_1_mae



In [None]:
# Test-set MAE recorded so far, keyed by model name
test_results

{'insurance_model': 8628.2392578125, 'insurance_model_1': 3171.5771484375}

# Let's try to improve the model by adding more layers (increasing the number of epochs is another option; this run keeps it at 200)

In [None]:
# Seed TensorFlow's global RNG with the same value the earlier experiments use, so this
# run is repeatable and the comparison between models is not affected by a different seed
tf.random.set_seed(42)

#build: Dense(100) -> Dense(100) -> Dense(10) -> Dense(1), no activation specified
insurance_model_2=tf.keras.Sequential([
    tf.keras.layers.Dense(100),
    tf.keras.layers.Dense(100),
    tf.keras.layers.Dense(10),
    tf.keras.layers.Dense(1)
])

#compile: mean absolute error loss, Adam optimizer
insurance_model_2.compile(loss='mae',
                          optimizer=tf.keras.optimizers.Adam())

#fit on the normalized training features for 200 epochs without progress output
insurance_model_2.fit(X_train_norm, y_train, epochs=200, verbose=0)

<keras.callbacks.History at 0x7f77382cafd0>

In [None]:
# Record the deeper model's test-set loss (MAE) next to the earlier results
test_results['insurance_model_2'] = insurance_model_2.evaluate(x=X_test_norm, y=y_test)



# Performance

In [None]:
# Tabulate the recorded results: one row per model, a single MAE column
mae_by_model = pd.DataFrame(test_results, index=['Mean absolute error'])
mae_by_model.transpose()

Unnamed: 0,Mean absolute error
insurance_model,8628.239258
insurance_model_1,3171.577148
insurance_model_2,3176.32959


# Save the model

In [None]:
# Persist insurance_model_1 (the lowest MAE in the results table) under the path 'Insurance_Cost_Model'
# NOTE(review): newer Keras releases appear to require a `.keras` or `.h5` extension in the save path — confirm against the installed TensorFlow version
insurance_model_1.save('Insurance_Cost_Model')

INFO:tensorflow:Assets written to: Insurance_Cost_Model/assets


In [None]:
# Load the saved model back from disk and score it on the same test data as a round-trip check
reloaded_model = tf.keras.models.load_model(filepath='Insurance_Cost_Model')
reloaded_mae = reloaded_model.evaluate(x=X_test_norm, y=y_test, verbose=0)
test_results['reloaded'] = reloaded_mae

In [None]:
# Rebuild the comparison table, now including the reloaded model's score
mae_with_reloaded = pd.DataFrame(test_results, index=['Mean absolute error'])
mae_with_reloaded.transpose()

Unnamed: 0,Mean absolute error
insurance_model,8628.239258
insurance_model_1,3171.577148
insurance_model_2,3176.32959
reloaded,3171.577148
