Medical Insurance Cost Predictor

Submitted by :\
    Moukhik Gupta 102003277\
    Yashmeet Kaur 102003261\
    \
Submitted to :\
    Ms. Gurinderjeet Kaur

Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics

import plotly.figure_factory as ff
import plotly.express as px

Data Collection

In [2]:
# loading the data from csv file to a Pandas DataFrame
insurance_dataset = pd.read_csv('insurance.csv')

In [3]:
# first 5 rows of the dataframe
insurance_dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# number of rows and columns
insurance_dataset.shape

(1338, 7)

In [5]:
# getting some informations about the dataset
insurance_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Categorical Features:
- Sex
- Smoker
- Region

In [6]:
# checking for missing values
insurance_dataset.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

Data Analysis & Visualization

In [7]:
# statistical Measures of the dataset
insurance_dataset.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


Correlation between columns

In [8]:
fig = px.imshow(insurance_dataset.corr(), text_auto=True, aspect="auto", color_continuous_scale='RdBu_r')
fig.show()

Age column

In [9]:
hist_data = [insurance_dataset.age]
group_labels = ['Age'] 

fig = ff.create_distplot(hist_data, group_labels, show_hist=True)
fig.update_traces(nbinsx=5, autobinx=True, selector={'type':'histogram'})
fig.update_layout(bargap=0.1)
fig.show()

Sex Column

In [10]:
sex_counts = insurance_dataset['sex'].value_counts()
df_sex = pd.DataFrame(sex_counts).reset_index()
df_sex = df_sex.rename(columns={"index": "sex", "sex": "count"})
df_sex

Unnamed: 0,sex,count
0,male,676
1,female,662


In [11]:
fig = px.pie(df_sex, values='count', names='sex', color='sex',
             color_discrete_map={'male':'lightblue','female':'pink'})
fig.show()

BMI column

In [12]:
fig = px.histogram(insurance_dataset, x="bmi", color='sex', marginal="box", nbins=50, text_auto=True, 
    color_discrete_map={'male':'lightblue','female':'pink'})
fig.update_layout(bargap=0.05)
fig.show()

Children column

In [13]:
fig = px.histogram(insurance_dataset, x="children", marginal="box", text_auto=True)
fig.update_layout(bargap=0.05)
fig.show()

Smoker column

In [14]:
smoker_counts = insurance_dataset['smoker'].value_counts()
df_smoker = pd.DataFrame(smoker_counts).reset_index()
df_smoker = df_smoker.rename(columns={"index": "smoker", "smoker": "count"})
df_smoker

Unnamed: 0,smoker,count
0,no,1064
1,yes,274


In [15]:
fig = px.pie(df_smoker, values='count', names='smoker', color='smoker',
             color_discrete_map={'no':'lightgreen','yes':'gray'})
fig.show()

Region Column

In [16]:
fig = px.histogram(insurance_dataset, x="region", marginal="box", text_auto=True)
fig.update_layout(bargap=0.1)
fig.show()

In [17]:
hist_data = [insurance_dataset.charges]
group_labels = ['Charges'] 

fig = ff.create_distplot(hist_data, group_labels, show_hist=True)
fig.update_traces(nbinsx=50, autobinx=True, selector={'type':'histogram'})
fig.update_layout( bargap=0.1)
fig.show()

Data Pre-Processing

Encoding the categorical features

In [18]:
# encoding sex column
insurance_dataset.replace({'sex':{'male':0,'female':1}}, inplace=True)

 # encoding 'smoker' column
insurance_dataset.replace({'smoker':{'yes':0,'no':1}}, inplace=True)

# encoding 'region' column
insurance_dataset.replace({'region':{'southeast':0,'southwest':1,'northeast':2,'northwest':3}}, inplace=True)

In [19]:
insurance_dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,0,1,16884.924
1,18,0,33.77,1,1,0,1725.5523
2,28,0,33.0,3,1,0,4449.462
3,33,0,22.705,0,1,3,21984.47061
4,32,0,28.88,0,1,3,3866.8552


Splitting the Features and Target

In [20]:
X = insurance_dataset.drop(columns='charges', axis=1)
Y = insurance_dataset['charges']

In [21]:
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,1,27.900,0,0,1
1,18,0,33.770,1,1,0
2,28,0,33.000,3,1,0
3,33,0,22.705,0,1,3
4,32,0,28.880,0,1,3
...,...,...,...,...,...,...
1333,50,0,30.970,3,1,3
1334,18,1,31.920,0,1,2
1335,18,1,36.850,0,1,0
1336,21,1,25.800,0,1,1


In [22]:
Y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

Splitting the data into Training data & Testing Data

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [24]:
print(X.shape, X_train.shape, X_test.shape)

(1338, 6) (1070, 6) (268, 6)


#Model Training



Linear Regression

In [25]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,Y_train)

In [26]:
# Model prediction on train data
y_train_pred = model.predict(X_train)

In [27]:
# Model Evaluation on training data
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

print('R^2:', r2_score(y_train_pred,Y_train))
print('MAE:',  mean_absolute_error(y_train_pred,Y_train))
print('MSE:',  mean_squared_error(y_train_pred,Y_train))
print('RMSE:',  np.sqrt(mean_squared_error(y_train_pred,Y_train)))

R^2: 0.6510423548288894
MAE: 4214.89744476707
MSE: 37337214.4107756
RMSE: 6110.418513553356


In [28]:
# Model prediction on test data
y_test_pred = model.predict(X_test)


In [29]:
# Model Evaluation on testing data
print('R^2:', r2_score(y_test_pred,Y_test))
print('MAE:',  mean_absolute_error(y_test_pred,Y_test))
print('MSE:',  mean_squared_error(y_test_pred,Y_test))
print('RMSE:',  np.sqrt(mean_squared_error(y_test_pred,Y_test)))

R^2: 0.7071565853745314
MAE: 4190.220190137914
MSE: 33685623.354144424
RMSE: 5803.931715151758


DecisionTree Regressor

In [30]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
model.fit(X_train,Y_train)

In [31]:
# Model prediction on train data
y_train_pred = model.predict(X_train)

In [32]:
# Model Evaluation on training data

print('R^2:', r2_score(y_train_pred,Y_train))
print('MAE:',  mean_absolute_error(y_train_pred,Y_train))
print('MSE:',  mean_squared_error(y_train_pred,Y_train))
print('RMSE:',  np.sqrt(mean_squared_error(y_train_pred,Y_train)))

R^2: 0.9983049441232227
MAE: 29.572515327102803
MSE: 244239.5543823394
RMSE: 494.20598375812835


In [33]:
# Model prediction on test data
y_test_pred = model.predict(X_test)

In [34]:
# Model Evaluation on testing data

print('R^2:', r2_score(y_test_pred,Y_test))
print('MAE:',  mean_absolute_error(y_test_pred,Y_test))
print('MSE:',  mean_squared_error(y_test_pred,Y_test))
print('RMSE:',  np.sqrt(mean_squared_error(y_test_pred,Y_test)))

R^2: 0.727042468807588
MAE: 3351.05200830597
MSE: 47546481.35040563
RMSE: 6895.395663078779


Random Forest Resgressor

In [35]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train.values,Y_train)

In [36]:
# Model prediction on train data
y_train_pred = rf.predict(X_train)


X has feature names, but RandomForestRegressor was fitted without feature names



In [37]:
# Model Evaluation on training data

print('R^2:', r2_score(y_train_pred,Y_train))
print('MAE:',  mean_absolute_error(y_train_pred,Y_train))
print('MSE:',  mean_squared_error(y_train_pred,Y_train))
print('RMSE:',  np.sqrt(mean_squared_error(y_train_pred,Y_train)))

R^2: 0.9721620301371137
MAE: 1043.0780637650475
MSE: 3664791.8178596273
RMSE: 1914.364598988298


In [38]:
# Model prediction on test data
y_test_pred = rf.predict(X_test)


X has feature names, but RandomForestRegressor was fitted without feature names



In [39]:
# Model Evaluation on testing data

print('R^2:', r2_score(y_test_pred,Y_test))
print('MAE:',  mean_absolute_error(y_test_pred,Y_test))
print('MSE:',  mean_squared_error(y_test_pred,Y_test))
print('RMSE:',  np.sqrt(mean_squared_error(y_test_pred,Y_test)))

R^2: 0.8480341634166392
MAE: 2537.305354176618
MSE: 21671873.85770121
RMSE: 4655.305989696189


R^2 values\
Linear Regression - 0.7071565853745314\
DecisionTree Regressor - 0.7605394301918393\
RandomForest Regressor - 0.85656799470918

In [40]:
import pickle
pickle.dump(rf, open('model.pkl','wb'))