Import Libraries

In [34]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import  mean_squared_error,mean_absolute_error,r2_score

import statsmodels.api as sm
from scipy.stats import shapiro,kstest,normaltest
import warnings
warnings.filterwarnings("ignore")
import pickle
import json

Problem Statement

Predict an individual's medical insurance charges from their age, sex, BMI, number of children, smoking status, and region.

Data Gathering

In [35]:
# Load the insurance records from the CSV file in the working directory.
df = pd.read_csv("medical_insurance.csv")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


EDA (Exploratory Data Analysis)

In [36]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [37]:
# Column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [38]:
# Count missing values per column.
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [39]:
# NOTE(review): repeats the earlier df.info() call; the frame has not been
# modified in between, so the output is the same.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [40]:
# Inspect the raw `sex` column before encoding.
df["sex"]

0       female
1         male
2         male
3         male
4         male
         ...  
1333      male
1334    female
1335    female
1336    female
1337    female
Name: sex, Length: 1338, dtype: object

In [41]:
# Frequency of each category in `sex`.
df["sex"].value_counts()

sex
male      676
female    662
Name: count, dtype: int64

In [42]:
# Encode `sex` as an integer feature: female -> 0, male -> 1.
#
# The result is assigned back to the column explicitly. Calling
# `replace(..., inplace=True)` on `df["sex"]` mutates a temporary selection,
# which pandas with Copy-on-Write does not propagate back to `df`; with
# warnings silenced at the top of this notebook that failure would go unseen.
#
# `astype("int64")` makes an unmapped label (or re-running this cell on the
# already-encoded column) raise instead of silently leaving NaN in the feature.
df["sex"] = df["sex"].map({"female": 0, "male": 1}).astype("int64")

In [43]:
# Check the `sex` column after encoding.
df["sex"]

0       0
1       1
2       1
3       1
4       1
       ..
1333    1
1334    0
1335    0
1336    0
1337    0
Name: sex, Length: 1338, dtype: int64

In [44]:
# Inspect the raw `smoker` column before encoding.
df["smoker"]

0       yes
1        no
2        no
3        no
4        no
       ... 
1333     no
1334     no
1335     no
1336     no
1337    yes
Name: smoker, Length: 1338, dtype: object

In [45]:
# Frequency of each category in `smoker`.
df["smoker"].value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

In [46]:
# Encode `smoker` as an integer feature: no -> 1, yes -> 0.
# Note the orientation: non-smokers are 1 and smokers are 0. It is kept
# exactly as before so the meaning of the fitted model's `smoker` feature
# does not change.
#
# The result is assigned back to the column explicitly. Calling
# `replace(..., inplace=True)` on `df["smoker"]` mutates a temporary
# selection, which pandas with Copy-on-Write does not propagate back to `df`.
#
# `astype("int64")` makes an unmapped label (or re-running this cell on the
# already-encoded column) raise instead of silently leaving NaN in the feature.
df["smoker"] = df["smoker"].map({"no": 1, "yes": 0}).astype("int64")


In [47]:
# Check the `smoker` column after encoding.
df["smoker"]

0       0
1       1
2       1
3       1
4       1
       ..
1333    1
1334    1
1335    1
1336    1
1337    0
Name: smoker, Length: 1338, dtype: int64

In [48]:
# Inspect the raw `region` column before encoding.
df["region"]

0       southwest
1       southeast
2       southeast
3       northwest
4       northwest
          ...    
1333    northwest
1334    northeast
1335    southeast
1336    southwest
1337    northwest
Name: region, Length: 1338, dtype: object

In [49]:
# Frequency of each category in `region`.
df["region"].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [50]:
# The same region counts as a plain {category: count} dict.
df["region"].value_counts().to_dict()

{'southeast': 364, 'southwest': 325, 'northwest': 325, 'northeast': 324}

In [52]:
# One-hot encode `region` into integer indicator columns named
# `region_<category>`; the original `region` column is replaced by them.
# NOTE(review): all four indicators are kept, so they always sum to 1 and are
# collinear with a model intercept -- consider `drop_first=True` if the
# coefficients are going to be interpreted.
df = pd.get_dummies(df, columns=["region"], dtype=int)
# Bare last expression so the notebook renders the encoded frame.
df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,0,16884.92400,0,0,0,1
1,18,1,33.770,1,1,1725.55230,0,0,1,0
2,28,1,33.000,3,1,4449.46200,0,0,1,0
3,33,1,22.705,0,1,21984.47061,0,1,0,0
4,32,1,28.880,0,1,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,1,10600.54830,0,1,0,0
1334,18,0,31.920,0,1,2205.98080,1,0,0,0
1335,18,0,36.850,0,1,1629.83350,0,0,1,0
1336,21,0,25.800,0,1,2007.94500,0,0,0,1


In [53]:
# Distribution of the number of children per record.
df["children"].value_counts()

children
0    574
1    324
2    240
3    157
4     25
5     18
Name: count, dtype: int64

In [54]:
# Re-check column dtypes now that the categorical columns have been encoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   sex               1338 non-null   int64  
 2   bmi               1338 non-null   float64
 3   children          1338 non-null   int64  
 4   smoker            1338 non-null   int64  
 5   charges           1338 non-null   float64
 6   region_northeast  1338 non-null   int32  
 7   region_northwest  1338 non-null   int32  
 8   region_southeast  1338 non-null   int32  
 9   region_southwest  1338 non-null   int32  
dtypes: float64(2), int32(4), int64(4)
memory usage: 83.8 KB


Train Test Split

In [55]:
# Drop any column that is still of object dtype so only numeric data remains.
df = df.select_dtypes(exclude=object)

# Features are every column except the target; the target is `charges`.
x = df.drop(columns="charges")
y = df["charges"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=24,
)

In [56]:
# Inspect the training feature matrix.
x_train

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
132,53,0,35.900,2,1,0,0,0,1
508,24,0,25.270,0,1,1,0,0,0
422,40,1,32.775,1,0,1,0,0,0
613,34,0,19.000,3,1,1,0,0,0
1111,38,1,38.390,3,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...
145,29,0,38.830,3,1,0,0,1,0
343,63,1,36.765,0,1,1,0,0,0
192,25,1,25.740,0,1,0,0,1,0
899,19,0,22.515,0,1,0,1,0,0


Model Training

In [57]:
# Linear regression estimator constructed with its default settings.
model = LinearRegression()
# Fit on the training split only; the test split is kept for evaluation.
model.fit(x_train, y_train)

Testing Data Evaluation

In [58]:
# Predict charges for the held-out test rows.
y_predicting_testing=model.predict(x_test)

In [59]:
# Score the test-set predictions. All metrics are computed first, then
# reported in a fixed order with a row of asterisks between them.
# Note: `mse`, `mae` and `r_squared` are reassigned by the training-data
# evaluation further down.
mse = mean_squared_error(y_test, y_predicting_testing)
rmse = np.sqrt(mse)  # same units as `charges`
mae = mean_absolute_error(y_test, y_predicting_testing)
r_squared = r2_score(y_test, y_predicting_testing)

print("Mean squared error:", mse)
print("*" * 80)
print("Root Mean Squared Error:", rmse)
print("*" * 80)
print("Mean absolute error:", mae)
print("*" * 80)
print("R2 Score is :", r_squared)

Mean squared error: 34021111.95868059
********************************************************************************
Root Mean Squared Error: 5832.761949426755
********************************************************************************
Mean absolute error: 4316.782388102675
********************************************************************************
R2 Score is : 0.7765075980622729


Training Data Evaluation

In [60]:
# Predict on the training rows so the fit can be compared with the test-set scores.
y_predict_training=model.predict(x_train)

In [61]:
# Score the training-set predictions with the same four metrics.
# This reassigns `mse`, `mae` and `r_squared`; the root-mean-squared error is
# stored under `rsme` (spelling kept as-is), so the test-set `rmse` is left
# untouched.
mse = mean_squared_error(y_train, y_predict_training)
rsme = np.sqrt(mse)  # same units as `charges`
mae = mean_absolute_error(y_train, y_predict_training)
r_squared = r2_score(y_train, y_predict_training)

print("Mean squared error:", mse)
print("*" * 80)
print("Root Mean Squared Error:", rsme)
print("*" * 80)
print("Mean absolute error:", mae)
print("*" * 80)
print("R2 score is :", r_squared)

Mean squared error: 37189122.64671699
********************************************************************************
Root Mean Squared Error: 6098.288501433578
********************************************************************************
Mean absolute error: 4220.386087354067
********************************************************************************
R2 score is : 0.7436753317330116


In [62]:
# Persist the fitted estimator to `model.pkl` in the working directory.
# The context manager closes the file even if pickling fails.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)