In [1]:
#importing required libraries

import pandas as pd
import numpy as np

import pandasql as psql

import matplotlib.pyplot as plt

#importing warnings 

import warnings
warnings.filterwarnings("ignore")

In [2]:
#loading the 'HealthIns' data set 

HealthIns=pd.read_csv(r"C:\Users\ARUN KUMAR\Downloads\archive\insurance.csv")
HealthIns.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [3]:
#checking the null values

HealthIns.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [4]:
#displaying the information of dataset

HealthIns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
#correlation of variables

HealthIns.corr()

Unnamed: 0,age,bmi,children,expenses
age,1.0,0.109341,0.042469,0.299008
bmi,0.109341,1.0,0.012645,0.198576
children,0.042469,0.012645,1.0,0.067998
expenses,0.299008,0.198576,0.067998,1.0


In [6]:
#describing the variables

HealthIns.describe()

Unnamed: 0,age,bmi,children,expenses
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.665471,1.094918,13270.422414
std,14.04996,6.098382,1.205493,12110.01124
min,18.0,16.0,0.0,1121.87
25%,27.0,26.3,0.0,4740.2875
50%,39.0,30.4,1.0,9382.03
75%,51.0,34.7,2.0,16639.915
max,64.0,53.1,5.0,63770.43


In [7]:
#displaying duplicate values

HealthIns[HealthIns.duplicated()]

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
581,19,male,30.6,0,no,northwest,1639.56


In [8]:
#droping the duplicate values

HealthIns=HealthIns.drop_duplicates()

In [9]:
#initialising cols1 and cols2 variables 

cols1=['age','bmi']  #using normalisation
cols2=['sex', 'children', 'smoker', 'region']  #using dummies

In [10]:
#using dummies on cols2

HealthIns=pd.get_dummies(HealthIns,columns=cols2)

In [11]:
#displaying the values in transpose format

HealthIns.head().T

Unnamed: 0,0,1,2,3,4
age,19.0,18.0,28.0,33.0,32.0
bmi,27.9,33.8,33.0,22.7,28.9
expenses,16884.92,1725.55,4449.46,21984.47,3866.86
sex_female,1.0,0.0,0.0,0.0,0.0
sex_male,0.0,1.0,1.0,1.0,1.0
children_0,1.0,0.0,0.0,1.0,1.0
children_1,0.0,1.0,0.0,0.0,0.0
children_2,0.0,0.0,0.0,0.0,0.0
children_3,0.0,0.0,1.0,0.0,0.0
children_4,0.0,0.0,0.0,0.0,0.0


In [12]:
#identifying the dependent and independent variables

x= HealthIns.drop(columns='expenses')
y=HealthIns[['expenses']]

In [13]:
# Splitting the dataset into train and test 

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size = 0.30,random_state=7)


In [14]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train[cols1] = mmscaler.fit_transform(x_train[cols1])
x_train = pd.DataFrame(x_train)

x_test[cols1] = mmscaler.fit_transform(x_test[cols1])
x_test = pd.DataFrame(x_test)

# LINEAR REGRESSION

In [15]:
# Train the algorithm and build the model with train dataset

from sklearn.linear_model import LinearRegression

modelREG = LinearRegression()
modelREG.fit(x_train, y_train)

y_pred=modelREG.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y_pred), 3) * 100, '%')
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))

# Calculate Adjusted R squared values

r_squared = round(metrics.r2_score(y_test, y_pred),3)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),3)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 4324.047
Mean Squared Error (MSE): 38464491.055
Root Mean Squared Error (RMSE): 6201.975
Mean Absolute Percentage Error (MAPE): 42.8 %
R2_score: 0.733128
Adj R Square:  0.73


# DecisionTreeRegressor

In [16]:
# Build the model with RandomForestRegressor Regressor

from sklearn.tree import DecisionTreeRegressor

modelDT = DecisionTreeRegressor()
modelDT.fit(x_train,y_train)

# Predict the model with test dataset

y1_pred = modelDT.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y1_pred),3))
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y1_pred),3))
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y1_pred)),3))
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y1_pred), 3) * 100, '%')
print('R2_score:', round(metrics.r2_score(y_test, y1_pred),3))

# Calculate Adjusted R squared values

r_squared = round(metrics.r2_score(y_test, y1_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),3)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 3009.601
Mean Squared Error (MSE): 40220005.238
Root Mean Squared Error (RMSE): 6341.924
Mean Absolute Percentage Error (MAPE): 32.1 %
R2_score: 0.721
Adj R Square:  0.718


# RandomForestRegressor

In [17]:
# Build the model with RandomForestRegressor Regressor

from sklearn.ensemble import RandomForestRegressor

modelRF = RandomForestRegressor()
modelRF.fit(x_train,y_train)

# Predict the model with test dataset

y2_pred = modelRF.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y2_pred),3))
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y2_pred),3))
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y2_pred)),3))
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y2_pred), 3) * 100, '%')
print('R2_score:', round(metrics.r2_score(y_test, y2_pred),3))

# Calculate Adjusted R squared values

r_squared = round(metrics.r2_score(y_test, y2_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),3)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 2748.998
Mean Squared Error (MSE): 25883712.686
Root Mean Squared Error (RMSE): 5087.604
Mean Absolute Percentage Error (MAPE): 28.999999999999996 %
R2_score: 0.82
Adj R Square:  0.818


# GradientBoostingRegressor

In [18]:
# Build the model with Gradient Boosting Regressor

from sklearn.ensemble import GradientBoostingRegressor

modelGBR = GradientBoostingRegressor()
modelGBR.fit(x_train,y_train)

# Predict the model with test dataset

y3_pred = modelGBR.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y3_pred),3))
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y3_pred),3))
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y3_pred)),3))
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y3_pred), 3) * 100, '%')
print('R2_score:', round(metrics.r2_score(y_test, y3_pred),3))

# Calculate Adjusted R squared values

r_squared = round(metrics.r2_score(y_test, y3_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),3)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 2701.206
Mean Squared Error (MSE): 24800013.803
Root Mean Squared Error (RMSE): 4979.961
Mean Absolute Percentage Error (MAPE): 29.2 %
R2_score: 0.828
Adj R Square:  0.826
