## Importing Dependencies

In [1]:
# Importing the Necessary libraries

import numpy as np
import pandas as pd

In [2]:
# Load the health-insurance records from the CSV file and preview the first rows

data = pd.read_csv(filepath_or_buffer="/content/Health_insurance.csv")
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Summarise the frame: index range, column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
# Count the missing entries in every column (all zeros means nothing to impute)

data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [12]:
# Looking at the correlation between the numeric features
#
# `DataFrame.corr` only makes sense for numeric columns, but at this point in
# a top-to-bottom run `sex`, `smoker` and `region` may still hold strings.
# Build a throwaway encoded copy (leaving `data` untouched) so this cell gives
# the same table whether or not the label-encoding cell has already run.

corr_input = data.copy()
for column, codes in (
    ("sex", {"female": 0, "male": 1}),
    ("smoker", {"no": 0, "yes": 1}),
):
    # Skip columns that are already numeric; re-mapping them would yield NaN.
    if not pd.api.types.is_numeric_dtype(corr_input[column]):
        corr_input[column] = corr_input[column].map(codes)

# Drop whatever is still non-numeric (e.g. `region`) before correlating.
print(corr_input.select_dtypes(include="number").corr())

               age       sex       bmi  children    smoker   charges
age       1.000000 -0.020856  0.109272  0.042469 -0.025019  0.299008
sex      -0.020856  1.000000  0.046371  0.017163  0.076185  0.057292
bmi       0.109272  0.046371  1.000000  0.012759  0.003750  0.198341
children  0.042469  0.017163  0.012759  1.000000  0.007673  0.067998
smoker   -0.025019  0.076185  0.003750  0.007673  1.000000  0.787251
charges   0.299008  0.057292  0.198341  0.067998  0.787251  1.000000


In [5]:
# Histogram of records per sex, coloured by smoker status

import plotly.express as px

figure = px.histogram(
    data_frame=data,
    x="sex",
    color="smoker",
    title="Number of Smokers",
)
figure.show()

In [9]:
# How many records fall into each smoker category?

data.loc[:, "smoker"].value_counts()

no     1064
yes     274
Name: smoker, dtype: int64

In [10]:
# Transforming string values to numerical values
#
# The encoding is guarded so this cell is safe to re-run: a column that is
# already numeric is left alone (mapping it a second time would turn every
# value into NaN), and a label missing from the code table raises immediately
# instead of silently becoming NaN.

label_codes = {
    "sex": {"female": 0, "male": 1},
    "smoker": {"no": 0, "yes": 1},
}
for column, codes in label_codes.items():
    if pd.api.types.is_numeric_dtype(data[column]):
        continue  # already encoded by an earlier run of this cell
    encoded = data[column].map(codes)
    if encoded.isna().any():
        unknown = sorted(set(data.loc[encoded.isna(), column].astype(str)))
        raise ValueError(f"Unexpected {column!r} labels: {unknown}")
    data[column] = encoded
print(data.head())

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [11]:
# Share of records per region, drawn as a pie chart

import plotly.express as px

pie = data["region"].value_counts()
regions, population = pie.index, pie.values
fig = px.pie(data_frame=data, names=regions, values=population)
fig.show()

# Health Insurance Premium Prediction Model

In [14]:
# Feature matrix: one row per record, columns in the order age, sex, bmi, smoker
X = np.array(data.loc[:, ["age", "sex", "bmi", "smoker"]])
# Regression target: the charges column
y = np.array(data.loc[:, "charges"])

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore every score reported below, reproducible across
# kernel restarts.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [16]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest with default hyper-parameters on the training split.
# Seeding the estimator fixes its internal randomness, so the fitted model
# and its scores are the same on every run.
forest = RandomForestRegressor(random_state=42)
forest.fit(Xtrain, ytrain)

RandomForestRegressor()

In [24]:
# Score the fitted forest on both splits (scikit-learn regressors report R^2
# from `score`); a large train/test gap points to over-fitting.
train_score = forest.score(Xtrain, ytrain)
test_score = forest.score(Xtest, ytest)
print("Train score", train_score)
print("Test score", test_score)

Train score 0.9721921726852732
Test score 0.8572507621564573


In [22]:
# Predict charges for the held-out rows and preview them.
ypred = forest.predict(Xtest)
# Keep the predictions under their own name: rebinding `data` here would
# discard the loaded dataset and break any earlier cell that is re-run.
predictions = pd.DataFrame({"Predicted Premium Amount": ypred})
print(predictions.head())

   Predicted Premium Amount
0               8630.123759
1               7619.980475
2               1810.521680
3              42263.016162
4              44426.302052


## Saving and Reloading the Model with Pickle

In [25]:
import pickle

# Serialise the fitted forest to disk. The context manager flushes and closes
# the file even if pickling fails. `pickle.dump` returns None, so its result
# is deliberately not assigned to anything.
with open("Model_Health_insurance.pkl", "wb") as model_file:
    pickle.dump(forest, model_file)

In [26]:
# Reload the persisted model; the context manager closes the file afterwards.
# NOTE: unpickling can execute arbitrary code, so only load files that this
# notebook (or another trusted source) produced.
with open("Model_Health_insurance.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [27]:
# One hypothetical applicant in the training feature order: age, sex, bmi, smoker
# (sex 1 = male, smoker 0 = no, following the label encoding used earlier)
applicant = [[24, 1, 30, 0]]
model.predict(applicant)

array([2968.1531402])