# **Insurance Forecasting**

## **Importing Libraries**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

import warnings
# NOTE(review): with no category or module given, this hides every warning
# raised after this cell runs (deprecations, library usage warnings, ...).
# Consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings("ignore")

## **Loading the Dataset**

In [2]:
# Read the insurance records from the CSV file (path is relative to the
# notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="data/insurance.csv")

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(1338, 7)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [6]:
# Per-column dtype and non-null count, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## **Data Preprocessing**

### **Data Cleaning**

In [7]:
# Number of missing values in each column.
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [8]:
# Distinct labels present in the "sex" column.
data["sex"].unique()

array(['female', 'male'], dtype=object)

In [9]:
# Distinct labels present in the "region" column.
data["region"].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [10]:
# Number of rows per region, most frequent first.
data["region"].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
# Encoder that maps string labels to integer codes; it is fitted in the cells below.
le = LabelEncoder()

In [13]:
# Learn the label -> code mapping for "sex" and replace the strings with the
# integer codes (female -> 0, male -> 1, as the encoded frame below shows).
data["sex"] = le.fit(data["sex"]).transform(data["sex"])

In [14]:
# Encode every remaining string (object) column as integer codes.
#
# Each column gets its OWN fitted encoder, stored by column name. Re-fitting a
# single shared encoder on every column would overwrite the previous mapping,
# leaving no way to encode raw inputs consistently at prediction time or to
# translate codes back to labels.
#
# "sex" was already encoded with `le` in the previous cell, so that fitted
# encoder is registered up front; if "sex" is still an object column when this
# cell runs, the loop below simply replaces the entry with a fresh encoder.
encoders = {"sex": le}
for column in data.select_dtypes(include=["object"]).columns:
    column_encoder = LabelEncoder()
    data[column] = column_encoder.fit_transform(data[column])
    encoders[column] = column_encoder

In [15]:
# Display the frame after encoding; sex, smoker and region now hold integer codes.
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,3,16884.92400
1,18,1,33.770,1,0,2,1725.55230
2,28,1,33.000,3,0,2,4449.46200
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830
1334,18,0,31.920,0,0,0,2205.98080
1335,18,0,36.850,0,0,2,1629.83350
1336,21,0,25.800,0,0,3,2007.94500


## **Splitting the Dataset**

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Feature matrix: every column except the target.
X = data.drop(columns=["charges"])
# Target vector: the insurance charges to be predicted.
y = data.loc[:, "charges"]

In [18]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## **Model Building**

In [19]:
from sklearn.ensemble import RandomForestRegressor

In [20]:
# Random forest regressor with default hyper-parameters.
# The seed is fixed (same value as the train/test split) so that the fitted
# forest, and therefore the metrics and predictions below, are reproducible
# when the notebook is re-run from a fresh kernel.
rf_model = RandomForestRegressor(random_state=42)

In [21]:
# Train the forest on the training portion of the data.
rf_model.fit(X=X_train, y=y_train)


In [22]:
# Predicted charges for the held-out rows.
y_pred = rf_model.predict(X=X_test)

## **Model Validation**

In [23]:
from sklearn.metrics import r2_score, mean_squared_error

In [24]:
# Coefficient of determination (R^2) on the held-out rows.
r2_score(y_true=y_test, y_pred=y_pred)

0.8498344195843227

In [25]:
# Mean squared error on the held-out rows.
# Arguments are passed as (y_true, y_pred), matching the metric's documented
# signature and the r2_score call above. MSE is symmetric, so the value is the
# same as with the arguments swapped, but the swapped order would give wrong
# results if copied to an asymmetric metric such as r2_score.
mean_squared_error(y_true=y_test, y_pred=y_pred)

22658655.573318988

## **Single Prediction**

In [26]:
# One policyholder to score, as a single-row frame with the same column names
# and order the model was trained on (age, sex, bmi, children, smoker, region).
# Values are already label-encoded: sex 0 = female, smoker 1 = yes,
# region 1 = northwest.
# A named variable is used instead of `input` so the Python builtin is not
# shadowed for the rest of the session, and a DataFrame is passed instead of a
# bare list so the feature names match those seen during fit.
single_input = pd.DataFrame([[61, 0, 29.070, 0, 1, 1]], columns=X.columns)
charg = rf_model.predict(single_input)

In [27]:
# Predicted charges for the single input row (a one-element array).
charg

array([28917.2354385])

## **Saving the model**

In [28]:
import joblib

In [29]:
# Serialise the fitted forest to disk so it can be reloaded without retraining.
# NOTE(review): only the model is written here; the label encodings applied to
# sex / smoker / region are not saved with it, so any consumer of this file
# must reproduce the same integer codes before calling predict.
joblib.dump(value=rf_model, filename="insurance_model.pkl")

['insurance_model.pkl']