In [8]:
import numpy as np 
import pandas as pd 
import matplotlib as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

# pre-processing

In [9]:
ds = pd.read_csv('insurance.csv')
ds.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [10]:
# numbers of rows and coloums 
ds.shape

(1338, 7)

In [11]:
# getting some information about the dataset 
ds.info()
ds.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


# clean the dataset 

In [12]:
# check the missing values 
ds.isnull().mean()

age         0.0
sex         0.0
bmi         0.0
children    0.0
smoker      0.0
region      0.0
charges     0.0
dtype: float64

# distribution of age,gender,smoker columns 

In [13]:
sns.set()

plt.figure(figsize=(5,5))
sns.histplot(ds['age'], kde=True)
plt.title('Age Distribution')
plt.tight_layout()
plt.show()

TypeError: 'module' object is not callable

In [14]:
# gender columns 
plt.figure(figsize=(5,5))
sns.countplot(x = 'sex', data=ds)
plt.title('Sex Destribution')
plt.tight_layout()
plt.show()

TypeError: 'module' object is not callable

In [15]:
ds['sex'].value_counts()

sex
male      676
female    662
Name: count, dtype: int64

In [16]:
# bmi distribution
plt.figure(figsize=(5,5))
sns.distplot(ds['bmi'])
plt.title('BMI Distribution')
plt.show()

TypeError: 'module' object is not callable

In [17]:
# children column
plt.figure(figsize=(6,6))
sns.countplot(x='children', data=ds)
plt.title('Children')
plt.show()

TypeError: 'module' object is not callable

In [18]:
ds['children'].value_counts()

children
0    574
1    324
2    240
3    157
4     25
5     18
Name: count, dtype: int64

In [19]:
# smoker column
plt.figure(figsize=(5,5))
sns.countplot(x='smoker', data=ds)
plt.title('smoker')
plt.show()

TypeError: 'module' object is not callable

In [20]:
ds['smoker'].value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

In [21]:
# region column
plt.figure(figsize=(5,5))
sns.countplot(x='region', data=ds)
plt.title('region')
plt.show()

TypeError: 'module' object is not callable

In [22]:
ds['region'].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [23]:
# distribution of charges value
plt.figure(figsize=(5,5))
sns.distplot(ds['charges'])
plt.title('Charges Distribution')
plt.show()

TypeError: 'module' object is not callable

# Encoding the categorical features

In [24]:
# encoding sex column
ds.replace({'sex':{'male':0,'female':1}}, inplace=True)

3 # encoding 'smoker' column
ds.replace({'smoker':{'yes':0,'no':1}}, inplace=True)

# encoding 'region' column
ds.replace({'region':{'southeast':0,'southwest':1,'northeast':2,'northwest':3}}, inplace=True)

# Splitting the Features and Target

In [25]:
X = ds.drop(columns='charges', axis=1)
Y = ds['charges']

In [26]:
print(X)
print(Y)

      age  sex     bmi  children  smoker  region
0      19    1  27.900         0       0       1
1      18    0  33.770         1       1       0
2      28    0  33.000         3       1       0
3      33    0  22.705         0       1       3
4      32    0  28.880         0       1       3
...   ...  ...     ...       ...     ...     ...
1333   50    0  30.970         3       1       3
1334   18    1  31.920         0       1       2
1335   18    1  36.850         0       1       0
1336   21    1  25.800         0       1       1
1337   61    1  29.070         0       0       3

[1338 rows x 6 columns]
0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64


# Splitting the data into Training data & Testing Data

In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)
print(X.shape, X_train.shape, X_test.shape)

(1338, 6) (1070, 6) (268, 6)


# Model Training

Linear Regression

In [28]:
# loading the Linear Regression model
regressor = LinearRegression()
regressor.fit(X_train, Y_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


# Model Evaluation

In [29]:
# prediction on training data
training_data_prediction =regressor.predict(X_train)

In [30]:
# R squared value
r2_train = metrics.r2_score(Y_train, training_data_prediction)
print('R squared vale : ', r2_train)

R squared vale :  0.751505643411174


In [31]:
# prediction on test data
test_data_prediction =regressor.predict(X_test)

In [32]:
# R squared value
r2_test = metrics.r2_score(Y_test, test_data_prediction)
print('R squared vale : ', r2_test)

R squared vale :  0.7447273869684077


# Building a Predictive System

In [33]:
input_data = (31,1,25.74,0,1,0)

# changing input_data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = regressor.predict(input_data_reshaped)
print(prediction)

print('The insurance cost is USD ', prediction[0])

[3760.0805765]
The insurance cost is USD  3760.0805764960405
