In [2]:
import numpy as np 
import pandas as pd 
import matplotlib as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

# pre-processing

In [3]:
ds = pd.read_csv('insurance.csv')
ds.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# numbers of rows and coloums 
ds.shape

(1338, 7)

In [5]:
# getting some information about the dataset 
ds.info()
ds.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


# clean the dataset 

In [6]:
# check the missing values 
ds.isnull().mean()

age         0.0
sex         0.0
bmi         0.0
children    0.0
smoker      0.0
region      0.0
charges     0.0
dtype: float64

# distribution of age,gender,smoker columns 

In [7]:
sns.set()

plt.figure(figsize=(5,5))
sns.histplot(ds['age'], kde=True)
plt.title('Age Distribution')
plt.tight_layout()
plt.show()

TypeError: 'module' object is not callable

In [None]:
# gender columns 
plt.figure(figsize=(5,5))
sns.countplot(x = 'sex', data=ds)
plt.title('Sex Destribution')
plt.tight_layout()
plt.show()

In [None]:
ds['sex'].value_counts()

In [None]:
# bmi distribution
plt.figure(figsize=(5,5))
sns.distplot(ds['bmi'])
plt.title('BMI Distribution')
plt.show()

In [None]:
# children column
plt.figure(figsize=(6,6))
sns.countplot(x='children', data=ds)
plt.title('Children')
plt.show()

In [None]:
ds['children'].value_counts()

In [None]:
# smoker column
plt.figure(figsize=(5,5))
sns.countplot(x='smoker', data=ds)
plt.title('smoker')
plt.show()

In [None]:
ds['smoker'].value_counts()

In [None]:
# region column
plt.figure(figsize=(5,5))
sns.countplot(x='region', data=ds)
plt.title('region')
plt.show()

In [None]:
ds['region'].value_counts()

In [None]:
# distribution of charges value
plt.figure(figsize=(5,5))
sns.distplot(ds['charges'])
plt.title('Charges Distribution')
plt.show()

# Encoding the categorical features

In [None]:
# encoding sex column
ds.replace({'sex':{'male':0,'female':1}}, inplace=True)

3 # encoding 'smoker' column
ds.replace({'smoker':{'yes':0,'no':1}}, inplace=True)

# encoding 'region' column
ds.replace({'region':{'southeast':0,'southwest':1,'northeast':2,'northwest':3}}, inplace=True)

# Splitting the Features and Target

In [None]:
X = ds.drop(columns='charges', axis=1)
Y = ds['charges']

In [None]:
print(X)
print(Y)

# Splitting the data into Training data & Testing Data

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)
print(X.shape, X_train.shape, X_test.shape)

# Model Training

Linear Regression

In [None]:
# loading the Linear Regression model
regressor = LinearRegression()
regressor.fit(X_train, Y_train)


# Model Evaluation

In [None]:
# prediction on training data
training_data_prediction =regressor.predict(X_train)

In [None]:
# R squared value
r2_train = metrics.r2_score(Y_train, training_data_prediction)
print('R squared vale : ', r2_train)

In [None]:
# prediction on test data
test_data_prediction =regressor.predict(X_test)

In [None]:
# R squared value
r2_test = metrics.r2_score(Y_test, test_data_prediction)
print('R squared vale : ', r2_test)

# Building a Predictive System

In [None]:
input_data = (31,1,25.74,0,1,0)

# changing input_data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = regressor.predict(input_data_reshaped)
print(prediction)

print('The insurance cost is USD ', prediction[0])