#### Importing the dependencies

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")

#### Data Collection and Analysis

In [2]:
# Load the insurance dataset from a CSV file stored next to the notebook.
medical_data = pd.read_csv("./medical_insurance_data.csv")

In [3]:
# Preview the first rows to check that the file was parsed into the expected columns.
medical_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Preview the last rows as well, to confirm the file was read through to the end.
medical_data.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [5]:
# Column dtypes and non-null counts: shows which columns are text and whether values are missing.
medical_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
# Summary statistics for the numeric columns.
medical_data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [20]:
# (rows, columns) of the loaded frame.
medical_data.shape

(1338, 7)

In [23]:
# Class balance of the smoker flag.
medical_data['smoker'].value_counts()

no     1064
yes     274
Name: smoker, dtype: int64

In [21]:
# Number of records per region.
medical_data['region'].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

#### Encoding the Categorical data

In [24]:
# Encode the text columns as integer codes so the regressors can consume them.
# The mapping is applied without `inplace=True` and the result is re-assigned,
# and the encoded columns are cast to int64 explicitly instead of relying on
# `replace` silently downcasting object columns (deprecated in recent pandas).
# Re-running this cell is harmless: already-encoded values match no key.
# NOTE(review): `region` gets arbitrary ordinal codes, which imposes an
# ordering on a nominal feature for linear models; one-hot encoding may be a
# better fit, but would change the feature layout used by the cells below.
CATEGORY_CODES = {
    'sex': {'female': 0, 'male': 1},
    'smoker': {'no': 0, 'yes': 1},
    'region': {'northeast': 0, 'southeast': 1, 'southwest': 2, 'northwest': 3},
}
medical_data = (
    medical_data
    .replace(CATEGORY_CODES)
    .astype({column: 'int64' for column in CATEGORY_CODES})
)

In [25]:
# Display the frame (pandas truncates the middle rows) to check the values in sex, smoker and region.
medical_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,2,16884.92400
1,18,1,33.770,1,0,1,1725.55230
2,28,1,33.000,3,0,1,4449.46200
3,33,1,22.705,0,0,3,21984.47061
4,32,1,28.880,0,0,3,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,3,10600.54830
1334,18,0,31.920,0,0,0,2205.98080
1335,18,0,36.850,0,0,1,1629.83350
1336,21,0,25.800,0,0,2,2007.94500


#### Splitting data into Features and Target

In [26]:
# Features: every column except the target. Target: the `charges` column.
X = medical_data.drop(columns=['charges'])
Y = medical_data.loc[:, 'charges']

In [27]:
# Inspect the feature matrix (all columns except `charges`).
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,1,2
1,18,1,33.770,1,0,1
2,28,1,33.000,3,0,1
3,33,1,22.705,0,0,3
4,32,1,28.880,0,0,3
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,3
1334,18,0,31.920,0,0,0
1335,18,0,36.850,0,0,1
1336,21,0,25.800,0,0,2


In [28]:
# Inspect the target series (`charges`).
Y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

#### Splitting data into Train and Test data

In [30]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

#### Model Training

##### Linear Regression

In [31]:
# model = LinearRegression()

In [33]:
# model.fit(X_train, Y_train)

In [34]:
# Y_train_cap = model.predict(X_train)
# metrics.r2_score(Y_train, Y_train_cap)

0.7512845736176552

In [35]:
# Y_test_cap = model.predict(X_test)
# metrics.r2_score(Y_test, Y_test_cap)

0.742663590597133

##### Lasso Regression

In [36]:
# Fit a Lasso regressor with its default settings on the training split.
# This `model` is the one evaluated below and used by the predictive system.
model = Lasso()
model.fit(X_train, Y_train)

In [37]:
# R^2 of the Lasso model on the rows it was trained on.
Y_train_cap = model.predict(X_train)
metrics.r2_score(y_true=Y_train, y_pred=Y_train_cap)

0.7512844882962814

In [38]:
# R^2 of the Lasso model on the held-out test rows.
Y_test_cap = model.predict(X_test)
metrics.r2_score(y_true=Y_test, y_pred=Y_test_cap)

0.7426701647665229

#### Building a Predictive System

In [44]:
# Predict the insurance charge for a single new record.
# Values are given in the training column order:
#   age, sex, bmi, children, smoker, region
# using the same integer codes as the encoding step
# (here: sex=0 female, smoker=0 no, region=1 southeast).
input_data = (46,0,33.44,1,0,1)

# Wrap the record in a one-row DataFrame carrying the training column names.
# The model was fitted on a DataFrame, so this lets scikit-learn match features
# by name instead of trusting positional order in a bare NumPy array (which
# triggers a feature-name warning that the notebook's global filter hides).
input_frame = pd.DataFrame([input_data], columns=X_train.columns)

prediction = model.predict(input_frame)
print("Insurance cost is:", prediction[0])

Insurance cost is: 11000.685155984804
