In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw insurance records from disk; the bare name on the last line
# renders the loaded frame as the cell output.
csv_path = "medical_insurance.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [5]:
# Print the column dtypes and non-null counts of df.
# NOTE(review): this cell's execution count (In [5]) is later than the sex/smoker
# encoding cells that follow it (In [3], In [4]), which is why the recorded
# output lists those two columns as int64. On a fresh top-to-bottom run they
# would presumably still be object at this point -- confirm after Restart & Run All.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(4), object(1)
memory usage: 73.3+ KB


In [3]:
# Label-encode sex: female -> 0, male -> 1.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the df['sex'] selection: that chained in-place
# call operates on an intermediate object and stops updating df under pandas
# Copy-on-Write. astype(int) pins a numeric dtype and fails loudly if an
# unmapped label is left behind. Re-running the cell is harmless because
# already-encoded values match nothing in the mapping.
df['sex'] = df['sex'].replace({'female':0,'male':1}).astype(int)

In [4]:
# Label-encode smoker: no -> 1, yes -> 0 (same mapping as the
# label_encoded_data dict used later for the single-record prediction).
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the df['smoker'] selection: that chained
# in-place call operates on an intermediate object and stops updating df under
# pandas Copy-on-Write. astype(int) pins a numeric dtype and fails loudly if an
# unmapped label is left behind.
df['smoker'] = df['smoker'].replace({'no':1,'yes':0}).astype(int)

In [6]:
# One-hot encode region into region_<name> indicator columns.
# dtype=int keeps the indicators as 0/1 integers; without it, pandas >= 2.0
# produces boolean columns (True/False), which no longer match the 0/1 values
# shown in the recorded output or the numeric feature vector built later.
df = pd.get_dummies(df, columns=['region'], dtype=int)
df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,0,16884.92400,0,0,0,1
1,18,1,33.770,1,1,1725.55230,0,0,1,0
2,28,1,33.000,3,1,4449.46200,0,0,1,0
3,33,1,22.705,0,1,21984.47061,0,1,0,0
4,32,1,28.880,0,1,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,1,10600.54830,0,1,0,0
1334,18,0,31.920,0,1,2205.98080,1,0,0,0
1335,18,0,36.850,0,1,1629.83350,0,0,1,0
1336,21,0,25.800,0,1,2007.94500,0,0,0,1


In [7]:
# Target: the charges column. Features: every other column of df.
y = df['charges']
x = df.drop(columns=['charges'])

In [8]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=40,
)

In [9]:
# Ordinary least-squares regression fitted on the training split. fit() is
# left as the last expression so the estimator repr is shown as the cell output.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

LinearRegression()

In [10]:
# Testing data evaluation: score the model on the held-out split.

y_pred = model.predict(x_test)

# Compute every metric first ...
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# ... then report them in the same order as before.
print("MSE is : ", mse)
print("RMSE is : ", rmse)
print("MAE is : ", mae)
print("R2 score is : ", r2)

MSE is :  39737837.0602452
RMSE is :  6303.795448794734
MAE is :  4358.280567455451
R2 score is :  0.7341854344194202


In [11]:
# Training data evaluation: the same metrics on the rows the model was fitted
# on, for comparison with the test-split numbers.
# Note: this reuses (overwrites) the mse / rmse / mae / r2 names.

y_pred_train = model.predict(x_train)

mse = mean_squared_error(y_train, y_pred_train)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
mae = mean_absolute_error(y_train, y_pred_train)
r2 = r2_score(y_train, y_pred_train)

print("MSE is : ", mse)
print("RMSE is : ", rmse)
print("MAE is : ", mae)
print("R2 score is : ", r2)

MSE is :  35738908.17587644
RMSE is :  5978.202754664351
MAE is :  4103.694591054445
R2 score is :  0.7548625668706685


# Testing Data: predicting charges for a single user input

In [18]:
# Raw attributes of the person whose charges are to be predicted.
# Numeric fields:
age, bmi, children = 26, 26.5, 3
# Categorical fields, still as text labels (encoded in later cells):
sex, smoker, region = 'female', 'no', 'southwest'

# charges is the unknown value the model should estimate

In [21]:
# Column labels of the feature frame x (the frame the train/test splits were
# taken from); later cells use them to size and index the single-record vector.
column_names = x.columns
column_names

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

In [14]:
# Zero-filled float vector with one slot per feature column; filled in below.
test_array = np.zeros(shape=column_names.shape, dtype=float)
test_array

array([0., 0., 0., 0., 0., 0., 0., 0., 0.])

# LABEL ENCODING 

In [15]:
# Text label -> numeric code lookup for the two label-encoded features.
label_encoded_data = {
    "sex": {
        'female': 0,
        'male': 1,
    },
    "smoker": {
        'no': 1,
        'yes': 0,
    },
}

In [19]:
# Turn the raw region label into the name of its one-hot column.
# Guard on the prefix so re-running this cell does not stack it
# ("region_region_southwest"), which would match no column.
if not region.startswith("region_"):
    region = "region_" + region
region

'region_southwest'

In [24]:
# Position of the selected region's one-hot column among the feature columns.
# Index.get_loc raises a KeyError naming the missing label when the region is
# unknown, instead of the opaque "index 0 is out of bounds" error that
# np.where(...)[0][0] gives on an empty match.
region_index = column_names.get_loc(region)
region_index

8

In [25]:
# Fill the single-record feature vector.
# Reset first so a re-run with a different region does not leave the previous
# region's one-hot flag set.
test_array[:] = 0

# Look positions up by column name instead of hard-coding 0..4, so the values
# land in the right slots even if the column order of x changes.
feature_values = {
    "age": age,
    "sex": label_encoded_data["sex"][sex],
    "bmi": bmi,
    "children": children,
    "smoker": label_encoded_data["smoker"][smoker],
}
for feature_name, feature_value in feature_values.items():
    test_array[column_names.get_loc(feature_name)] = feature_value

# One-hot flag for the selected region; the other region slots stay 0.
test_array[region_index] = 1

In [26]:
# Display the filled feature vector to check it before predicting.
test_array

array([26. ,  0. , 26.5,  3. ,  1. ,  0. ,  0. ,  0. ,  1. ])

In [27]:
# Predict charges for the single record.
# The model was fitted on a DataFrame (x_train), so the vector is wrapped in a
# one-row DataFrame carrying the same column names. Passing a bare list/array
# makes scikit-learn warn that X has no valid feature names and skips its
# column-name consistency check.
model.predict(pd.DataFrame([test_array], columns=column_names))



array([4500.88888248])