In [19]:
import pandas as pd
import warnings
# Mute every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation notices are
# hidden as well; consider narrowing it to specific warning categories.
warnings.simplefilter('ignore')

In [20]:
# Read the insurance records from the CSV file (path is relative to the
# notebook's working directory).
insurance_df = pd.read_csv(filepath_or_buffer='insurance.csv')

In [21]:
# Show the loaded frame (the notebook renders a truncated head/tail preview).
insurance_df

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


In [22]:
# Count missing values per column; all zeros means nothing needs imputing.
insurance_df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [23]:
# Names of the text-valued (object dtype) columns, i.e. the categorical features.
cat_cols = list(insurance_df.select_dtypes(include='object').columns)
print(cat_cols)

['sex', 'smoker', 'region']


In [24]:
# One-hot encode the categorical columns so the model receives numeric inputs.
from sklearn.preprocessing import OneHotEncoder

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
# keyword was removed in 1.4, so try the current spelling first and fall back
# only on releases that do not know it yet. Either way the encoder returns a
# dense array and encodes unseen categories as all zeros.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# NOTE(review): every category level gets its own indicator column, so the
# indicators of each feature always sum to 1. That is perfectly collinear
# with the intercept of a linear model; consider `drop='first'` if no other
# code relies on the full set of indicator column names.
encoder.fit(insurance_df[cat_cols])

In [25]:
# Ask the fitted encoder for the names of the indicator columns it produces
# (one per category level, prefixed with the source column name).
encoded_cols = encoder.get_feature_names_out(cat_cols).tolist()
print(encoded_cols)

['sex_female', 'sex_male', 'smoker_no', 'smoker_yes', 'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest']


In [26]:
# Encode the categorical columns and attach the resulting indicator columns
# to the frame under their generated names.
encoded_values = encoder.transform(insurance_df[cat_cols])
insurance_df[encoded_cols] = encoded_values
insurance_df

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,female,27.9,0,yes,southwest,16884.92,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,18,male,33.8,1,no,southeast,1725.55,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,28,male,33.0,3,no,southeast,4449.46,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,33,male,22.7,0,no,northwest,21984.47,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,32,male,28.9,0,no,northwest,3866.86,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1334,18,female,31.9,0,no,northeast,2205.98,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1335,18,female,36.9,0,no,southeast,1629.83,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1336,21,female,25.8,0,no,southwest,2007.95,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [27]:
# The raw text columns are now represented by their indicator columns, so
# remove them from the frame (modifies insurance_df in place).
insurance_df.drop(['sex', 'smoker', 'region'], axis='columns', inplace=True)
insurance_df

Unnamed: 0,age,bmi,children,expenses,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.92,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,18,33.8,1,1725.55,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,28,33.0,3,4449.46,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,33,22.7,0,21984.47,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,32,28.9,0,3866.86,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,31.0,3,10600.55,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1334,18,31.9,0,2205.98,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1335,18,36.9,0,1629.83,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1336,21,25.8,0,2007.95,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [28]:
# Summary statistics per column. The value ranges differ widely between
# columns (compare the min/max rows), which motivates the scaling step
# applied to the features further down.
insurance_df.describe()

Unnamed: 0,age,bmi,children,expenses,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.665471,1.094918,13270.422414,0.494768,0.505232,0.795217,0.204783,0.242152,0.2429,0.272048,0.2429
std,14.04996,6.098382,1.205493,12110.01124,0.50016,0.50016,0.403694,0.403694,0.428546,0.428995,0.445181,0.428995
min,18.0,16.0,0.0,1121.87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,26.3,0.0,4740.2875,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,39.0,30.4,1.0,9382.03,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,51.0,34.7,2.0,16639.915,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
max,64.0,53.1,5.0,63770.43,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
# Separate the predictors (X) from the value being predicted (y).
target_col = 'expenses'
y = insurance_df[target_col]
X = insurance_df.drop(columns=[target_col])

In [30]:
# Preview the input features (every column except the target `expenses`).
X

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,18,33.8,1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,28,33.0,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,33,22.7,0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,32,28.9,0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1333,50,31.0,3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1334,18,31.9,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1335,18,36.9,0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1336,21,25.8,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [31]:
# Preview the target values (the single `expenses` column).
y

0       16884.92
1        1725.55
2        4449.46
3       21984.47
4        3866.86
          ...   
1333    10600.55
1334     2205.98
1335     1629.83
1336     2007.95
1337    29141.36
Name: expenses, Length: 1338, dtype: float64

In [32]:
# Learn each feature's minimum and maximum so that all columns can be mapped
# onto a common 0-1 range.
# NOTE(review): the scaler is fitted on every row, before the train/test
# split further down, so test rows influence the learned ranges. Confirm
# this is acceptable or fit on the training split only.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)

In [33]:
# Replace X with its scaled version (each column mapped to the 0-1 range),
# keeping the original row index and column names.
# A fresh DataFrame is built instead of assigning through `X[:]`: the integer
# columns (e.g. age, children) cannot hold the fractional results, and pandas
# >= 2.1 deprecates that kind of silent dtype upcast on assignment.
# Run this cell once per kernel session: re-running it would scale the
# already-scaled values again.
X = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)
X

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.021739,0.320755,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.000000,0.479784,0.2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.217391,0.458221,0.6,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.326087,0.180593,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.304348,0.347709,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1333,0.695652,0.404313,0.6,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1334,0.000000,0.428571,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1335,0.000000,0.563342,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1336,0.065217,0.264151,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [34]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# Fixing random_state makes the shuffle reproducible, so every run yields the
# same train/test rows.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [35]:
# Sanity-check the split: (rows, features) of the train/test inputs and the
# row counts of the matching targets.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 11), (268, 11), (1070,), (268,))

In [36]:
# Fit a linear regression model on the training split.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [37]:
# Predict expenses for the held-out rows; y_pred has one value per row of
# X_test, in the same order.
y_pred = model.predict(X=X_test)

In [38]:
# Actual expenses of the held-out rows, for comparison with the model's
# predictions in y_pred.
y_test

764      9095.07
887      5272.18
890     29330.98
1293     9301.89
259     33750.29
          ...   
109     47055.53
575     12222.90
535      6067.13
543     63770.43
846      9872.70
Name: expenses, Length: 268, dtype: float64

In [39]:
# First five predicted values.
# NOTE(review): the saved output shows unusually round numbers (e.g. 8960.,
# 36352.); worth checking whether the fit is numerically ill-conditioned.
y_pred[:5]

array([ 8960.,  7040., 36352.,  9600., 26624.])

In [40]:
# Error metrics on the held-out rows, printed in this order: Mean Absolute
# Error, Mean Squared Error and Root Mean Squared Error (square root of MSE).
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
for metric in (mae, mse, np.sqrt(mse)):
    print(metric)

4222.237425373134
33727741.89633321
5807.559030809176
