## ⚕️ Medical Expense Prediction

Given *medical data about various patients*, let's try to predict the **expenses** for a given patient.

We will use various regression models to make our predictions.

Data source: https://www.kaggle.com/datasets/noordeen/insurance-premium-prediction

### Importing Libraries

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import warnings
# NOTE(review): with no category given, this suppresses every warning for the rest of
# the notebook, including anything the estimators emit while fitting (e.g. convergence
# warnings) — confirm this is intended.
warnings.filterwarnings(action = 'ignore')

In [3]:
# Load the raw dataset from a CSV in the working directory and display it.
# `data` is kept unmodified; later cells work on a copy.
data = pd.read_csv('insurance.csv')
data

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


In [4]:
# Column dtypes and non-null counts: used to check for missing values and to spot
# the object (string) columns that need encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Preprocessing

In [5]:
# Work on a copy so the raw `data` frame stays untouched by preprocessing.
df = data.copy()

In [6]:
# Distinct values of every object-typed (string) column, keyed by column name.
{name: values.unique() for name, values in df.select_dtypes('object').items()}

{'sex': array(['female', 'male'], dtype=object),
 'smoker': array(['yes', 'no'], dtype=object),
 'region': array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)}

In [7]:
# Why one-hot encoding is unnecessary for a two-valued column: the two dummy columns
# are perfectly negatively correlated (corr = -1), so one of them is redundant and a
# single 0/1 column carries the same information.
pd.get_dummies(df['sex']).corr()

Unnamed: 0,female,male
female,1.0,-1.0
male,-1.0,1.0


In [9]:
# Binary encoding
# Encode from the untouched `data` frame (same index as `df`, which is a copy of it)
# so that re-running this cell always produces the same result. `.map` builds integer
# columns directly, whereas object -> int `replace` relies on implicit downcasting,
# which recent pandas versions deprecate.
# Any value missing from a mapping becomes NaN instead of passing through silently.
df['smoker'] = data['smoker'].map({'no': 0, 'yes': 1})
df['sex'] = data['sex'].map({'female': 0, 'male': 1})

In [10]:
# Inspect the frame: `sex` and `smoker` should now be 0/1, `region` is still text.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,1,southwest,16884.92
1,18,1,33.8,1,0,southeast,1725.55
2,28,1,33.0,3,0,southeast,4449.46
3,33,1,22.7,0,0,northwest,21984.47
4,32,1,28.9,0,0,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,1,31.0,3,0,northwest,10600.55
1334,18,0,31.9,0,0,northeast,2205.98
1335,18,0,36.9,0,0,southeast,1629.83
1336,21,0,25.8,0,0,southwest,2007.95


In [12]:
# One-hot encode the multi-valued `region` column into integer indicator columns
# named `region_<value>`.
region_dummies = df['region'].pipe(pd.get_dummies, prefix='region', dtype=int)
region_dummies

Unnamed: 0,region_northeast,region_northwest,region_southeast,region_southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0
...,...,...,...,...
1333,0,1,0,0
1334,1,0,0,0
1335,0,0,1,0
1336,0,0,0,1


In [13]:
# Swap the text `region` column for its indicator columns, appended on the right.
df = pd.concat([df.drop(columns='region'), region_dummies], axis='columns')
df

Unnamed: 0,age,sex,bmi,children,smoker,expenses,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.92,0,0,0,1
1,18,1,33.8,1,0,1725.55,0,0,1,0
2,28,1,33.0,3,0,4449.46,0,0,1,0
3,33,1,22.7,0,0,21984.47,0,1,0,0
4,32,1,28.9,0,0,3866.86,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,31.0,3,0,10600.55,0,1,0,0
1334,18,0,31.9,0,0,2205.98,1,0,0,0
1335,18,0,36.9,0,0,1629.83,0,0,1,0
1336,21,0,25.8,0,0,2007.95,0,0,0,1


In [14]:
# Separate the target (`expenses`) from the feature columns.
X = df.drop(columns='expenses')
y = df['expenses']

In [15]:
# Feature matrix: every column of `df` except the `expenses` target.
X

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,0,0,0,1
1,18,1,33.8,1,0,0,0,1,0
2,28,1,33.0,3,0,0,0,1,0
3,33,1,22.7,0,0,0,1,0,0
4,32,1,28.9,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,1,31.0,3,0,0,1,0,0
1334,18,0,31.9,0,0,1,0,0,0
1335,18,0,36.9,0,0,0,0,1,0
1336,21,0,25.8,0,0,0,0,0,1


In [16]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=1,
)

In [17]:
# Sanity check: (rows, columns) of the training and test feature sets.
X_train.shape, X_test.shape

((936, 9), (402, 9))

In [18]:
# Summary statistics of the training features before scaling; the columns are on
# very different ranges (e.g. age vs. the 0/1 indicators).
X_train.describe()

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
count,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0
mean,38.82265,0.511752,30.684829,1.104701,0.206197,0.240385,0.238248,0.264957,0.25641
std,14.029097,0.500129,6.087874,1.222664,0.40479,0.427545,0.426239,0.441546,0.436884
min,18.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,26.0,0.0,26.275,0.0,0.0,0.0,0.0,0.0,0.0
50%,38.0,1.0,30.5,1.0,0.0,0.0,0.0,0.0,0.0
75%,51.0,1.0,34.725,2.0,0.0,0.0,0.0,1.0,1.0
max,64.0,1.0,53.1,5.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# Standardise the features. The scaler learns its statistics from the training set
# only and the same transform is then applied to both splits; each result is wrapped
# back into a DataFrame so row index and column names are preserved.
scaler = StandardScaler().fit(X_train)
X_train, X_test = (
    pd.DataFrame(scaler.transform(part), index=part.index, columns=part.columns)
    for part in (X_train, X_test)
)

In [20]:
# Scaled training features; the original (shuffled) row index is retained.
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
744,0.797152,0.976766,-0.704206,-0.904002,-0.509664,-0.562544,1.788102,-0.600387,-0.587220
363,-1.271085,-1.023787,-0.704206,-0.085679,-0.509664,-0.562544,-0.559252,-0.600387,1.702939
10,-0.985811,0.976766,-0.737076,-0.904002,-0.509664,1.777639,-0.559252,-0.600387,-0.587220
970,0.797152,-1.023787,-0.408379,1.550967,-0.509664,-0.562544,-0.559252,1.665591,-0.587220
634,0.868471,0.976766,1.481632,-0.085679,-0.509664,-0.562544,-0.559252,-0.600387,1.702939
...,...,...,...,...,...,...,...,...,...
715,1.510338,0.976766,-0.293334,-0.904002,-0.509664,-0.562544,-0.559252,-0.600387,1.702939
905,-0.914493,-1.023787,-0.211160,0.732644,-0.509664,1.777639,-0.559252,-0.600387,-0.587220
1096,0.868471,-1.023787,0.709193,0.732644,1.962076,1.777639,-0.559252,-0.600387,-0.587220
235,0.083967,-1.023787,-1.394471,0.732644,1.962076,-0.562544,-0.559252,1.665591,-0.587220


In [21]:
# After scaling every training column should have mean ~0 and standard deviation ~1.
X_train.describe()

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
count,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0
mean,-1.081756e-16,7.591269000000001e-17,4.175198e-17,-2.087599e-17,9.678867000000001e-17,-1.043799e-16,-1.8978170000000002e-17,6.832142000000001e-17,-9.489086e-19
std,1.000535,1.000535,1.000535,1.000535,1.000535,1.000535,1.000535,1.000535,1.000535
min,-1.485041,-1.023787,-2.413434,-0.9040023,-0.5096643,-0.562544,-0.5592522,-0.6003875,-0.5872202
25%,-0.9144925,-1.023787,-0.72475,-0.9040023,-0.5096643,-0.562544,-0.5592522,-0.6003875,-0.5872202
50%,-0.05867016,0.9767656,-0.03037643,-0.08567913,-0.5096643,-0.562544,-0.5592522,-0.6003875,-0.5872202
75%,0.8684707,0.9767656,0.6639971,0.732644,-0.5096643,-0.562544,-0.5592522,1.665591,1.702939
max,1.795612,0.9767656,3.683906,3.187613,1.962076,1.777639,1.788102,1.665591,1.702939


### Training

In [22]:
# Seed shared by every estimator that uses randomness, so a fresh Restart & Run All
# reproduces the same scores. The value matches the random_state of the train/test split.
MODEL_SEED = 1

# Candidate regressors keyed by display name. The names are left-padded to a common
# width so the printed training/score reports line up in a column.
# LinearRegression, KNeighborsRegressor and SVR take no random_state.
# NOTE(review): all estimators otherwise use library defaults and the target is not
# scaled; presumably that is why the MLP and SVM scores are poor — confirm and tune.
models = {
    '                     Linear Regression': LinearRegression(),
    "                   K Nearest Neighbors": KNeighborsRegressor(),
    "                       Neural Networks": MLPRegressor(random_state=MODEL_SEED),
    "Support Vector Machine (Linear Kernel)": LinearSVR(random_state=MODEL_SEED),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(random_state=MODEL_SEED),
    "                         Random Forest": RandomForestRegressor(random_state=MODEL_SEED),
    "                     Gradient Boosting": GradientBoostingRegressor(random_state=MODEL_SEED)
}

In [23]:
# Fit every candidate on the scaled training data, reporting each one as it finishes.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(f"{name} trained.")

                     Linear Regression trained.
                   K Nearest Neighbors trained.
                       Neural Networks trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


### Results

In [25]:
# Report each model's coefficient of determination (R^2) on the held-out test set.
for name, model in models.items():
    r2 = model.score(X_test, y_test)
    print(f"{name} R^2 Score: {r2:.5f}")

                     Linear Regression R^2 Score: 0.74060
                   K Nearest Neighbors R^2 Score: 0.78952
                       Neural Networks R^2 Score: -1.15401
Support Vector Machine (Linear Kernel) R^2 Score: -1.07099
   Support Vector Machine (RBF Kernel) R^2 Score: -0.11296
                         Decision Tree R^2 Score: 0.73392
                         Random Forest R^2 Score: 0.82960
                     Gradient Boosting R^2 Score: 0.86153
