## Medical Cost Prediction

Given *patient data*, let's try to predict the **charges** a given patient will incur.

We will use a variety of linear regression models to make our predictions.

Data source: https://www.kaggle.com/datasets/mirichoi0218/insurance

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV

In [2]:
data = pd.read_csv('insurance.csv')
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Preprocessing

In [4]:
print("Total missing values: ", data.isna().sum().sum())

Total missing values:  0


In [5]:
data['children'] = data['children'].astype(str)

In [6]:
print("Total non-numeric columns: ", len(data.select_dtypes('object').columns))

Total non-numeric columns:  4


In [7]:
{column: data[column].unique() for column in data.select_dtypes('object').columns}

{'sex': array(['female', 'male'], dtype=object),
 'children': array(['0', '1', '3', '2', '5', '4'], dtype=object),
 'smoker': array(['yes', 'no'], dtype=object),
 'region': array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)}

In [8]:
def binary_encode(df, column, positive_value):
    df = df.copy()
    df[column] = df[column].apply(lambda x: 1 if x == positive_value else 0)
    return df

def onehot_encode(df, column, prefix):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df

In [9]:
def preprocess_inputs(df, scaler, train_size=0.7):

    df = df.copy()
    
    # Binary encode sex and smoker columns
    df = binary_encode(df, 'sex', 'male')
    df = binary_encode(df, 'smoker', 'yes')

    # One-hot encode the children and region columns
    df = onehot_encode(df, 'children', 'ch')
    df = onehot_encode(df, 'region', 're')

    # Split df in X and y
    y = df['charges'].copy()
    X = df.drop('charges', axis=1).copy()

    # Scale X with the given scaler
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=123)

    return X_train, X_test, y_train, y_test

In [10]:
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [30]:
X_train, X_test, y_train, y_test = preprocess_inputs(data, StandardScaler())

In [31]:
X_train

Unnamed: 0,age,sex,bmi,smoker,ch_0,ch_1,ch_2,ch_3,ch_4,ch_5,re_northeast,re_northwest,re_southeast,re_southwest
300,-0.228344,0.989591,-0.510736,-0.507463,-0.866781,-0.565267,-0.467525,2.742680,-0.137987,-0.116775,1.769076,-0.566418,-0.611324,-0.566418
904,1.480485,-1.010519,0.727800,-0.507463,1.153694,-0.565267,-0.467525,-0.364607,-0.137987,-0.116775,-0.565267,-0.566418,-0.611324,1.765481
670,-0.655551,0.989591,0.148723,-0.507463,-0.866781,-0.565267,-0.467525,2.742680,-0.137987,-0.116775,-0.565267,-0.566418,1.635795,-0.566418
617,0.697271,0.989591,-0.830622,1.970587,-0.866781,-0.565267,2.138925,-0.364607,-0.137987,-0.116775,-0.565267,-0.566418,-0.611324,1.765481
373,-0.940356,0.989591,0.366902,1.970587,-0.866781,-0.565267,2.138925,-0.364607,-0.137987,-0.116775,-0.565267,-0.566418,-0.611324,1.765481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,-0.157143,0.989591,-1.305531,-0.507463,-0.866781,-0.565267,-0.467525,2.742680,-0.137987,-0.116775,1.769076,-0.566418,-0.611324,-0.566418
1147,-1.367563,-1.010519,0.206139,-0.507463,1.153694,-0.565267,-0.467525,-0.364607,-0.137987,-0.116775,-0.565267,1.765481,-0.611324,-0.566418
106,-1.438764,-1.010519,-0.371298,-0.507463,-0.866781,1.769076,-0.467525,-0.364607,-0.137987,-0.116775,-0.565267,-0.566418,-0.611324,1.765481
1041,-1.509965,0.989591,-1.243194,-0.507463,1.153694,-0.565267,-0.467525,-0.364607,-0.137987,-0.116775,1.769076,-0.566418,-0.611324,-0.566418


### Training

In [32]:
models = {
    '         OLS Model': LinearRegression(),
    '          L2 Model': Ridge(),
    '          L1 Model': Lasso(),
    '  ElasticNet Model': ElasticNet(),
    '       L2 CV Model': RidgeCV(),
    '       L1 CV Model': LassoCV(),
    'ElasticNetCV Model': ElasticNetCV()
}

In [33]:
for model in models.values():
    model.fit(X_train, y_train)

In [34]:
print("Model R^2 Scores:\n---------------------------\n")

for name, model in models.items():
    print(name, model.score(X_test, y_test))

Model R^2 Scores:
---------------------------

         OLS Model 0.7593545908497942
          L2 Model 0.7593579364036089
          L1 Model 0.7593697076110314
  ElasticNet Model 0.6722813607835507
       L2 CV Model 0.7593579364036287
       L1 CV Model 0.760087586650097
ElasticNetCV Model 0.13980401601000714
