In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import pickle
from pycaret.regression import *
from pycaret.utils import version

In [2]:
# Load the prepared insurance table (raw columns plus their *_encoded copies
# and the 'charges' target) from the working directory.
csv_path = 'df_with_target.csv'
df = pd.read_csv(csv_path)

# Display the column labels so the feature lists defined later can be checked
# against what is actually in the file.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges',
       'sex_encoded', 'smoker_encoded', 'region_encoded'],
      dtype='object')

In [3]:
# (rows, columns) of the loaded frame - a quick sanity check before modelling.
df.shape

(1327, 10)

# Separate Columns

In [4]:
# Raw string columns; each one has an *_encoded counterpart in the frame, so
# these are handed to the experiment as columns to ignore.
ignore_features = [
    'sex',
    'region',
    'smoker',
]

# Columns the experiment should treat as categorical.
categorical_features = [
    'smoker_encoded',
    'region_encoded',
    'sex_encoded',
    'children',
]

# Continuous columns; these are the ones that get standardised.
numeric_features = [
    'age',
    'bmi',
]

# Scale the Numerical Columns

In [5]:
# Standardise the numeric columns in place: learn the per-column statistics
# first, then overwrite the columns with their scaled values.
# NOTE(review): the scaler is fit on every row of df, i.e. before the
# train/holdout split is made by the experiment setup, so holdout rows
# contribute to the learned mean/std (mild data leakage) - consider fitting
# on the training rows only.
scaler = StandardScaler()
scaler.fit(df[numeric_features])
df[numeric_features] = scaler.transform(df[numeric_features])

# Persist the fitted scaler so the same transformation can be applied to new
# inputs at prediction time.
with open('df_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [6]:
# Create the regression experiment
exp = RegressionExperiment()

# Configure the experiment:
#   - 80/20 train/holdout split (train_size=0.8)
#   - raw string columns ignored in favour of their encoded copies
#   - explicit categorical / numeric feature lists
#   - 'charges' as the regression target
#   - 5-fold cross-validation with shuffling
#   - session_id pinned to the value reported in the recorded setup summary,
#     so the split, the fold assignment and every stochastic model are the
#     same on each Restart & Run All (without it PyCaret draws a new random
#     seed per run and the metrics below cannot be reproduced)
exp.setup(df,
          ignore_features=ignore_features,
          categorical_features=categorical_features,
          numeric_features=numeric_features,
          target='charges',
          train_size=0.8,
          fold_shuffle=True,
          fold=5,
          session_id=2136,
         )


Unnamed: 0,Description,Value
0,Session id,2136
1,Target,charges
2,Target type,Regression
3,Original data shape,"(1327, 10)"
4,Transformed data shape,"(1327, 15)"
5,Transformed train set shape,"(1061, 15)"
6,Transformed test set shape,"(266, 15)"
7,Ignore features,3
8,Ordinal features,2
9,Numeric features,2


<pycaret.regression.oop.RegressionExperiment at 0x23c3dd55710>

In [7]:
# Compare models: cross-validate the candidate regressors in the experiment
# and display a leaderboard of their mean fold metrics (MAE, MSE, RMSE, R2,
# RMSLE, MAPE, training time). The return value is not kept; the model to
# pursue is picked by hand from the table.
exp.compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,2356.8918,18692251.3315,4283.5661,0.861,0.4341,0.3049,0.04
rf,Random Forest Regressor,2529.0894,20923384.458,4540.1109,0.8449,0.4733,0.3411,0.116
lightgbm,Light Gradient Boosting Machine,2691.8778,21431943.9317,4598.7797,0.8412,0.5368,0.3667,0.064
ada,AdaBoost Regressor,3670.5956,23169262.9993,4799.5709,0.8299,0.5843,0.6292,0.026
et,Extra Trees Regressor,2489.4989,24377979.3788,4894.6484,0.8198,0.4894,0.3323,0.08
ridge,Ridge Regression,4135.289,33898638.3832,5804.1224,0.7504,0.597,0.4233,0.022
lasso,Lasso Regression,4125.5809,33907647.6555,5803.9529,0.7503,0.5767,0.4213,0.354
llar,Lasso Least Angle Regression,4125.6004,33907840.0095,5803.9683,0.7503,0.5767,0.4214,0.022
br,Bayesian Ridge,4133.1313,33910643.3339,5804.8607,0.7503,0.5919,0.4228,0.028
lr,Linear Regression,4126.5317,33913737.9915,5804.4339,0.7502,0.5751,0.4215,0.468


In [8]:
# Choose the model with the lowest RMSE and highest R^2 in the comparison
# table: Gradient Boosting Regressor ('gbr'). create_model fits it and shows
# the per-fold metrics together with their mean and standard deviation.
gbr_model = exp.create_model('gbr')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2487.9486,18815132.0761,4337.6413,0.8649,0.4756,0.3664
1,2126.3543,13608953.3798,3689.0315,0.9084,0.3827,0.2741
2,2073.0669,12707700.4077,3564.7862,0.9195,0.4106,0.2884
3,2509.2502,22600466.6248,4753.9948,0.8181,0.4097,0.2663
4,2587.8391,25729004.1693,5072.3766,0.7939,0.4917,0.3291
Mean,2356.8918,18692251.3315,4283.5661,0.861,0.4341,0.3049
Std,213.2743,5029113.337,585.9292,0.049,0.042,0.0376


In [9]:
# Tune the model's hyperparameters, selecting candidates by RMSE.
# NOTE: per the message in the recorded output, when the tuned candidate is
# not better than the input model the original model is returned, while the
# metrics table shown is for the tuned candidate. In the recorded run that is
# what happened, so gbr_model_tune is the untuned gbr_model.
gbr_model_tune = exp.tune_model(gbr_model, optimize='rmse')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3125.4808,21620420.6525,4649.7764,0.8448,0.4919,0.3987
1,2668.5372,15067850.6198,3881.7329,0.8986,0.3863,0.3192
2,2653.1491,15302970.0989,3911.9011,0.9031,0.4066,0.3084
3,3081.753,25769407.0989,5076.3577,0.7926,0.4185,0.2886
4,3038.8636,27286423.934,5223.6409,0.7815,0.5031,0.3704
Mean,2913.5568,21009414.4808,4548.6818,0.8441,0.4413,0.3371
Std,208.2068,5104848.5686,564.7198,0.051,0.0472,0.041


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [10]:
# Tuning did not help: the tuned candidate scored a worse mean CV RMSE (4548.68) than the default GBR (4283.57), so the original model is kept

In [11]:
# Evaluate the model - look at Prediction Error, Feature Selection, and Feature Importance.
# This renders an interactive plot-type selector (a widget), so the plots
# themselves are chosen by hand and are not part of the saved output.
exp.evaluate_model(gbr_model_tune)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [12]:
# Make predictions on the holdout data and display the holdout metrics plus
# the rows with a 'prediction_label' column.
# In the recorded run the holdout scores (R2 0.8094, RMSE 4819.92) are
# somewhat below the cross-validation means (R2 0.861, RMSE 4283.57), i.e.
# the model generalizes fairly well but not perfectly.
exp.predict_model(gbr_model_tune)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,2548.6876,23231631.7939,4819.9203,0.8094,0.429,0.3011


Unnamed: 0,age,bmi,children,sex_encoded,smoker_encoded,region_encoded,charges,prediction_label
143,-0.727521,1.379242,3,0,0,2,5138.256836,6695.053462
930,0.341052,-0.139566,2,1,0,0,32108.662109,10088.837992
570,1.623339,1.050265,1,0,0,0,31620.001953,16552.874297
408,-1.439902,-2.186075,0,1,0,1,1621.340210,1279.283626
540,1.694578,0.956748,0,0,0,2,13887.204102,14342.758138
...,...,...,...,...,...,...,...,...
1091,0.910958,0.050807,0,0,0,0,23045.566406,11913.295285
539,-1.368664,0.203606,2,0,0,2,3056.388184,5462.518918
51,0.626005,-0.429301,1,1,1,3,23568.271484,24343.105941
1032,-1.226188,-0.282346,0,1,0,0,2250.835205,3315.682910


In [13]:
# Finalize model by training on entire dataset (including holdout set).
# After this step the holdout rows are part of the training data, so the
# holdout metrics shown earlier no longer describe an unseen set for
# final_model.
final_model = exp.finalize_model(gbr_model_tune)

In [14]:
# Save model: persists the experiment's transformation pipeline together with
# the finalized estimator under the given name (presumably with a .pkl
# extension appended - confirm on disk).
# NOTE: 'age' and 'bmi' were standardised with the separately pickled scaler
# before the experiment was set up, so new inputs must be passed through
# df_scaler.pkl before being given to this saved pipeline.
exp.save_model(final_model, 'medical_insurance_prediction_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['age', 'bmi'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['smoker_encoded', 'region_encoded',
                                              'sex_encoded', 'children'],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('ordinal_encoding',
                  TransformerWrapper(include=[...
 dtype: int64},
                                                                         {'col': 'sex_encoded',
                                                                          'data_type': dtype('float64'),
                                                                          'mapping': 0.0    0
 1.0    1
 NaN   -1
 dtype: int64}]))),
                 ('onehot_encoding',
                  Transform

# Conclusion
It appears that the only relevant features are smoker_encoded, BMI, and age. The line of best fit on the prediction-error plot also appears to be about as good as possible.
There seem to be some entries that deviate from the general pattern of the dataset.
I'm rather happy with this result: the R-squared metric is rather high (about 0.86 on average in cross-validation and about 0.81 on the holdout set), and the line of best fit follows the pattern of the data very well.