# Insurance charges prediction 

The main goal of this project is to train several regression models to predict medical charges based on the patient’s **age**, **gender**, **BMI**, **number of children**, **smoking habits**, and **region**.

In [2]:
import pandas as pd
from pycaret.regression import * 

In [3]:
# Read the raw insurance table from the CSV file next to the notebook.
df_insurance = pd.read_csv(filepath_or_buffer='insurance.csv')

# Preview the first five rows as the cell output.
df_insurance.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


Target variable: **charges**

In [4]:
# Baseline experiment: default preprocessing, with `children` declared as a
# numeric feature and a fixed session_id so the run is repeatable.
s = setup(
    data=df_insurance,
    target='charges',
    session_id=786,
    numeric_features=['children'],
)

# Train and rank the candidate regressors; the leaderboard is the cell output.
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,2827.464,24109920.4389,4856.7161,0.8316,0.4382,0.3143,0.028
rf,Random Forest Regressor,3002.3647,26946114.3469,5151.4449,0.8111,0.4756,0.35,0.106
lightgbm,Light Gradient Boosting Machine,3155.7222,27157181.6696,5164.2948,0.8098,0.5918,0.3819,0.04
et,Extra Trees Regressor,2984.1902,29588336.3843,5386.0351,0.7931,0.4908,0.3458,0.096
ada,AdaBoost Regressor,4293.821,29530539.6619,5408.3823,0.7913,0.6135,0.6942,0.012
llar,Lasso Least Angle Regression,4423.9684,39887772.0457,6280.408,0.7206,0.5534,0.4434,0.007
lr,Linear Regression,4429.485,39912304.0,6282.4139,0.7204,0.5773,0.4435,1.169
lasso,Lasso Regression,4429.4766,39909849.6,6282.2352,0.7204,0.5759,0.4435,0.008
ridge,Ridge Regression,4440.8247,39913427.0,6282.6657,0.7204,0.5673,0.4458,0.008
br,Bayesian Ridge,4436.0394,39912448.7994,6282.522,0.7204,0.571,0.4448,0.023


GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=786, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

We can see that the **Gradient Boosting Regressor** gives the highest R-squared value: 0.8316.

WAYS TO IMPROVE THE PREDICTIONS : 
   
   
   
   
   
   **Remove the outliers**: Removing outliers from the training data may improve the predictions.
    
    
   **Identify hidden clusters**: Clustering in this dataset may be helpful. For example, age and gender groups may cluster together and correlate with medical charges, and thus improve the predictions.
    
    
   **Identify relationships between features and then create interactions between these features**:


   For example, perhaps we find that the age feature alone yields low predictive power, whereas combining the age and smoker features significantly improves the accuracy of the prediction model. Finding such interactions within the dataset can provide better predictions.

In [5]:
# Identifying and removing outliers:
# same experiment as the baseline, plus feature normalization, feature and
# target transformation, and outlier removal with a 0.05 threshold.
s = setup(
    data=df_insurance,
    target='charges',
    session_id=786,
    numeric_features=['children'],
    normalize=True,
    transformation=True,
    transform_target=True,
    remove_outliers=True,
    outliers_threshold=0.05,
)

Unnamed: 0,Description,Value
0,session_id,786
1,Target,charges
2,Original Data,"(1338, 7)"
3,Missing Values,False
4,Numeric Features,3
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(889, 9)"


In [6]:
# Identifying hidden clusters:
# outlier-removal configuration with `create_clusters=True` added on top.
s = setup(
    data=df_insurance,
    target='charges',
    session_id=786,
    numeric_features=['children'],
    normalize=True,
    transformation=True,
    transform_target=True,
    remove_outliers=True,
    outliers_threshold=0.05,
    create_clusters=True,
)


Unnamed: 0,Description,Value
0,session_id,786
1,Target,charges
2,Original Data,"(1338, 7)"
3,Missing Values,False
4,Numeric Features,3
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(889, 12)"


In [7]:
# Finding interactions within the dataset:
# outlier removal and clustering stay enabled; interaction and ratio features
# are switched on with an interaction threshold of 0.5.
s = setup(
    data=df_insurance,
    target='charges',
    session_id=786,
    numeric_features=['children'],
    normalize=True,
    transformation=True,
    transform_target=True,
    remove_outliers=True,
    outliers_threshold=0.05,
    feature_interaction=True,
    feature_ratio=True,
    interaction_threshold=0.5,
    create_clusters=True,
)

Unnamed: 0,Description,Value
0,session_id,786
1,Target,charges
2,Original Data,"(1338, 7)"
3,Missing Values,False
4,Numeric Features,3
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(889, 132)"


In [13]:
# Re-run the model comparison on the most recent setup(), leaving out the
# model with ID 'tr' (TheilSen Regressor in PyCaret's model list — verify
# against the installed version).
compare_models(
    exclude=['tr'],
)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,2587.6981,25854332.2547,5017.6137,0.7855,0.4395,0.2418,0.374
lightgbm,Light Gradient Boosting Machine,2797.6578,27579999.5266,5200.3842,0.768,0.4484,0.2461,0.099
gbr,Gradient Boosting Regressor,2684.394,28148789.0102,5232.305,0.7654,0.4341,0.2397,0.18
ridge,Ridge Regression,2821.4638,28294942.0,5267.23,0.7629,0.4026,0.207,0.01
lr,Linear Regression,2825.5068,28362846.6,5273.7486,0.7624,0.4031,0.2071,0.011
br,Bayesian Ridge,2827.2343,28465991.7128,5282.3214,0.7614,0.4023,0.2092,0.016
et,Extra Trees Regressor,2733.7789,29631090.641,5373.503,0.7546,0.466,0.267,0.302
ada,AdaBoost Regressor,4022.8754,29807936.3969,5424.7763,0.7497,0.531,0.5255,0.059
omp,Orthogonal Matching Pursuit,2919.8906,30801285.9962,5463.2341,0.742,0.4056,0.2109,0.012
huber,Huber Regressor,3040.1457,43420682.1214,6505.0754,0.6392,0.4305,0.1393,0.774


PowerTransformedTargetRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                                max_depth=None, max_features='auto',
                                max_leaf_nodes=None, max_samples=None,
                                min_impurity_decrease=0.0,
                                min_impurity_split=None, min_samples_leaf=1,
                                min_samples_split=2,
                                min_weight_fraction_leaf=0.0, n_estimators=100,
                                n_jobs=-1, oob_score=False,
                                power_transformer_method='box-cox',
                                power...
                                regressor=RandomForestRegressor(bootstrap=True,
                                                                ccp_alpha=0.0,
                                                                criterion='mse',
                                                                max_depth=None,
                      

In [14]:
# Train the model with ID 'rf'; the per-fold metrics table is the cell output.
model = create_model(estimator='rf')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2032.8824,17125664.2927,4138.3166,0.8652,0.4635,0.2671
1,2826.7594,31601275.6102,5621.5012,0.7149,0.5105,0.2139
2,2103.7589,14127188.3736,3758.6152,0.8057,0.3504,0.2443
3,1950.1779,18314533.6294,4279.5483,0.8452,0.4193,0.2028
4,3031.621,32254696.2007,5679.3218,0.713,0.483,0.2724
5,3009.2386,30372247.8285,5511.1022,0.7251,0.4535,0.2441
6,2338.6312,23601549.5211,4858.1426,0.7916,0.4816,0.2376
7,3168.9234,35196235.3822,5932.6415,0.8019,0.4312,0.287
8,2141.6695,17836812.3095,4223.365,0.8326,0.3672,0.2045
9,3273.3184,38113119.3994,6173.5824,0.7603,0.4345,0.2448


In [15]:
# Score the trained model; with no `data` argument PyCaret is expected to
# evaluate on the hold-out split created by setup() (confirm for your version).
predictions = predict_model(estimator=model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,1961.9837,14536652.9719,3812.6963,0.9047,0.3807,0.2228


In [16]:
# Finalize the chosen estimator. The returned object is only shown as the
# cell output here; it is not bound to a name.
finalize_model(estimator=model)

PowerTransformedTargetRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                                max_depth=None, max_features='auto',
                                max_leaf_nodes=None, max_samples=None,
                                min_impurity_decrease=0.0,
                                min_impurity_split=None, min_samples_leaf=1,
                                min_samples_split=2,
                                min_weight_fraction_leaf=0.0, n_estimators=100,
                                n_jobs=-1, oob_score=False,
                                power_transformer_method='box-cox',
                                power...
                                regressor=RandomForestRegressor(bootstrap=True,
                                                                ccp_alpha=0.0,
                                                                criterion='mse',
                                                                max_depth=None,
                      

In [18]:
# Saving model
# finalize_model() returns a NEW pipeline refit on the complete dataset
# (train + hold-out) and leaves `model` untouched, so the finalized estimator
# has to be captured and passed to save_model() explicitly. Saving `model`
# directly would persist the version trained on the training split only.
final_model = finalize_model(model)

# Writes the preprocessing pipeline and the finalized model to
# 'insurance_kaggle.pkl'; the returned object is displayed as the cell output.
save_model(final_model, 'insurance_kaggle')

Transformation Pipeline and Model Succesfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=['children'],
                                       target='charges', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeri...
                                                                                  ccp_alpha=0.0,
                                                                                  criterion='mse',
                                                                                  max_depth=None,
                   