In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import scale
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Exporting the clean data from the Report 1 
clean_df = pd.read_csv("exported_dataframe.csv")

In [3]:
# Dropping Location columns as it is not a good indicator for prediction as per EDA, VIF and Clustering methods applied before
df = clean_df.drop(['Location'], axis = 1)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   years_of_insurance_with_us      25000 non-null  float64
 1   regular_checkup_lasy_year       25000 non-null  float64
 2   adventure_sports                25000 non-null  float64
 3   Occupation                      25000 non-null  float64
 4   visited_doctor_last_1_year      25000 non-null  float64
 5   cholesterol_level               25000 non-null  float64
 6   daily_avg_steps                 25000 non-null  float64
 7   age                             25000 non-null  float64
 8   heart_decs_history              25000 non-null  float64
 9   other_major_decs_history        25000 non-null  float64
 10  Gender                          25000 non-null  float64
 11  avg_glucose_level               25000 non-null  float64
 12  bmi                             

<B> As part of 1 A. We will be using 6 Models - 
    Linear Regression, 
    Lasso Regression, 
    Elastic Net Regression, 
    Decision Tree Regressor,
    KNN, 
    Neural Network,
   </B>

<b> #1 Model - Linear Regression <b> 

In [5]:
# Copy all the predictor variables into X dataframe
X = df.drop('insurance_cost', axis=1)

# Copy target into the y dataframe. 
y = df[['insurance_cost']]

<b> Splitting the data into Train and Test set. 75% and 25% </b>

In [15]:
# Split X and y into training and test set in 75:25 ratio
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25 , random_state=1, stratify=y)
Default_train = pd.concat([X_train,y_train], axis=1)
Default_test = pd.concat([X_test,y_test], axis=1)

<b> Count of rows in Train set and Test set </b> 

In [20]:
print("The count of values in Train dataset")
len(Default_train.index)

The count of values in Train dataset


18750

In [21]:
print("The count of values in Test dataset")
len(Default_test.index)

The count of values in Test dataset


6250

In [6]:
# invoke the LinearRegression function and find the bestfit model on training data
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

LinearRegression()

In [7]:
# Let us explore the coefficients for each of the independent attributes
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for years_of_insurance_with_us is -6.803374896227611
The coefficient for regular_checkup_lasy_year is -435.17047278889765
The coefficient for adventure_sports is 141.74793926320203
The coefficient for Occupation is -2.416491210399994
The coefficient for visited_doctor_last_1_year is -59.81834800163315
The coefficient for cholesterol_level is 30.372145613819402
The coefficient for daily_avg_steps is -0.03920273552472452
The coefficient for age is 3.0117264671940376
The coefficient for heart_decs_history is 257.77626227838033
The coefficient for other_major_decs_history is 66.36629958889448
The coefficient for Gender is -40.47023484115922
The coefficient for avg_glucose_level is -0.40447477258946946
The coefficient for bmi is -6.297635488500966
The coefficient for smoking_status is -5.960965274682659
The coefficient for weight is 1486.844620273738
The coefficient for covered_by_any_other_company is -1209.0873509495225
The coefficient for Alcohol is 19.465470952155275
The 

In [8]:
# Let us check the intercept for the model
intercept = regression_model.intercept_[0]
print("The intercept for our model is {}".format(intercept))

The intercept for our model is -76760.18302662368


In [9]:
# R square on training data
regression_model.score(X_train, y_train)

0.9449211409874587

In [10]:
# R square on testing data
regression_model.score(X_test, y_test)

0.9438339025535056

In [11]:
#RMSE on Training data
predicted_train=regression_model.fit(X_train, y_train).predict(X_train)
np.sqrt(metrics.mean_squared_error(y_train,predicted_train))

3361.38624609071

In [12]:
#RMSE on Testing data
predicted_test=regression_model.fit(X_train, y_train).predict(X_test)
np.sqrt(metrics.mean_squared_error(y_test,predicted_test))

3395.036008702231

In [13]:
#display adjusted R-squared
1 - (1-regression_model.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

0.94460493756696

In [14]:
from sklearn.metrics import mean_absolute_percentage_error

In [15]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, predicted_train)

0.15263572729369276

In [16]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, predicted_test)

0.1527539202707772

<b> #2 Lasso Regression </b>

In [17]:
from sklearn.linear_model import Lasso

In [18]:
lasso = Lasso(alpha=0.01)
lasso.fit(X_train,y_train)
pred_train_lasso= lasso.predict(X_train)
print ("Lasso model:", (lasso.coef_)) 
# Coeff is feature importance

Lasso model: [-6.79949038e+00 -4.35163440e+02  1.41614924e+02 -2.39734257e+00
 -5.98099068e+01  3.03710361e+01 -3.91984088e-02  3.01168305e+00
  2.57604112e+02  6.62700276e+01 -4.04358545e+01 -4.04477065e-01
 -6.29592186e+00 -5.96040566e+00  1.48684476e+03 -1.20903344e+03
  1.94407473e+01  8.22193724e+00  1.59913787e+02 -4.57606167e+00]


In [19]:
# R square on training  data
print(lasso.score(X_train, y_train))

0.9449211409589048


In [20]:
# RMSE on training  data
print(np.sqrt(mean_squared_error(y_train,pred_train_lasso)))

3361.386246962013


In [21]:
pred_test_lasso= lasso.predict(X_test)

In [22]:
# R square on test  data
print(lasso.score(X_test, y_test))

0.9438339347118933


In [23]:
# RMSE on training  data
print(np.sqrt(mean_squared_error(y_test,pred_test_lasso)))

3395.0350367733918


In [24]:
#display adjusted R-squared
1 - (1-lasso.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

0.9446049455938231

In [25]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_lasso)

0.15263568154556945

In [26]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_lasso)

0.15275385049178591

<b> #3 Elastic Net Regression </b>

In [27]:
from sklearn.linear_model import ElasticNet

In [28]:
model_enet = ElasticNet(alpha = 0.1)
model_enet.fit(X_train, y_train) 
pred_train_enet= model_enet.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_enet)))
print(r2_score(y_train, pred_train_enet))

pred_test_enet= model_enet.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_enet)))
print(r2_score(y_test, pred_test_enet))

3363.3807892281325
0.9448557573956046
3396.7314654076436
0.943777790651201


In [29]:
#display adjusted R-squared
1 - (1-model_enet.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

0.9445418220544701

In [30]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_enet)

0.15238425186908777

In [31]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_enet)

0.15243537468841653

<b> #4 Decision Tree Regressor <b>

In [105]:
from sklearn.tree import DecisionTreeRegressor

In [106]:
dt = DecisionTreeRegressor()

In [107]:
dt.fit(X_train, y_train)

DecisionTreeRegressor()

In [108]:
pred_train_dt= dt.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_dt)))
print(r2_score(y_train, pred_train_dt))

0.0
1.0


In [109]:
pred_test_dt= dt.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_dt)))
print(r2_score(y_test, pred_test_dt))

4420.683363644132
0.9047720801541184


In [84]:
pred_test_dt=dt.predict(X_test)

In [85]:
#display adjusted R-squared
1 - (1-dt.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

0.976210647720655

In [86]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_dt)

0.0

In [87]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_dt)

0.16175369599134345

<b> #5 KNN </b>

<b>  For KNN and Neural Network, we will need Scaler as they are dependent on distance and weights respectively. </b>

In [110]:
from sklearn.neighbors import KNeighborsRegressor

In [161]:
# For KNN and Neural Network, we will need Scaling as they are dependent on distance and weights respectively. 
df_scaled = df

In [162]:
# Copy all the predictor variables into X dataframe
X = df_scaled .drop('insurance_cost', axis=1)

# Copy target into the y dataframe. 
y = df_scaled [['insurance_cost']]

In [163]:
from scipy.stats import zscore
XScaled  = X.apply(zscore)  # convert all attributes to Z scale 
XScaled.describe()

Unnamed: 0,years_of_insurance_with_us,regular_checkup_lasy_year,adventure_sports,Occupation,visited_doctor_last_1_year,cholesterol_level,daily_avg_steps,age,heart_decs_history,other_major_decs_history,Gender,avg_glucose_level,bmi,smoking_status,weight,covered_by_any_other_company,Alcohol,exercise,weight_change_in_last_one_year,fat_percentage
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,-6.197443e-16,-1.05647e-15,-1.265019e-15,-7.321233e-16,4.659961e-16,-5.448442e-16,8.818057e-16,-8.559375e-17,-1.325162e-17,3.912026e-16,-7.31788e-16,-8.428813e-18,4.6469490000000006e-17,-3.528555e-16,4.625011e-16,-1.074723e-15,3.292149e-15,-8.366641e-18,-8.179235e-17,-3.092238e-16
std,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002
min,-1.56875,-0.6450427,-0.298316,-1.632547,-2.71907,-1.002742,-2.754823,-1.795369,-0.2404123,-0.3299154,-0.7227365,-1.762039,-2.472117,-1.451776,-2.103001,-1.515679,-1.961268,-1.545002,-1.489652,-2.063467
25%,-0.801455,-0.6450427,-0.298316,-0.2881346,-0.9672049,-1.002742,-0.6576392,-0.8641071,-0.2404123,-0.3299154,-0.7227365,-0.8693024,-0.6590968,-0.5363603,-0.8161376,-1.515679,-0.371085,0.00832584,-0.8980412,-0.9050151
50%,-0.03415997,-0.6450427,-0.298316,-0.2881346,-0.09127219,-0.2101859,-0.05425957,0.005071034,-0.2404123,-0.3299154,-0.7227365,0.007492612,-0.1022408,-0.5363603,0.0417716,0.6597703,-0.371085,0.00832584,0.28518,0.2534369
75%,0.733135,0.1886905,-0.298316,1.056278,0.7846605,0.5823702,0.5892478,0.8742492,-0.2404123,-0.3299154,1.38363,0.8683459,0.5064158,1.294471,0.6852035,0.6597703,1.219098,0.00832584,0.8767906,0.8326628
max,1.50043,3.523623,3.35215,1.056278,7.792122,2.167482,2.805643,1.805511,4.15952,3.031081,1.38363,1.745141,8.962858,1.294471,2.615499,0.6597703,1.219098,1.561654,2.060012,1.527734


In [164]:
# Split X and y into training and test set in 75:25 ratio
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(XScaled, y, test_size=0.25 , random_state=1, stratify=y)

In [115]:
NNH = KNeighborsRegressor(n_neighbors= 5 , weights = 'distance' )

In [116]:
NNH.fit(X_train, y_train)

KNeighborsRegressor(weights='distance')

In [117]:
predicted_labels_train = NNH.predict(X_train)
NNH.score(X_train, y_train)

0.9999999999999997

In [118]:
predicted_labels_test = NNH.predict(X_test)
NNH.score(X_test, y_test)

0.7434544196160349

In [18]:
#display adjusted R-squared
1 - (1-NNH.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

-0.5813921223320413

In [22]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, predicted_labels_train)

3.05654248666775e-09

In [23]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, predicted_labels_test)

0.32852809100577085

<b> #6 Neural Network </b>

In [24]:
from sklearn.neural_network import MLPRegressor

In [25]:
mlp = MLPRegressor()

In [26]:
mlp.fit(X_train, y_train)

MLPRegressor()

In [27]:
pred_train_mlp= mlp.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_mlp)))
print(r2_score(y_train, pred_train_mlp))

4347.945466706885
0.9078455578739867


In [28]:
pred_test_mlp = mlp.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_mlp)))
print(r2_score(y_test, pred_test_mlp))

4374.838611635203
0.9067369632897377


In [29]:
#display adjusted R-squared
1 - (1-mlp.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

-2222440.9441541247

In [30]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_mlp)

0.20004231855583043

In [31]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_mlp)

0.19949486439341207

<b> Ensemble Models - Random Forest , Gradient Boost , XGBoost  </b>

<b> Random Forest Regressor </b> 

In [32]:
from sklearn.ensemble import RandomForestRegressor

In [33]:
rfr = RandomForestRegressor(n_estimators = 1000, random_state = 42)

In [34]:
rfr.fit(X_train, y_train)

RandomForestRegressor(n_estimators=1000, random_state=42)

In [35]:
pred_train_rfr= rfr.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_rfr)))
print(r2_score(y_train, pred_train_rfr))

1144.530093774009
0.9936143879464452


In [36]:
pred_test_rfr = rfr.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_rfr)))
print(r2_score(y_test, pred_test_rfr))

3133.9876832740974
0.9521391893787464


In [37]:
#display adjusted R-squared
1 - (1-rfr.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

0.983229259120087

In [38]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_rfr)

0.04507272583341721

In [39]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_rfr)

0.12361049401929315

<b> Gradient Boost </b>

In [40]:
from sklearn.ensemble import GradientBoostingRegressor

In [41]:
gbr = GradientBoostingRegressor()

In [42]:
gbr.fit(X_train, y_train)

GradientBoostingRegressor()

In [43]:
pred_train_gbr= gbr.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_gbr)))
print(r2_score(y_train, pred_train_gbr))

2978.2050417117835
0.9567628273882008


In [44]:
pred_test_gbr = gbr.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_gbr)))
print(r2_score(y_test, pred_test_gbr))

3072.8868332370853
0.953987205250881


In [45]:
#display adjusted R-squared
1 - (1-gbr.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

0.9560335526783004

In [46]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_gbr)

0.12024866876768003

In [47]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_gbr)

0.12333915086811069

<b> XGBoost </b>

In [48]:
import xgboost as xgb

In [50]:
xgboost = xgb.XGBRegressor()

In [51]:
xgboost.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [52]:
pred_train_xgboost= xgboost.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_xgboost)))
print(r2_score(y_train, pred_train_xgboost))

2296.7000552574423
0.9742867685330494


In [53]:
pred_test_xgboost = xgboost.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_xgboost)))
print(r2_score(y_test, pred_test_xgboost))

3188.3492915854
0.9504644184409494


In [54]:
#display adjusted R-squared
1 - (1-xgboost.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

0.9683041508420924

In [55]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_xgboost)

0.08994131811253583

In [56]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_xgboost)

0.12646674441961947

<b> Model Tuning - Hyper parameter tuning of the above Models </b>

<b> Tuning Linear Regression </b>

In [60]:
 from sklearn.model_selection import GridSearchCV

In [61]:
from sklearn.linear_model import SGDRegressor

In [66]:
# setting parameters for Grid Search, to find the best combinations
param_grid_search = {
    'max_iter': [1000,1500],
    'tol': [1e-2],
    'penalty': ['l2'], 
    'random_state': [1,30,45],
    'learning_rate':['invscaling']
}

ft_lr = SGDRegressor()

grid_search_ft_lr = GridSearchCV(estimator = ft_lr, param_grid = param_grid_search, cv = 10)

In [67]:
grid_search_ft_lr.fit(X_train, y_train)
print(grid_search_ft_lr.best_params_)
best_grid_lr = grid_search_ft_lr.best_estimator_
best_grid_lr 

{'learning_rate': 'invscaling', 'max_iter': 1000, 'penalty': 'l2', 'random_state': 30, 'tol': 0.01}


SGDRegressor(random_state=30, tol=0.01)

In [68]:
pred_train_ftlr= grid_search_ft_lr.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_ftlr)))
print(r2_score(y_train, pred_train_ftlr))

3550053853466174.5
-6.143529953895646e+22


In [69]:
pred_test_ftlr= grid_search_ft_lr.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_ftlr)))
print(r2_score(y_test, pred_test_ftlr))

3539007160295076.5
-6.103070151759108e+22


In [70]:
#display adjusted R-squared
1 - (1-grid_search_ft_lr.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

-6.138322978465769e+22

In [71]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_ftlr)

197870071564.1879

In [72]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_ftlr)

197326437607.8904

<b> Fine tune Lasso Regression </b>

In [76]:
# setting parameters for Grid Search, to find the best combinations
param_grid_search = {
    'alpha': [1,0.1,0.01],
    'max_iter': [1000,1500],
    'tol': [1e-2],
    'random_state': [1,30,45],
    'selection':['cyclic','random']
}

lsso_ft = Lasso()

grid_search_ft_lasso = GridSearchCV(estimator = lsso_ft, param_grid = param_grid_search, cv = 10)

In [77]:
grid_search_ft_lasso.fit(X_train, y_train)
print(grid_search_ft_lasso.best_params_)
best_grid_lasso = grid_search_ft_lasso.best_estimator_
best_grid_lasso

{'alpha': 1, 'max_iter': 1000, 'random_state': 1, 'selection': 'random', 'tol': 0.01}


Lasso(alpha=1, random_state=1, selection='random', tol=0.01)

In [78]:
pred_train_ftlasso = grid_search_ft_lasso.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_ftlasso)))
print(r2_score(y_train, pred_train_ftlasso))

3361.394946569643
0.9449208558592184


In [79]:
pred_test_ftlasso= grid_search_ft_lasso.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_ftlasso)))
print(r2_score(y_test, pred_test_ftlasso))

3394.9473236476388
0.9438368368542597


In [81]:
#display adjusted R-squared
1 - (1-grid_search_ft_lasso.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

0.9446054579383294

In [82]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_ftlasso)

0.15263146435499464

In [83]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_ftlasso)

0.15274730890248825

<b> Fine Tune Elastic Net </b>

In [154]:
# setting parameters for Grid Search, to find the best combinations
param_grid_search = {
    'alpha': [1,0.1,0.01],
    'max_iter': [1000,1500],
    'tol': [0.01,0.1],
    'random_state': [1,30,45],
    'selection':['cyclic','random']
}

elasticnet_ft = ElasticNet()

grid_search_ft_elastic = GridSearchCV(estimator = elasticnet_ft, param_grid = param_grid_search, cv = 10)

In [155]:
grid_search_ft_elastic.fit(X_train, y_train)
print(grid_search_ft_elastic.best_params_)
best_grid_elastic = grid_search_ft_elastic.best_estimator_
best_grid_elastic

{'alpha': 0.01, 'max_iter': 1000, 'random_state': 1, 'selection': 'random', 'tol': 0.01}


ElasticNet(alpha=0.01, random_state=1, selection='random', tol=0.01)

In [151]:
pred_train_ftelastic = grid_search_ft_elastic.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_ftelastic)))
print(r2_score(y_train, pred_train_ftelastic))

3361.4182259087747
0.9449200929554027


In [152]:
pred_test_ftelastic= grid_search_ft_elastic.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_ftelastic)))
print(r2_score(y_test, pred_test_ftelastic))

3395.0204177667374
0.9438344184125359


In [91]:
#display adjusted R-squared
1 - (1-grid_search_ft_elastic.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

0.944604280091486

In [92]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_ftelastic)

0.15259712750112908

In [93]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_ftelastic)

0.15270890650413593

<b> Fine tune Decision Tree Regressor </b>

In [144]:
# setting parameters for Grid Search, to find the best combinations
param_grid_search = {
    'criterion': ['squared_error','friedman_mse','poisson'],
    'max_depth': [5,10,15,20],
    'min_samples_leaf': [40,50,60,70], 
    'min_samples_split': [100,150,200,350]
}

dt_ft = DecisionTreeRegressor()

grid_search_dtft = GridSearchCV(estimator = dt_ft, param_grid = param_grid_search, cv = 10)

In [145]:
grid_search_dtft.fit(X_train, y_train)
print(grid_search_dtft.best_params_)
best_grid_dtft = grid_search_dtft.best_estimator_
best_grid_dtft

{'criterion': 'friedman_mse', 'max_depth': 10, 'min_samples_leaf': 60, 'min_samples_split': 200}


DecisionTreeRegressor(criterion='friedman_mse', max_depth=10,
                      min_samples_leaf=60, min_samples_split=200)

In [146]:
pred_train_ftdt = grid_search_dtft.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_ftdt)))
print(r2_score(y_train, pred_train_ftdt))

2995.211746316835
0.9562676154746617


In [147]:
pred_test_ftdt= grid_search_dtft.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_ftdt)))
print(r2_score(y_test, pred_test_ftdt))

3120.6987306910387
0.9525442143238699


In [102]:
#display adjusted R-squared
1 - (1-grid_search_dtft.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

0.9553007431971461

In [103]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_ftdt)

0.11885225057707784

In [104]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_ftdt)

0.12259928008553725

In [148]:
# Getting the important features from the Decision Tree fine tuned Model
grid_search_dtft.best_estimator_.feature_importances_

array([3.50907037e-04, 1.13890868e-03, 0.00000000e+00, 3.69483550e-05,
       2.78456005e-05, 2.34511769e-05, 1.23516911e-04, 1.27951856e-04,
       0.00000000e+00, 0.00000000e+00, 2.03446552e-05, 6.24392199e-05,
       1.19770278e-04, 1.44902108e-05, 9.95260638e-01, 2.08792453e-03,
       2.94109717e-05, 2.42246059e-05, 4.41937942e-04, 1.09290269e-04])

<b> Fine Tune KNN  </b>

<b> Have copied the scaled values code from above KNN model. This is for KNN and Neural network as they require scaled values </b> 

In [157]:
from sklearn.neighbors import KNeighborsRegressor

In [165]:
# setting parameters for Grid Search, to find the best combinations
param_grid_search = {
    'n_neighbors': [10,25,50],
    'weights': ['unifrom','distance'],
    'algorithm': ['auto'], 
    'leaf_size': [25,30,50],
    
}

knn_ft = KNeighborsRegressor()

grid_search_knnft = GridSearchCV(estimator = knn_ft, param_grid = param_grid_search, cv = 10)

In [166]:
grid_search_knnft.fit(X_train, y_train)
print(grid_search_knnft.best_params_)
best_grid_dtknn = grid_search_knnft.best_estimator_
best_grid_dtknn

{'algorithm': 'auto', 'leaf_size': 25, 'n_neighbors': 10, 'weights': 'distance'}


KNeighborsRegressor(leaf_size=25, n_neighbors=10, weights='distance')

In [167]:
pred_train_ftknn = grid_search_knnft.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_ftknn)))
print(r2_score(y_train, pred_train_ftknn))

0.00047723994573107653
0.9999999999999989


In [42]:
pred_test_ftknn= grid_search_knnft.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_ftknn)))
print(r2_score(y_test, pred_test_ftknn))

6980.123551878591
0.7625830621783506


In [43]:
#display adjusted R-squared
1 - (1-grid_search_knnft.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

-0.06982625703597822

In [44]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_ftknn)

6.178095525768879e-09

In [45]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_ftknn)

0.32884008005242277

<b> Fine tune  Neural Network </b>

In [46]:
# setting parameters for Grid Search, to find the best combinations
param_grid_search = {
          'hidden_layer_sizes': [50,100,150],
          'activation': ['relu','tanh'],
          'alpha': [0.001, 0.05],
          'learning_rate': ['constant','adaptive'],
          'solver': ['adam','sgd'],
}

nn_ft = MLPRegressor()

grid_search_nnft = GridSearchCV(estimator = nn_ft , param_grid = param_grid_search, cv = 10)

In [48]:
grid_search_nnft.fit(X_train, y_train)
print(grid_search_nnft.best_params_)
best_grid_ftnn = grid_search_nnft.best_estimator_
best_grid_ftnn

{'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': 100, 'learning_rate': 'adaptive', 'solver': 'adam'}


MLPRegressor(alpha=0.001, hidden_layer_sizes=100, learning_rate='adaptive')

In [49]:
pred_train_ftnn = grid_search_nnft.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_ftnn)))
print(r2_score(y_train, pred_train_ftnn))

4121.430066082736
0.9171974011982078


In [51]:
pred_test_ftnn= grid_search_nnft.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_ftnn)))
print(r2_score(y_test, pred_test_ftnn))

4160.652631158381
0.9156454726604597


In [52]:
#display adjusted R-squared
1 - (1-grid_search_nnft.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

-2083741.2246898168

In [53]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_ftnn)

0.18548228518592125

In [54]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_ftnn)

0.1842300620468434

<b> Fine Tune Random Forest </b>

<b> Have updated the value of X_train, X_test from scaled to non-scaled for Random Forest, GradientBoost and XGBoost </b>

In [68]:
# setting parameters for Grid Search, to find the best combinations
param_grid_search = {
  'max_depth': [5,6,8,10],
    'max_features': [ 5,8,10,12],
    'min_samples_leaf': [8,10,12],
    'min_samples_split': [400,450,500], 
    'n_estimators': [150,250,300],
    'random_state':[1,30,42]
}

rf_ft = RandomForestRegressor()

grid_search_rfft = GridSearchCV(estimator = rf_ft , param_grid = param_grid_search, cv = 10)

In [69]:
grid_search_rfft.fit(X_train, y_train)
print(grid_search_rfft.best_params_)
best_grid_ftrf = grid_search_rfft.best_estimator_
best_grid_ftrf

{'max_depth': 8, 'max_features': 8, 'min_samples_leaf': 10, 'min_samples_split': 500, 'n_estimators': 150, 'random_state': 1}


RandomForestRegressor(max_depth=8, max_features=8, min_samples_leaf=10,
                      min_samples_split=500, n_estimators=150, random_state=1)

In [70]:
pred_train_ftrf = grid_search_rfft.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_ftrf)))
print(r2_score(y_train, pred_train_ftrf))

3588.604078189293
0.9372231976603409


In [71]:
pred_test_ftrf= grid_search_rfft.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_ftrf)))
print(r2_score(y_test, pred_test_ftrf))

3648.8366635049065
0.9351224672735148


In [73]:
#display adjusted R-squared
1 - (1-grid_search_rfft.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

0.936647183682368

In [74]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_ftrf)

0.15518115353934045

In [75]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_ftrf)

0.1577720888690866

<b> Fine Tune Gradient Boosting </b>

In [85]:
# setting parameters for Grid Search, to find the best combinations
param_grid_search = {
        'max_depth': [8,10,12],
        'max_features': [5,8,10,12],
        'min_samples_leaf': [8,10,15],
        'min_samples_split': [350, 400, 450,500], 
        'n_estimators': [150,200,250],
        'random_state':[1,30,42],
          'learning_rate': [0.01,0.001,0.05],
          'loss': ['ls','huber']
}

gbr_ft = GradientBoostingRegressor()

grid_search_gbrft = GridSearchCV(estimator = gbr_ft , param_grid = param_grid_search, cv = 10)

In [86]:
grid_search_gbrft.fit(X_train, y_train)
print(grid_search_gbrft.best_params_)
best_grid_ftgbr = grid_search_gbrft.best_estimator_
best_grid_ftgbr

{'learning_rate': 0.001, 'loss': 'ls', 'max_depth': 10, 'max_features': 8, 'min_samples_leaf': 10, 'min_samples_split': 450, 'n_estimators': 200, 'random_state': 42}


GradientBoostingRegressor(learning_rate=0.001, max_depth=10, max_features=8,
                          min_samples_leaf=10, min_samples_split=450,
                          n_estimators=200, random_state=42)

In [87]:
pred_train_ftgbr = grid_search_gbrft.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_ftgbr)))
print(r2_score(y_train, pred_train_ftgbr))

11991.597344888196
0.29902557637960736


In [88]:
pred_test_ftgbr= grid_search_gbrft.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_ftgbr)))
print(r2_score(y_test, pred_test_ftgbr))

12007.65930208333
0.2974096292320564


In [89]:
#display adjusted R-squared
1 - (1-grid_search_gbrft.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

0.29805990603392174

In [90]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_ftgbr)

0.6699760153443384

In [91]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_ftgbr)

0.6703263522541194

<b> Fine Tune XGBoost </b>

In [128]:
# setting parameters for Grid Search, to find the best combinations
param_grid_search = {
        'max_depth': [6,8,10],
        'colsample_bytree': [0.3,0.5,1],
        'gamma': [350,400,450,500], 
        'tree_method': ['auto','approx', 'hist'],
        'random_state':[1,42,30],
        'learning_rate': [0.001,0.01,0.3]
          
}

xgb_ft = xgb.XGBRegressor()

grid_search_xgbft = GridSearchCV(estimator = xgb_ft , param_grid = param_grid_search, cv = 10)

In [129]:
grid_search_xgbft.fit(X_train, y_train)
print(grid_search_xgbft.best_params_)
best_grid_ftxgb = grid_search_xgbft.best_estimator_
best_grid_ftxgb

{'colsample_bytree': 0.3, 'gamma': 400, 'learning_rate': 0.3, 'max_depth': 6, 'random_state': 42, 'tree_method': 'auto'}


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, enable_categorical=False,
             gamma=400, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.3, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='auto', validate_parameters=1, verbosity=None)

In [130]:
pred_train_ftxgb = grid_search_xgbft.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_ftxgb)))
print(r2_score(y_train, pred_train_ftxgb))

2513.2017390800197
0.9692104883439813


In [131]:
pred_test_ftxgb= grid_search_xgbft.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_ftxgb)))
print(r2_score(y_test, pred_test_ftxgb))

3229.562848293763
0.949175518089095


In [132]:
#display adjusted R-squared
1 - (1-grid_search_xgbft.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)

0.9641716754549855

In [133]:
#display MAPE for Train dataset
mean_absolute_percentage_error(y_train, pred_train_ftxgb)

0.0980057501282517

In [134]:
#display MAPE for Test dataset
mean_absolute_percentage_error(y_test, pred_test_ftxgb)

0.13100370578649226

In [141]:
# Getting the important features from the XGBoost fine tuned Model
grid_search_xgbft.best_estimator_.feature_importances_

array([0.0037047 , 0.05291879, 0.0126059 , 0.00198352, 0.0018668 ,
       0.00132097, 0.0023176 , 0.00304388, 0.00285855, 0.0012398 ,
       0.00141989, 0.00240671, 0.00490852, 0.00365709, 0.6787945 ,
       0.01669851, 0.00286816, 0.00310947, 0.19950473, 0.00277184],
      dtype=float32)