# HistGradientBoostingRegressor

In [149]:
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.preprocessing import Normalizer, QuantileTransformer, PowerTransformer
from feature_engine.transformation import LogTransformer, LogCpTransformer, ReciprocalTransformer
from feature_engine.transformation import PowerTransformer, BoxCoxTransformer, YeoJohnsonTransformer
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from feature_engine.discretisation import DecisionTreeDiscretiser

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from feature_engine.encoding import CountFrequencyEncoder, MeanEncoder, WoEEncoder, PRatioEncoder
from feature_engine.encoding import DecisionTreeEncoder, RareLabelEncoder
from category_encoders import BackwardDifferenceEncoder, BaseNEncoder, CatBoostEncoder
from category_encoders import GLMMEncoder, HashingEncoder, HelmertEncoder, JamesSteinEncoder
from category_encoders import LeaveOneOutEncoder, PolynomialEncoder, SumEncoder

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV

from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from math import sqrt
from feature_engine.selection import RecursiveFeatureElimination, RecursiveFeatureAddition

In [59]:
df = pd.read_csv('ObesityDataSet copy.csv')

In [60]:
df['BMI'] = df['Weight']/(df['Height']**2)

In [61]:
df.drop(26, inplace = True)

In [62]:
fixed_df = df.drop(['Height', 'Weight', 'NObeyesdad'], axis = 1)

# Creating A Scaler

In [11]:
class ScalerSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, scaler = StandardScaler()):
        super().__init__()
        self.scaler = scaler
    
    def fit(self, X, y = None):
        return self.scaler.fit(X)
    
    def transform(self, X, y = None):
        return self.scaler.transform(X)

In [12]:
class EncoderSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, encoder = OneHotEncoder(handle_unknown = 'ignore')):
        super().__init__()
        self.encoder = encoder
    
    def fit(self, X, y = None):
        return self.encoder.fit(X)
    
    def transform(self, X, y = None):
        return self.encoder.transform(X)

# Train Test Split

In [63]:
X = fixed_df.drop(['BMI'], axis = 1)
y = fixed_df['BMI']

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [65]:
cat = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
cont = ['Age', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

# Pipeline

In [66]:
col_tran = ColumnTransformer(
    [("scaler", ScalerSelector(), cont),
     ("encoder", EncoderSelector(), cat)])

In [67]:
hgbr_pipe = Pipeline([
    ('col_tran', col_tran),
    ('hgbt', HistGradientBoostingRegressor(warm_start = True, random_state = 0))
])

# Grid Search for Scaler

In [68]:
params_1 = {'col_tran__scaler': [StandardScaler(), MinMaxScaler(), MaxAbsScaler(), RobustScaler(),
                                Normalizer(), QuantileTransformer(), PowerTransformer(), LogTransformer(),
                                LogCpTransformer(), ReciprocalTransformer(), PowerTransformer(), BoxCoxTransformer(),
                                YeoJohnsonTransformer(), EqualFrequencyDiscretiser(), EqualWidthDiscretiser(), 
                                 DecisionTreeDiscretiser()]}

In [69]:
hrbr_gs_1 = GridSearchCV(hgbr_pipe,
                        params_1,
                        scoring = 'neg_mean_squared_error',
                        n_jobs = -1,
                        cv = 8, 
                        verbose = 1)

In [70]:
hrbr_gs_1.fit(X_train, y_train)

Fitting 8 folds for each of 16 candidates, totalling 128 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   46.9s
[Parallel(n_jobs=-1)]: Done 128 out of 128 | elapsed:  1.6min finished


GridSearchCV(cv=8,
             estimator=Pipeline(steps=[('col_tran',
                                        ColumnTransformer(transformers=[('scaler',
                                                                         ScalerSelector(),
                                                                         ['Age',
                                                                          'FCVC',
                                                                          'NCP',
                                                                          'CH2O',
                                                                          'FAF',
                                                                          'TUE']),
                                                                        ('encoder',
                                                                         EncoderSelector(),
                                                                         ['Gender',
     

In [71]:
hrbr_gs_1.best_score_

-9.14394708232836

In [72]:
hrbr_gs_1.best_params_

{'col_tran__scaler': StandardScaler()}

In [76]:
hrbr_gs_1.best_estimator_

Pipeline(steps=[('col_tran',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  ['Age', 'FCVC', 'NCP', 'CH2O',
                                                   'FAF', 'TUE']),
                                                 ('encoder', EncoderSelector(),
                                                  ['Gender',
                                                   'family_history_with_overweight',
                                                   'FAVC', 'CAEC', 'SMOKE',
                                                   'SCC', 'CALC',
                                                   'MTRANS'])])),
                ('hgbt',
                 HistGradientBoostingRegressor(random_state=0,
                                               warm_start=True))])

# Grid Search for Standard Scaler & Encoder

In [77]:
hgbr_pipe_2 = hrbr_gs_1.best_estimator_

In [78]:
params_2 = {'col_tran__scaler__with_mean': [True, False],
            'col_tran__scaler__with_std': [True, False],
           'col_tran__encoder':[OneHotEncoder(handle_unknown = 'ignore'), LabelEncoder(), OrdinalEncoder(),
                               CountFrequencyEncoder(), MeanEncoder(), WoEEncoder(), PRatioEncoder(),
                               DecisionTreeEncoder(), RareLabelEncoder(), BackwardDifferenceEncoder(), 
                                BaseNEncoder(), CatBoostEncoder(), GLMMEncoder(), HashingEncoder(),
                               HelmertEncoder(), JamesSteinEncoder(), LeaveOneOutEncoder(), PolynomialEncoder(),
                               SumEncoder()]}

In [79]:
hrbr_gs_2 = GridSearchCV(hgbr_pipe_2,
                        params_2,
                        scoring = 'neg_mean_squared_error',
                        n_jobs = -1,
                        cv = 8, 
                        verbose = 1)

In [80]:
hrbr_gs_2.fit(X_train, y_train)

Fitting 8 folds for each of 76 candidates, totalling 608 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 608 out of 608 | elapsed:  9.0min finished


GridSearchCV(cv=8,
             estimator=Pipeline(steps=[('col_tran',
                                        ColumnTransformer(transformers=[('scaler',
                                                                         StandardScaler(),
                                                                         ['Age',
                                                                          'FCVC',
                                                                          'NCP',
                                                                          'CH2O',
                                                                          'FAF',
                                                                          'TUE']),
                                                                        ('encoder',
                                                                         EncoderSelector(),
                                                                         ['Gender',
     

In [81]:
hrbr_gs_2.best_score_

-9.021023398907372

In [82]:
hrbr_gs_2.best_params_

{'col_tran__encoder': HelmertEncoder(),
 'col_tran__scaler__with_mean': True,
 'col_tran__scaler__with_std': True}

In [83]:
hrbr_gs_2.best_estimator_

Pipeline(steps=[('col_tran',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  ['Age', 'FCVC', 'NCP', 'CH2O',
                                                   'FAF', 'TUE']),
                                                 ('encoder', HelmertEncoder(),
                                                  ['Gender',
                                                   'family_history_with_overweight',
                                                   'FAVC', 'CAEC', 'SMOKE',
                                                   'SCC', 'CALC',
                                                   'MTRANS'])])),
                ('hgbt',
                 HistGradientBoostingRegressor(random_state=0,
                                               warm_start=True))])

# Grid Search on The HistGradientBoostingRegressor Loss & Early Stopping

In [84]:
hgbr_pipe_3 = hrbr_gs_2.best_estimator_

In [85]:
params_3 = {'hgbt__loss': ['least_squares', 'least_absolute_deviation', 'poisson'],
            'hgbt__early_stopping': ['auto', True, False]}

In [86]:
hgbr_gs_3 = GridSearchCV(hgbr_pipe_3,
                        params_3,
                        scoring = 'neg_mean_squared_error',
                        n_jobs = -1,
                        cv = 8, 
                        verbose = 1)

In [87]:
hgbr_gs_3.fit(X_train, y_train)

Fitting 8 folds for each of 9 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   56.8s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  1.6min finished


GridSearchCV(cv=8,
             estimator=Pipeline(steps=[('col_tran',
                                        ColumnTransformer(transformers=[('scaler',
                                                                         StandardScaler(),
                                                                         ['Age',
                                                                          'FCVC',
                                                                          'NCP',
                                                                          'CH2O',
                                                                          'FAF',
                                                                          'TUE']),
                                                                        ('encoder',
                                                                         HelmertEncoder(),
                                                                         ['Gender',
      

In [88]:
hgbr_gs_3.best_score_

-8.988745226149934

In [89]:
hgbr_gs_3.best_params_

{'hgbt__early_stopping': 'auto', 'hgbt__loss': 'poisson'}

In [90]:
hgbr_gs_3.best_estimator_

Pipeline(steps=[('col_tran',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  ['Age', 'FCVC', 'NCP', 'CH2O',
                                                   'FAF', 'TUE']),
                                                 ('encoder', HelmertEncoder(),
                                                  ['Gender',
                                                   'family_history_with_overweight',
                                                   'FAVC', 'CAEC', 'SMOKE',
                                                   'SCC', 'CALC',
                                                   'MTRANS'])])),
                ('hgbt',
                 HistGradientBoostingRegressor(loss='poisson', random_state=0,
                                               warm_start=True))])

# Grid Search on All Remaining HistGradientBoostingRegressor Parameters

In [91]:
hgbr_pipe_4 = hgbr_gs_3.best_estimator_

In [95]:
params_4 = {'hgbt__learning_rate': [0.01, 0.1, 0.2],
            'hgbt__max_iter': [50, 100, 150],
           'hgbt__max_leaf_nodes': [30, 31, 32],
           'hgbt__max_depth': [None, 5, 10],
           'hgbt__min_samples_leaf': [19, 20, 21]}

In [96]:
hgbr_gs_4 = GridSearchCV(hgbr_pipe_4,
                        params_4,
                        scoring = 'neg_mean_squared_error',
                        n_jobs = -1,
                        cv = 8, 
                        verbose = 1)

In [97]:
hgbr_gs_4.fit(X_train, y_train)

Fitting 8 folds for each of 243 candidates, totalling 1944 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   40.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 26.0min
[Parallel(n_jobs=-1)]: Done 1944 out of 1944 | elapsed: 27.7min finished


GridSearchCV(cv=8,
             estimator=Pipeline(steps=[('col_tran',
                                        ColumnTransformer(transformers=[('scaler',
                                                                         StandardScaler(),
                                                                         ['Age',
                                                                          'FCVC',
                                                                          'NCP',
                                                                          'CH2O',
                                                                          'FAF',
                                                                          'TUE']),
                                                                        ('encoder',
                                                                         HelmertEncoder(),
                                                                         ['Gender',
      

In [98]:
hgbr_gs_4.best_score_

-8.78159933608128

In [99]:
hgbr_gs_4.best_params_

{'hgbt__learning_rate': 0.2,
 'hgbt__max_depth': 10,
 'hgbt__max_iter': 150,
 'hgbt__max_leaf_nodes': 30,
 'hgbt__min_samples_leaf': 19}

In [100]:
hgbr_gs_4.best_estimator_

Pipeline(steps=[('col_tran',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  ['Age', 'FCVC', 'NCP', 'CH2O',
                                                   'FAF', 'TUE']),
                                                 ('encoder', HelmertEncoder(),
                                                  ['Gender',
                                                   'family_history_with_overweight',
                                                   'FAVC', 'CAEC', 'SMOKE',
                                                   'SCC', 'CALC',
                                                   'MTRANS'])])),
                ('hgbt',
                 HistGradientBoostingRegressor(learning_rate=0.2,
                                               loss='poisson', max_depth=10,
                                               max_iter=150, max_leaf_nodes=30,
                                               min_samples_leaf=

# Continuation of Grid Search on All Remaining HistGradientBoostingRegressor Parameters

In [101]:
hgbr_pipe_5 = hgbr_gs_4.best_estimator_

In [102]:
params_5 = {'hgbt__learning_rate': [0.2, 0.3, 0.4],
            'hgbt__max_iter': [150, 200, 250],
           'hgbt__max_leaf_nodes': [28, 29, 30],
           'hgbt__max_depth': [10, 15, 20],
           'hgbt__min_samples_leaf': [17, 18, 19],
           'hgbt__l2_regularization': [0, 0.1, 0.2]}

In [103]:
hgbr_gs_5 = GridSearchCV(hgbr_pipe_5,
                        params_5,
                        scoring = 'neg_mean_squared_error',
                        n_jobs = -1,
                        cv = 8, 
                        verbose = 1)

In [104]:
hgbr_gs_5.fit(X_train, y_train)

Fitting 8 folds for each of 729 candidates, totalling 5832 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   52.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 23.9min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 37.9min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 49.6min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 71.4min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 93.4min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 113.6min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 135.8min
[Parallel(n_jobs=-1)]: Done 5832 out of 5832 | elapsed: 156.7min finished


GridSearchCV(cv=8,
             estimator=Pipeline(steps=[('col_tran',
                                        ColumnTransformer(transformers=[('scaler',
                                                                         StandardScaler(),
                                                                         ['Age',
                                                                          'FCVC',
                                                                          'NCP',
                                                                          'CH2O',
                                                                          'FAF',
                                                                          'TUE']),
                                                                        ('encoder',
                                                                         HelmertEncoder(),
                                                                         ['Gender',
      

In [105]:
hgbr_gs_5.best_score_

-8.752602839168855

In [106]:
hgbr_gs_5.best_params_

{'hgbt__l2_regularization': 0.1,
 'hgbt__learning_rate': 0.2,
 'hgbt__max_depth': 20,
 'hgbt__max_iter': 150,
 'hgbt__max_leaf_nodes': 30,
 'hgbt__min_samples_leaf': 19}

In [107]:
hgbr_gs_5.best_estimator_

Pipeline(steps=[('col_tran',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  ['Age', 'FCVC', 'NCP', 'CH2O',
                                                   'FAF', 'TUE']),
                                                 ('encoder', HelmertEncoder(),
                                                  ['Gender',
                                                   'family_history_with_overweight',
                                                   'FAVC', 'CAEC', 'SMOKE',
                                                   'SCC', 'CALC',
                                                   'MTRANS'])])),
                ('hgbt',
                 HistGradientBoostingRegressor(l2_regularization=0.1,
                                               learning_rate=0.2,
                                               loss='poisson', max_depth=20,
                                               max_iter=150, max_leaf_node

# Continuation of Grid Search on All Remaining HistGradientBoostingRegressor Parameters

In [111]:
hgbr_pipe_6 = hgbr_gs_5.best_estimator_

In [109]:
params_6 = {'hgbt__max_depth': [20, 25, 30],
           'hgbt__l2_regularization': [0.01, 0.05, 0.1],
           'hgbt__max_bins': [200, 255, 300],
           'hgbt__scoring': ['loss', 'neg_mean_squared_error'],
           'hgbt__validation_fraction': [0.01, 0.1, 0.2],
           'hgbt__n_iter_no_change': [5, 10, 15],
           'hgbt__tol': [1e-8, 1e-7, 1e-6]}

In [112]:
hgbr_gs_6 = GridSearchCV(hgbr_pipe_6,
                        params_6,
                        scoring = 'neg_mean_squared_error',
                        n_jobs = -1,
                        cv = 8, 
                        verbose = 1)

In [113]:
hgbr_gs_6.fit(X_train, y_train)

Fitting 8 folds for each of 1458 candidates, totalling 11664 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 21.4min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 29.6min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 32.3min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 35.0min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 45.3min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 57.4min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 63.5min
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed: 71.6min
[Parallel(n_jobs=-1)]: Done 9792 tasks      | elapsed: 87.1min
[Parallel(n_jobs=-1)]: Done 11242 tasks      |

GridSearchCV(cv=8,
             estimator=Pipeline(steps=[('col_tran',
                                        ColumnTransformer(transformers=[('scaler',
                                                                         StandardScaler(),
                                                                         ['Age',
                                                                          'FCVC',
                                                                          'NCP',
                                                                          'CH2O',
                                                                          'FAF',
                                                                          'TUE']),
                                                                        ('encoder',
                                                                         HelmertEncoder(),
                                                                         ['Gender',
      

In [114]:
hgbr_gs_6.best_score_

-8.750890576256486

In [115]:
hgbr_gs_6.best_params_

{'hgbt__l2_regularization': 0.1,
 'hgbt__max_bins': 255,
 'hgbt__max_depth': 25,
 'hgbt__n_iter_no_change': 5,
 'hgbt__scoring': 'loss',
 'hgbt__tol': 1e-08,
 'hgbt__validation_fraction': 0.01}

In [116]:
hgbr_gs_6.best_estimator_

Pipeline(steps=[('col_tran',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  ['Age', 'FCVC', 'NCP', 'CH2O',
                                                   'FAF', 'TUE']),
                                                 ('encoder', HelmertEncoder(),
                                                  ['Gender',
                                                   'family_history_with_overweight',
                                                   'FAVC', 'CAEC', 'SMOKE',
                                                   'SCC', 'CALC',
                                                   'MTRANS'])])),
                ('hgbt',
                 HistGradientBoostingRegressor(l2_regularization=0.1,
                                               learning_rate=0.2,
                                               loss='poisson', max_depth=25,
                                               max_iter=150, max_leaf_node

# Continuation of Grid Search on All Remaining HistGradientBoostingRegressor Parameters

In [117]:
hgbr_pipe_7 = hgbr_gs_6.best_estimator_

In [120]:
params_7 = {'hgbt__validation_fraction': [0.0001, 0.0005, 0.001, 0.005, 0.01,],
           'hgbt__n_iter_no_change': [1, 2, 3, 4, 5],
           'hgbt__tol': [1e-12, 1e-11, 1e-10, 1e-9, 1e-8]}

In [121]:
hgbr_gs_7 = GridSearchCV(hgbr_pipe_7,
                        params_7,
                        scoring = 'neg_mean_squared_error',
                        n_jobs = -1,
                        cv = 8, 
                        verbose = 1)

In [122]:
hgbr_gs_7.fit(X_train, y_train)

Fitting 8 folds for each of 125 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 20.7min finished


GridSearchCV(cv=8,
             estimator=Pipeline(steps=[('col_tran',
                                        ColumnTransformer(transformers=[('scaler',
                                                                         StandardScaler(),
                                                                         ['Age',
                                                                          'FCVC',
                                                                          'NCP',
                                                                          'CH2O',
                                                                          'FAF',
                                                                          'TUE']),
                                                                        ('encoder',
                                                                         HelmertEncoder(),
                                                                         ['Gender',
      

In [123]:
hgbr_gs_7.best_score_

-8.750890576256486

In [124]:
hgbr_gs_7.best_params_

{'hgbt__n_iter_no_change': 1,
 'hgbt__tol': 1e-12,
 'hgbt__validation_fraction': 0.0001}

In [125]:
hgbr_gs_7.best_estimator_

Pipeline(steps=[('col_tran',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  ['Age', 'FCVC', 'NCP', 'CH2O',
                                                   'FAF', 'TUE']),
                                                 ('encoder', HelmertEncoder(),
                                                  ['Gender',
                                                   'family_history_with_overweight',
                                                   'FAVC', 'CAEC', 'SMOKE',
                                                   'SCC', 'CALC',
                                                   'MTRANS'])])),
                ('hgbt',
                 HistGradientBoostingRegressor(l2_regularization=0.1,
                                               learning_rate=0.2,
                                               loss='poisson', max_depth=25,
                                               max_iter=150, max_leaf_node

# Evaluation of Final Model

In [131]:
hgbr_pipe_8 = hgbr_gs_7.best_estimator_

In [133]:
hgbr_pipe_8.score(X_train, y_train)

0.9900348802013096

In [134]:
hgbr_pipe_8.score(X_test, y_test)

0.8740563671385658

In [142]:
def rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted)) 

In [138]:
y_pred_train = hgbr_pipe_8.predict(X_train)
y_pred_test = hgbr_pipe_8.predict(X_test)

In [143]:
rmse(y_train, y_pred_train)

0.8003531307407434

In [144]:
rmse(y_test, y_pred_test)

2.833798100983141

In [145]:
def adj_r2(model, X, y):
    return 1 - (1 - model.score(X, y)) * (len(y) - 1)/(len(y) - X.shape[1] - 1)

In [146]:
adj_r2(hgbr_pipe_8, X_train, y_train)

0.9899458491373775

In [147]:
adj_r2(hgbr_pipe_8, X_test, y_test)

0.870619308931821