# GRADIENT BOOSTING AND RANDOM FOREST MODELS

### Importing relevant libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_validate, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, mean_squared_error, root_mean_squared_error

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor, XGBClassifier
from sklearn.tree import DecisionTreeRegressor

In [2]:
train_data = pd.read_csv(r'playground-series-s5e5\train.csv')
test_data = pd.read_csv(r'playground-series-s5e5\test.csv')

In [3]:
train_data.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [4]:
train_data.columns

Index(['id', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp', 'Calories'],
      dtype='object')

In [5]:
num_cols = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
cat_cols = ['Sex']

In [6]:
cat_encoder = OneHotEncoder(handle_unknown='ignore')

column_transformer = ColumnTransformer(
    [
        ('cat_encoding', cat_encoder, cat_cols),
        ('num_passthrough', 'passthrough', num_cols)
    ],
    remainder = 'drop',
    verbose_feature_names_out=False,
    sparse_threshold=0
)

In [7]:
X = column_transformer.fit_transform(train_data)
X

array([[  0. ,   1. ,  36. , ...,  26. , 101. ,  41. ],
       [  1. ,   0. ,  64. , ...,   8. ,  85. ,  39.7],
       [  1. ,   0. ,  51. , ...,   7. ,  84. ,  39.8],
       ...,
       [  0. ,   1. ,  60. , ...,  29. , 113. ,  40.9],
       [  0. ,   1. ,  45. , ...,  17. , 102. ,  40.3],
       [  1. ,   0. ,  39. , ...,  19. ,  97. ,  40.6]], shape=(750000, 8))

In [8]:
X.shape

(750000, 8)

In [9]:
train_data.describe(include = 'all')

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
count,750000.0,750000,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
unique,,2,,,,,,,
top,,female,,,,,,,
freq,,375721,,,,,,,
mean,374999.5,,41.420404,174.697685,75.145668,15.421015,95.483995,40.036253,88.282781
std,216506.495284,,15.175049,12.824496,13.982704,8.354095,9.449845,0.779875,62.395349
min,0.0,,20.0,126.0,36.0,1.0,67.0,37.1,1.0
25%,187499.75,,28.0,164.0,63.0,8.0,88.0,39.6,34.0
50%,374999.5,,40.0,174.0,74.0,15.0,95.0,40.3,77.0
75%,562499.25,,52.0,185.0,87.0,23.0,103.0,40.7,136.0


## Bagging Regressor

In [10]:
bagging_regressor_model = BaggingRegressor(
    estimator = DecisionTreeRegressor(),
    n_estimators=300,
    bootstrap=True,
    max_features= 1,
    max_samples=1,
    random_state = 0
)

bagging_regressor_pipeline = Pipeline(
    [
        ('preprocessing', column_transformer),
        ('modeling', bagging_regressor_model)
    ]
)

cv = KFold(n_splits=5, shuffle = True, random_state = 0)
bagging_regressor_rmse = make_scorer(root_mean_squared_error, greater_is_better=False)

bagging_regressor_scores = cross_validate(
    estimator = bagging_regressor_pipeline,
    X = train_data,
    y = train_data['Calories'],
    cv = cv,
    scoring = bagging_regressor_rmse,
    return_estimator=True,
    return_train_score=True
)

In [11]:
np.mean(bagging_regressor_scores['test_score'])

np.float64(-62.50458351075499)

## Random Forests model

In [12]:
random_forest_model = RandomForestRegressor(
    n_estimators=300,
    min_samples_leaf=3,
    random_state = 0
)

random_forest_pipeline = Pipeline(
    [
        ('preprocessing', column_transformer),
        ('modeling', random_forest_model)
    ]
)

cv = KFold(n_splits=5, shuffle = True, random_state = 0)
random_forest_rmse = make_scorer(root_mean_squared_error, greater_is_better=False)

random_forest_scores = cross_validate(
    estimator = random_forest_pipeline,
    X = train_data,
    y = train_data['Calories'],
    cv = cv,
    scoring = random_forest_rmse,
    return_estimator=True,
    return_train_score=True
)

In [13]:
np.mean(random_forest_scores['test_score'])

np.float64(-3.741669526801928)

In [14]:
np.mean(random_forest_scores['fit_time'])

np.float64(683.2878516197204)

## Gradient Boosting

In [15]:
gradient_boosting_model = GradientBoostingRegressor(
    n_estimators=300,
    min_samples_leaf=3,
    max_depth=4,
    learning_rate=0.1,
    random_state = 0
)

gradient_boosting_pipeline = Pipeline(
    [
        ('preprocessing', column_transformer),
        ('modeling', gradient_boosting_model)
    ]
)

cv = KFold(n_splits=5, shuffle = True, random_state = 0)
gradient_boosting_rmse = make_scorer(root_mean_squared_error, greater_is_better=False)

gradient_boosting_scores = cross_validate(
    estimator = gradient_boosting_pipeline,
    X = train_data,
    y = train_data['Calories'],
    cv = cv,
    scoring = gradient_boosting_rmse,
    return_estimator=True,
    return_train_score=True
)

In [16]:
np.mean(gradient_boosting_scores['test_score'])

np.float64(-3.7786591404846077)

## Gradient boosting with early stopping

In [17]:
gradient_boosting_model2 = GradientBoostingRegressor(
    n_estimators=1000,
    min_samples_leaf=3,
    max_depth=4,
    learning_rate=0.1,
    random_state = 0,
    validation_fraction=0.2,
    n_iter_no_change=10
)

gradient_boosting_pipeline2 = Pipeline(
    [
        ('preprocessing', column_transformer),
        ('modeling', gradient_boosting_model2)
    ]
)

cv = KFold(n_splits=5, shuffle = True, random_state = 0)
gradient_boosting_rmse2 = make_scorer(root_mean_squared_error, greater_is_better=False)

gradient_boosting_scores = cross_validate(
    estimator = gradient_boosting_pipeline2,
    X = train_data,
    y = train_data['Calories'],
    cv = cv,
    scoring = gradient_boosting_rmse2,
    return_estimator=True,
    return_train_score=True
)

In [18]:
np.mean(gradient_boosting_scores['test_score'])

np.float64(-3.6776787201731573)

## Saving model for productionization

#### we used the gradient boosting model for good results with early stopping to proevent over fitting

In [19]:
import joblib

In [20]:
estimator = gradient_boosting_scores['estimator'][0]

In [24]:
joblib.dump(estimator, 'calorie_model.pkl')

['calorie_model.pkl']

In [23]:
gradient_boosting_scores['test_score']

array([-3.62349284, -3.71092553, -3.70255619, -3.68134316, -3.67007588])