**Gradient Boosting Regressor**

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold, GridSearchCV
# from xgboost import XGBRegressor, QuantileDMatrix

In [2]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, mean_absolute_percentage_error
from pandas import DataFrame, Series
def cross_val_metrics_calculate(model, X: DataFrame, y: Series, splits, metrics=('mse', 'rmse', 'mae', 'mape')):
    """Fit ``model`` on each fold of ``splits`` and return fold-averaged error metrics.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        Refit in place on every fold, so after the call it holds the fit
        from the last training fold.
    X : DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : Series
        Target values aligned row-for-row with ``X``.
    splits : iterable of (train_index, test_index)
        Positional row indices per fold, e.g. ``KFold(...).split(X)``.
        Consumed exactly once, so a generator is fine.
    metrics : str or iterable of str
        Any of ``'mse'``, ``'rmse'``, ``'mae'``, ``'mape'``. A single name may
        be given as a plain string; repeated names are counted once.

    Returns
    -------
    dict
        Maps each requested metric name to its mean over all folds.

    Raises
    ------
    ValueError
        If an unsupported metric name is requested, or ``splits`` yields no folds.
    """
    supported = ('mse', 'rmse', 'mae', 'mape')
    if isinstance(metrics, str):
        # A bare string would otherwise be iterated character by character.
        metrics = [metrics]

    # Dict keys de-duplicate the names, so each total is averaged exactly once.
    totals = {name: 0.0 for name in metrics}
    unknown = [name for name in totals if name not in supported]
    if unknown:
        # Fail loudly instead of silently reporting 0.0 for a metric never computed.
        raise ValueError(f"Unsupported metric(s) {unknown}; choose from {list(supported)}")

    n_folds = 0
    for train_index, test_index in splits:
        n_folds += 1
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        if 'mse' in totals:
            totals['mse'] += mean_squared_error(y_test, y_pred)
        if 'rmse' in totals:
            totals['rmse'] += root_mean_squared_error(y_test, y_pred)
        if 'mae' in totals:
            totals['mae'] += mean_absolute_error(y_test, y_pred)
        if 'mape' in totals:
            totals['mape'] += mean_absolute_percentage_error(y_test, y_pred)

    if n_folds == 0:
        raise ValueError("splits produced no folds; there is nothing to average")
    return {name: total / n_folds for name, total in totals.items()}

In [3]:
# Read the training table, then split it column-wise: the first column is
# dropped, the last column is the target, everything in between is a feature.
data = pd.read_csv("data/train_data.csv")
X, y = data.iloc[:, 1:-1], data.iloc[:, -1]

In [4]:
# Keep the feature column labels; the importance printouts further down use them.
feature_names = X.columns
feature_names

Index(['Area (m2)', 'Property Type', 'Bedrooms', 'Bathrooms', 'Address',
       'Law Document', 'Quarter', 'Year', 'Latitude', 'Longitude',
       'Postal Code', 'Importance', 'Place Rank', 'City'],
      dtype='object')

In [5]:
# Shared splitter for the manual cross-validation calls in later cells.
n_folds = 5
kfold = KFold(n_splits=n_folds)

In [22]:
# Exhaustive search over the boosting hyper-parameters. Every candidate is
# scored on both MAE and MAPE; the final model is refit using the best MAPE.
gb_search = GridSearchCV(
    estimator=GradientBoostingRegressor(),
    param_grid=dict(
        loss=['absolute_error'],
        n_estimators=[100, 200, 250, 300],
        learning_rate=[0.05, 0.1, 0.25, 0.5],
        max_depth=[3, 5, 7],
        max_features=['sqrt', 'log2'],
        n_iter_no_change=[3],
    ),
    scoring=[
        'neg_mean_absolute_error',
        'neg_mean_absolute_percentage_error',
    ],
    cv=5,
    refit='neg_mean_absolute_percentage_error',
)

gb_search.fit(X, y)

In [23]:
# Show the parameter combination the search selected for the refit metric.
gb_search.best_params_

{'learning_rate': 0.05,
 'loss': 'absolute_error',
 'max_depth': 3,
 'max_features': 'log2',
 'n_estimators': 100,
 'n_iter_no_change': 3}

In [7]:
# Hyper-parameters copied from gb_search.best_params_ so this cell can run
# without repeating the grid search (alternative: gb_search.best_estimator_).
gb = GradientBoostingRegressor(
    loss='absolute_error',
    learning_rate=0.05,
    n_estimators=100,
    max_depth=3,
    max_features='log2',
    n_iter_no_change=3,
)

# Average the error metrics over the shared K-fold splits.
cv_results = cross_val_metrics_calculate(model=gb, X=X, y=y, splits=kfold.split(X))
print(cv_results)

{'mse': 575.3381666761496, 'rmse': 23.83952876400973, 'mae': 5.877483838263438, 'mape': 1.9230780091012671}


In [8]:
# Refit on the full data set, then list every feature next to its importance.
gb.fit(X, y)
for name, importance in zip(feature_names, gb.feature_importances_):
  print(f"{name}: {importance}")

Area (m2): 0.27556581232191796
Property Type: 0.24014039798424405
Bedrooms: 0.14619465580161328
Bathrooms: 0.06811812623020393
Address: 0.007878508245927038
Law Document: 0.013213829583237663
Quarter: 0.008415449673795402
Year: 0.07450680390453093
Latitude: 0.0623294128569212
Longitude: 0.0686762544446676
Postal Code: 0.028377837988310447
Importance: 0.0025504617337442706
Place Rank: 0.0011191676308018187
City: 0.002913281600084314


- Compared to SVM: higher MAPE (1.92 vs 1.58) but lower MAE (5.88 vs 7) -> generalizes better across lower-priced and higher-priced estates.
- Area and property type have high importance, along with the numbers of bedrooms and bathrooms.

In [9]:
import pickle

# Persist the fitted model. The context manager closes (and therefore flushes)
# the file even if pickling raises, instead of leaving the handle open.
# NOTE: despite the .h5 extension the payload is a pickle; only unpickle it
# from a trusted source.
with open("models/GradientBoosting.h5", 'wb') as model_file:
    pickle.dump(gb, model_file)

In [28]:
# Repeat the grid search on standardized features.
# NOTE(review): `gb_search` is passed in as-is (not cloned), so fitting this
# pipeline appears to re-fit that same object and replace its earlier results;
# later uses of `gb_search.best_estimator_` would then refer to this run — confirm.
# NOTE(review): the scaler sits outside the search, so it is fit on all rows
# before the search makes its own CV splits.
gb_search_s = make_pipeline(StandardScaler(), gb_search)

gb_search_s.fit(X, y)

In [29]:
# Show the parameters selected by the grid search inside the standardizing pipeline.
gb_search_s.named_steps['gridsearchcv'].best_params_

{'learning_rate': 0.05,
 'loss': 'absolute_error',
 'max_depth': 3,
 'max_features': 'log2',
 'n_estimators': 100,
 'n_iter_no_change': 3}

In [30]:
# Chain the scaler and the tuned regressor so that each cross-validation fold
# fits both of them on that fold's training rows only.
gb_with_standardize = make_pipeline(
    StandardScaler(),
    gb_search.best_estimator_,
)

cv_results = cross_val_metrics_calculate(
    model=gb_with_standardize, X=X, y=y, splits=kfold.split(X)
)
print(cv_results)

{'mse': 588.2626110063326, 'rmse': 24.10705561606168, 'mae': 6.266169650612879, 'mape': 1.844347691117753}


In [31]:
# Refit the whole pipeline on all rows, then list every feature next to the
# importance reported by the regressor step.
gb_with_standardize.fit(X, y)
for name, importance in zip(
    feature_names,
    gb_with_standardize.named_steps['gradientboostingregressor'].feature_importances_,
):
  print(f"{name}: {importance}")

Area (m2): 0.026383689854869972
Property Type: 0.30031827619900486
Bedrooms: 0.20805857227441904
Bathrooms: 0.1262688771964175
Address: 0.012339813355326107
Law Document: 0.016024898994541275
Quarter: 0.014136506474814537
Year: 0.11242723013327001
Latitude: 0.07271069917471656
Longitude: 0.07117988687231458
Postal Code: 0.028452039443069125
Importance: 0.0018469601494460518
Place Rank: 0.002358818814498905
City: 0.007493731063291425


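With standardized data:
- MAE increases (5.88 -> 6.27) but MAPE decreases (1.92 -> 1.84).
- Property type and the numbers of bedrooms and bathrooms gain importance, while area drops sharply (0.28 -> 0.03).
- Caveat: tree splits are unaffected by per-feature scaling and no `random_state` is set, so these shifts may come from run-to-run randomness (e.g. `max_features='log2'` feature sampling) rather than from standardization itself.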

In [32]:
# Persist the fitted scaler + regressor pipeline. The context manager closes
# (and therefore flushes) the file even if pickling raises.
# NOTE: despite the .h5 extension the payload is a pickle; only unpickle it
# from a trusted source.
with open("GradientBoosting.h5", 'wb') as model_file:
    pickle.dump(gb_with_standardize, model_file)

*Gradient Boosting using XGBoost*

In [14]:
# The notebook-level xgboost import is commented out, so import XGBRegressor
# here; otherwise this cell raises NameError on a fresh kernel.
from xgboost import XGBRegressor

# Grid search over XGBoost hyper-parameters. Every candidate is scored on both
# MAE and MAPE; the final model is refit using the best MAE.
# NOTE(review): device='cuda' presumably needs a GPU runtime — confirm before
# running on a CPU-only machine.
xgb_search = GridSearchCV(XGBRegressor(booster='gbtree', tree_method='hist',
                                       eval_metric=mean_absolute_error,
                                       device='cuda'),
                         param_grid={'n_estimators':[300,400,500],
                                     'learning_rate':[0.01, 0.025, 0.05, 0.1, 0.25],
                                     'max_depth':[7,10,12]},
                         scoring=['neg_mean_absolute_error',
                                  'neg_mean_absolute_percentage_error'],
                         cv=5,
                         refit='neg_mean_absolute_error')

xgb_search.fit(X, y)

In [15]:
# Take the model the XGBoost grid search refit with its best parameters and
# average its error metrics over the shared K-fold splits.
xgb = xgb_search.best_estimator_

cv_results = cross_val_metrics_calculate(
    model=xgb, X=X, y=y, splits=kfold.split(X)
)
print(cv_results)

{'mse': 496.567111526054, 'rmse': 22.02206413616195, 'mae': 5.159781568928723, 'mape': 2.5358050310274587}


In [16]:
# Refit on the full data set, then list every feature next to its importance.
xgb.fit(X, y)

for name, importance in zip(feature_names, xgb.feature_importances_):
  print(f"{name}: {importance}")

Area (m2): 0.04499581456184387
Property Type: 0.04802629351615906
Bedrooms: 0.02795402891933918
Bathrooms: 0.025360649451613426
Address: 0.025616994127631187
Law Document: 0.04113392159342766
Quarter: 0.026327094063162804
Year: 0.026959847658872604
Latitude: 0.04017285257577896
Longitude: 0.06001024320721626
Postal Code: 0.04385070502758026
Importance: 0.010864345356822014
Place Rank: 0.4750921428203583
City: 0.10363508015871048


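- Not uniformly worse than sklearn GB: MAPE is higher (2.54 vs 1.92), but MAE (5.16 vs 5.88) and RMSE (22.0 vs 23.8) are lower. Note that the XGBoost search refits on MAE while the sklearn search refits on MAPE.
- Place rank has importance 0.475 -> is the output heavily dependent on place rank?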

In [17]:
# Write the fitted XGBoost model to XGBoostRegressor.json using the model's own save_model method.
xgb.save_model("XGBoostRegressor.json")

In [18]:
# Repeat the XGBoost grid search on standardized features.
# NOTE(review): `xgb_search` is passed in as-is (not cloned), so fitting this
# pipeline appears to re-fit that same object and replace its earlier results;
# later uses of `xgb_search.best_estimator_` would then refer to this run — confirm.
# NOTE(review): the scaler sits outside the search, so it is fit on all rows
# before the search makes its own CV splits.
xgb_search_s = make_pipeline(StandardScaler(), xgb_search)

xgb_search_s.fit(X, y)

In [20]:
# Chain the scaler and the tuned XGBoost model so that each cross-validation
# fold fits both of them on that fold's training rows only.
xgb_with_standardize = make_pipeline(
    StandardScaler(),
    xgb_search.best_estimator_,
)

cv_results = cross_val_metrics_calculate(
    model=xgb_with_standardize, X=X, y=y, splits=kfold.split(X)
)
print(cv_results)

{'mse': 489.7284955073684, 'rmse': 21.844304784468537, 'mae': 5.160538590975572, 'mape': 2.580785552748399}


In [21]:
# Refit the whole pipeline on all rows, then list every feature next to the
# importance reported by the XGBoost step.
xgb_with_standardize.fit(X, y)

for name, importance in zip(
    feature_names,
    xgb_with_standardize.named_steps['xgbregressor'].feature_importances_,
):
  print(f"{name}: {importance}")

Area (m2): 0.055799081921577454
Property Type: 0.058508291840553284
Bedrooms: 0.030510105192661285
Bathrooms: 0.028770802542567253
Address: 0.03012879192829132
Law Document: 0.059577617794275284
Quarter: 0.028082070872187614
Year: 0.03282705694437027
Latitude: 0.04321756958961487
Longitude: 0.06522510200738907
Postal Code: 0.04847514256834984
Importance: 0.011123117059469223
Place Rank: 0.42480170726776123
City: 0.082953542470932


- No meaningful difference with standardization (MAE 5.16 vs 5.16, MAPE 2.58 vs 2.54).