**Random Forest Regressor (XGBoost `XGBRFRegressor`)**

In [12]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBRFRegressor
# from utilities import cross_val_metrics_calculate

In [3]:
# Colab-only step: attach Google Drive so files under /content/drive can be read.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, mean_absolute_percentage_error
from pandas import DataFrame, Series
def cross_val_metrics_calculate(model, X: DataFrame, y: Series, splits, metrics=('mse', 'rmse', 'mae', 'mape')):
    """Fit ``model`` on every train split and return the fold-averaged test metrics.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on each fold, so on return it holds the fit from the last fold.
    X, y
        Features and target; rows are selected positionally (``.iloc``) with
        the index arrays yielded by ``splits``.
    splits
        Iterable of ``(train_index, test_index)`` pairs, e.g.
        ``KFold(...).split(X)``.
    metrics
        Iterable of metric names drawn from ``'mse'``, ``'rmse'``, ``'mae'``
        and ``'mape'``. Duplicates are ignored.

    Returns
    -------
    dict
        Maps each requested metric name to its mean over all folds.

    Raises
    ------
    ValueError
        If an unsupported metric name is requested, or if ``splits`` yields
        no folds (there is nothing to average).
    """
    supported = ('mse', 'rmse', 'mae', 'mape')

    # De-duplicate while keeping order: a repeated name would otherwise be
    # divided by the fold count more than once in the averaging step.
    requested = list(dict.fromkeys(metrics))

    # An unknown name used to come back as 0, which reads like a perfect score.
    unknown = [name for name in requested if name not in supported]
    if unknown:
        raise ValueError(
            "Unsupported metric(s) %s; choose from %s" % (unknown, list(supported))
        )

    totals = {name: 0.0 for name in requested}
    n_folds = 0
    for train_index, test_index in splits:
        n_folds += 1
        model.fit(X.iloc[train_index], y.iloc[train_index])
        y_test = y.iloc[test_index]
        y_pred = model.predict(X.iloc[test_index])
        if 'mse' in totals:
            totals['mse'] += mean_squared_error(y_test, y_pred)
        if 'rmse' in totals:
            totals['rmse'] += root_mean_squared_error(y_test, y_pred)
        if 'mae' in totals:
            totals['mae'] += mean_absolute_error(y_test, y_pred)
        if 'mape' in totals:
            totals['mape'] += mean_absolute_percentage_error(y_test, y_pred)

    if n_folds == 0:
        raise ValueError("`splits` produced no folds; cannot average metrics")

    return {name: total / n_folds for name, total in totals.items()}

In [9]:
# Load the training table; the first column is left out of the predictors and
# the last column is the regression target.
train_csv_path = "/content/drive/MyDrive/Colab Notebooks/data/train_data.csv"
data = pd.read_csv(train_csv_path)
X, y = data.iloc[:, 1:-1], data.iloc[:, -1]

In [10]:
# Keep the predictor column labels; they are used later to label feature importances.
feature_names = X.columns
feature_names

Index(['Area (m2)', 'Property Type', 'Bedrooms', 'Bathrooms', 'Address',
       'Law Document', 'Quarter', 'Year', 'Latitude', 'Longitude',
       'Postal Code', 'Importance', 'Place Rank', 'City'],
      dtype='object')

In [11]:
# Shared K-fold splitter used for the manual cross-validation runs below.
n_folds = 5
kfold = KFold(n_splits=n_folds)

In [25]:
# Hyper-parameter grid explored for the XGBoost random-forest regressor.
rf_param_grid = {
    'n_estimators': [100, 250, 400],
    'max_depth': [4, 6, 8, 10],
}

# Every scorer is recorded in cv_results_; MAE picks the model that is refitted.
rf_scoring = [
    'neg_mean_squared_error',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_error',
    'neg_mean_absolute_percentage_error',
]

rf_base_model = XGBRFRegressor(
    device='cpu',
    eval_metric=mean_absolute_error,
    booster='gbtree',
)

rf_search = GridSearchCV(
    estimator=rf_base_model,
    param_grid=rf_param_grid,
    scoring=rf_scoring,
    cv=5,
    refit='neg_mean_absolute_error',
)

rf_search.fit(X, y)

In [20]:
# Re-score the grid-search winner with the shared K-fold splitter.
# Note: cross_val_metrics_calculate refits `rf` in place on every fold.
rf = rf_search.best_estimator_

cv_results = cross_val_metrics_calculate(model=rf, X=X, y=y, splits=kfold.split(X))
print(cv_results)

{'mse': 451.37932178030235, 'rmse': 21.01108045515797, 'mae': 5.475447701302623, 'mape': 2.94632392658882}


In [21]:
# Refit on all rows, then list each input column with its importance score.
rf.fit(X, y)
for column, importance in zip(rf.feature_names_in_, rf.feature_importances_):
    print("%s: %.10f" % (column, importance))

Area (m2): 0.0884456038
Property Type: 0.0992125049
Bedrooms: 0.0540109351
Bathrooms: 0.0485218801
Address: 0.0430482924
Law Document: 0.0598586947
Quarter: 0.0443994664
Year: 0.0382630825
Latitude: 0.0543404706
Longitude: 0.0653982684
Postal Code: 0.0975563601
Importance: 0.0367766209
Place Rank: 0.2128586620
City: 0.0573091395


In [22]:
# Run the same grid search behind a StandardScaler.
# The search is cloned first: wrapping `rf_search` itself would make this fit
# refit that very object on standardized data and silently replace
# `rf_search.best_estimator_`, which other cells read.
rf_search_s = make_pipeline(StandardScaler(), clone(rf_search))

rf_search_s.fit(X, y)

In [23]:
# Cross-validate the tuned hyper-parameters behind a StandardScaler.
# An unfitted clone goes into the pipeline so that fitting it here does not
# retrain the object held in `rf_search.best_estimator_` on standardized features.
rf_with_standardize = make_pipeline(StandardScaler(), clone(rf_search.best_estimator_))

cv_results = cross_val_metrics_calculate(rf_with_standardize, X, y, kfold.split(X))
print(cv_results)

{'mse': 453.5078395659042, 'rmse': 21.05564552091834, 'mae': 5.4730329192739635, 'mape': 2.9477682852520397}


In [24]:
# Refit the scaled pipeline on all rows, then list the forest step's importances.
rf_with_standardize.fit(X, y)
scaled_forest = rf_with_standardize.named_steps['xgbrfregressor']
scaled_columns = feature_names[:scaled_forest.n_features_in_]
for column, importance in zip(scaled_columns, scaled_forest.feature_importances_):
    print("%s: %.10f" % (column, importance))

Area (m2): 0.0889889449
Property Type: 0.0983288214
Bedrooms: 0.0552871302
Bathrooms: 0.0483751744
Address: 0.0452639423
Law Document: 0.0573495887
Quarter: 0.0462819077
Year: 0.0410987400
Latitude: 0.0582178719
Longitude: 0.0680897012
Postal Code: 0.0986356586
Importance: 0.0432461873
Place Rank: 0.1908636093
City: 0.0599726774


- Standardization is not required for Random Forest: the cross-validated metrics are essentially unchanged with and without it (MAE 5.475 vs. 5.473)
- `Property Type`, `Postal Code` and `Area (m2)` have relatively high importance values; `Place Rank` stands out as the most important feature by a wide margin

In [26]:
# Save model. TODO: reconsider using RandomForestRegressor from sklearn.
# The object held in `rf_search.best_estimator_` may have been refitted in place
# by other cells (for example on one CV fold's training rows, or on standardized
# inputs inside a pipeline). Saving it directly could persist a model that
# expects scaled features, with no scaler saved alongside it. Refit a fresh
# clone on the raw features so the file matches unscaled inputs.
final_rf = clone(rf_search.best_estimator_)
final_rf.fit(X, y)
final_rf.save_model('RandomForest.json')