# Random Forest Regressor

# Libraries and functions

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV
import pickle
from utilities import cross_val_metrics_calculate

In [4]:
# Load the training table. Features are every column except the first and the
# last; the last column is the regression target.
data = pd.read_csv("../data/train_data_2nd.csv")
X, y = data.iloc[:, 1:-1], data.iloc[:, -1]

In [None]:
# Data for property type classification: the target is 'Property Type', the
# features are all remaining columns except the first one.
# NOTE(review): this rebinds the X / y used by the regression cells below —
# run it only before the classifier section.
y = data['Property Type']
X = data.drop(columns='Property Type').iloc[:, 1:]

In [5]:
# Snapshot of the current feature column labels; later cells use it to label
# feature importances, so re-run this cell whenever X is rebound.
feature_names = X.columns
feature_names

Index(['Area (m2)', 'Property Type', 'Bedrooms', 'Bathrooms', 'Address',
       'Law Document', 'Quarter', 'Year', 'Latitude', 'Longitude',
       'Postal Code', 'Importance', 'Place Rank', 'City'],
      dtype='object')

In [6]:
# Shared K-fold splitter used for the cross-validated metrics below.
n_folds = 5
kfold = KFold(n_splits=n_folds)

# Model

In [None]:
# Exhaustive grid search over forest size, tree depth and the number of
# features considered per split. Each tree is trained on 65% of the rows.
# The forest is seeded so re-running the cell selects the same parameters, and
# the shared `kfold` splitter is reused so tuning and evaluation use the same
# kind of folds. The search is refit on the full data using MAPE.
rf_search = GridSearchCV(
    RandomForestRegressor(
        max_samples=0.65,
        random_state=42,
    ),
    param_grid={
        'n_estimators': [100, 250, 400, 500],
        'max_depth': [6, 8, 10, 12],
        'max_features': ['log2', 2, 1.0],
    },
    scoring=[
        'neg_mean_absolute_error',
        'neg_mean_absolute_percentage_error',
    ],
    cv=kfold,
    refit='neg_mean_absolute_percentage_error',
)

rf_search.fit(X, y)

  _data = np.array(data, dtype=dtype, copy=copy,


In [None]:
# Parameter combination that scored best on the refit metric of the grid search.
rf_search.best_params_

{'max_depth': 12, 'max_features': 'log2', 'n_estimators': 250}

In [7]:
# Rebuild the forest from the parameters the grid search selected. They are
# hard-coded so this cell does not require re-running the (slow) search.
# The forest is seeded so the cross-validated metrics are reproducible.
rf = RandomForestRegressor(
    n_estimators=250,
    max_depth=12,
    max_features='log2',
    max_samples=0.65,
    random_state=42,
)

# Cross-validated error metrics on the shared K-fold splits.
cv_results = cross_val_metrics_calculate(rf, X, y, kfold.split(X))
print(cv_results)

{'mse': 396.5580633489286, 'rmse': 19.6848666425196, 'mae': 5.354206440707167, 'mape': 2.9471713387197185, 'medae': 1.8873516377852575, 'medape': 0.3294293733615456}


In [None]:
# Fit on the full data set, then list each feature with its importance,
# using the feature names the model recorded during fit.
rf.fit(X, y)
for name, importance in zip(rf.feature_names_in_, rf.feature_importances_):
    print(f"{name}: {importance:.10f}")

Area (m2): 0.2846038597
Property Type: 0.0561836332
Bedrooms: 0.0628896292
Bathrooms: 0.0640492535
Address: 0.0918484116
Law Document: 0.0363139084
Quarter: 0.0399545318
Year: 0.0505458143
Latitude: 0.0859400731
Longitude: 0.1013971579
Postal Code: 0.0680579908
Importance: 0.0271814152
Place Rank: 0.0217572834
City: 0.0092770378


- Most important features, in rough tiers of decreasing importance: area; longitude, address, latitude; postal code, bathrooms, bedrooms; property type

In [None]:
# Persist the fitted regressor. The context manager guarantees the file handle
# is flushed and closed, even if pickling raises.
# `pickle` is already imported in the top import cell.
# Note: despite the .h5 suffix, the file written here is a pickle, not HDF5.
with open('../models/RandomForest.h5', 'wb') as model_file:
    pickle.dump(rf, model_file)

**With standardization**

In [None]:
# Same grid search, preceded by feature standardization.
# The existing `rf_search` object itself is the pipeline's final step, so
# fitting the pipeline is expected to re-fit `rf_search` and replace its
# earlier results.
# NOTE(review): the scaler sits outside the search, so it is fit on all rows
# before the inner cross-validation splits them — confirm this is intended.
rf_search_s = make_pipeline(StandardScaler(), rf_search)

rf_search_s.fit(X, y)

  _data = np.array(data, dtype=dtype, copy=copy,


In [None]:
# Best parameters after the standardized fit; read from `rf_search`, the search
# object embedded as the last step of `rf_search_s`.
rf_search.best_params_

{'max_depth': 12, 'max_features': 'log2', 'n_estimators': 400}

In [8]:
# Standardize the features, then fit a forest with the parameters selected by
# the standardized grid search. The forest is seeded so that the comparison
# against the unscaled model is not blurred by run-to-run randomness.
rf_with_standardize = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(
        n_estimators=400,
        max_depth=12,
        max_features='log2',
        max_samples=0.65,
        random_state=42,
    ),
)

# Cross-validated error metrics on the shared K-fold splits.
cv_results = cross_val_metrics_calculate(rf_with_standardize, X, y, kfold.split(X))
print(cv_results)

{'mse': 397.55526898098555, 'rmse': 19.71303057521942, 'mae': 5.354426492349328, 'mape': 2.950342730475163, 'medae': 1.8899478263553302, 'medape': 0.3298295518116148}


In [None]:
# Fit the whole pipeline on the full data set, then list feature importances.
# Names are read from the fitted scaler (the step that actually saw the
# DataFrame columns) rather than from the separately maintained
# `feature_names` global, which can be stale if X was rebound in between.
rf_with_standardize.fit(X, y)
scaler_step = rf_with_standardize.named_steps['standardscaler']
forest_step = rf_with_standardize.named_steps['randomforestregressor']
for name, importance in zip(scaler_step.feature_names_in_, forest_step.feature_importances_):
    print(f"{name}: {importance:.10f}")

Area (m2): 0.2815982270
Property Type: 0.0563042943
Bedrooms: 0.0675227708
Bathrooms: 0.0638150813
Address: 0.0907336054
Law Document: 0.0363792350
Quarter: 0.0399820224
Year: 0.0496791979
Latitude: 0.0872097589
Longitude: 0.0987501402
Postal Code: 0.0703555796
Importance: 0.0302207328
Place Rank: 0.0166589234
City: 0.0107904311


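- Standardizing the features makes no meaningful difference: the cross-validated metrics and the feature importances are essentially unchanged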

# Random Forest Classifier for Property Type

In [None]:
# Grid search for the property-type classifier over forest size, split
# criterion, tree depth and the number of features considered per split.
# Each tree is trained on 65% of the rows. The forest is seeded so re-running
# the cell selects the same parameters. Expects X / y to hold the
# classification data (target 'Property Type').
rf_search = GridSearchCV(
    RandomForestClassifier(
        max_samples=0.65,
        random_state=42,
    ),
    param_grid={
        'n_estimators': [100, 250, 400, 500],
        'criterion': ['gini', 'entropy'],
        'max_depth': [6, 8, 10, 12],
        'max_features': ['log2', 2, 1.0],
    },
    scoring=[
        'accuracy',
    ],
    cv=5,
    refit='accuracy',
)

rf_search.fit(X, y)

In [None]:
# Take the best classifier found by the search and compute cross-validated
# classification metrics on the shared K-fold splits.
rf = rf_search.best_estimator_

cv_results = cross_val_metrics_calculate(
    rf,
    X,
    y,
    kfold.split(X),
    metrics=['accuracy', 'precision', 'recall', 'f1'],
)
print(cv_results)

{'accuracy': 0.8093244653314711, 'precision': 0.8043808501450913, 'recall': 0.7239889777200222, 'f1': 0.751153804079045}


In [None]:
# Fit the classifier on the full data set, then list each feature with its
# importance, using the feature names the model recorded during fit.
rf.fit(X, y)
for name, importance in zip(rf.feature_names_in_, rf.feature_importances_):
    print(f"{name}: {importance:.10f}")

Area (m2): 0.1773731643
Bedrooms: 0.0956968692
Bathrooms: 0.0665962007
Address: 0.0405357993
Law Document: 0.0767959941
Quarter: 0.0242815421
Year: 0.0651572953
Latitude: 0.0773068086
Longitude: 0.0940729148
Postal Code: 0.0471146842
Importance: 0.0137227394
Place Rank: 0.0099960155
City: 0.0347269206
Price (billion VND): 0.1766230519


- Area and price matter most
- Importances mostly similar to previous models

In [None]:
# Persist the fitted classifier. The context manager guarantees the file handle
# is flushed and closed, even if pickling raises.
# Note: despite the .h5 suffix, the file written here is a pickle, not HDF5.
with open('../models/RandomForest_Classifier.h5', 'wb') as model_file:
    pickle.dump(rf, model_file)

**With standardization**

In [None]:
# Classifier grid search on standardized features. The parameter grid matches
# the unscaled search; the forest is seeded so re-running the cell selects the
# same parameters.
rf_search = GridSearchCV(
    RandomForestClassifier(
        max_samples=0.65,
        random_state=42,
    ),
    param_grid={
        'n_estimators': [100, 250, 400, 500],
        'criterion': ['gini', 'entropy'],
        'max_depth': [6, 8, 10, 12],
        'max_features': ['log2', 2, 1.0],
    },
    scoring=[
        'accuracy',
    ],
    cv=5,
    refit='accuracy',
)
# The search object is the pipeline's final step, so the scaler is fit once on
# all rows before the search runs its own cross-validation.
# NOTE(review): confirm that fitting the scaler outside the CV folds is intended.
rf_search_s = make_pipeline(StandardScaler(), rf_search)

rf_search_s.fit(X, y)

In [None]:
# Wrap the best classifier from the search in a scaler + forest pipeline and
# compute cross-validated classification metrics on the shared K-fold splits.
best_classifier = rf_search.best_estimator_
rf_with_standardize = make_pipeline(StandardScaler(), best_classifier)

cv_results = cross_val_metrics_calculate(
    rf_with_standardize,
    X,
    y,
    kfold.split(X),
    metrics=['accuracy', 'precision', 'recall', 'f1'],
)
print(cv_results)

{'accuracy': 0.8081822315205189, 'precision': 0.8034844151401346, 'recall': 0.7186746056991179, 'f1': 0.7456018927163572}


In [None]:
# Fit the whole pipeline on the full data set, then list feature importances.
# Names are read from the fitted scaler (the step that actually saw the
# DataFrame columns). The `feature_names` global is not used because it is only
# refreshed when its own cell is re-run, and would mislabel the output if it
# still held the regression columns.
rf_with_standardize.fit(X, y)
scaler_step = rf_with_standardize.named_steps['standardscaler']
forest_step = rf_with_standardize.named_steps['randomforestclassifier']
for name, importance in zip(scaler_step.feature_names_in_, forest_step.feature_importances_):
    print(f"{name}: {importance:.10f}")

Area (m2): 0.1795678868
Bedrooms: 0.0955704014
Bathrooms: 0.0651002634
Address: 0.0405996468
Law Document: 0.0738946040
Quarter: 0.0226870922
Year: 0.0630614793
Latitude: 0.0776137846
Longitude: 0.0948521883
Postal Code: 0.0463956852
Importance: 0.0140858138
Place Rank: 0.0103672649
City: 0.0353203429
Price (billion VND): 0.1808835463


- With standardization, accuracy and precision are similar, while recall and F1 are slightly lower
- Feature importances similar