# Random Forest Regression
____

#### Libraries
____

In [48]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

import statsmodels.formula.api as sm
from sklearn.model_selection import cross_val_score
from sklearn import metrics


%matplotlib inline

#### Load dataset
___

In [16]:
dataset = '../../../data/bpd/build_clean.csv'
building = pd.read_csv(dataset)
building.head(5)

Unnamed: 0,bname,build_year,floor_area,occupy,water_use,ws_ngas,ws_elec,site_eui
0,Multifamily,1903,412503.0,100,22061.0,0.0,3249755.2,32.0
1,General Medical,1950,567517.0,100,9574.0,2.2,18295844.6,352.7
2,Multifamily,1939,321464.0,100,5915.6,0.0,1121931.8,76.2
3,Multifamily,1911,54913.0,100,231.9,0.0,262377.2,17.9
4,Multifamily,1964,535632.0,100,8984.2,1.2,1620.0,115.9


In [18]:
building.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6529 entries, 0 to 6528
Data columns (total 8 columns):
bname         6529 non-null object
build_year    6529 non-null int64
floor_area    6529 non-null float64
occupy        6529 non-null int64
water_use     6529 non-null float64
ws_ngas       6529 non-null float64
ws_elec       6529 non-null float64
site_eui      6529 non-null float64
dtypes: float64(5), int64(2), object(1)
memory usage: 408.1+ KB


In [17]:
building.describe()

Unnamed: 0,build_year,floor_area,occupy,water_use,ws_ngas,ws_elec,site_eui
count,6529.0,6529.0,6529.0,6529.0,6529.0,6529.0,6529.0
mean,1949.651095,136859.9,99.152244,16030.62,1.977179,1476976.0,260.574575
std,30.901407,171146.0,5.532777,157196.0,116.266859,9362351.0,10897.697301
min,1649.0,500.0,0.0,0.0,0.0,0.0,0.0
25%,1927.0,64677.0,100.0,2688.4,0.1,287372.9,64.4
50%,1941.0,87036.0,100.0,4786.8,0.5,485308.3,79.2
75%,1967.0,138728.0,100.0,8001.2,0.7,1058213.0,96.8
max,2019.0,2376683.0,100.0,6594604.0,9393.0,452023600.0,869265.0


#### Converting Categorical Features
___

In [19]:
bname_categorical = pd.get_dummies(building['bname'])

# Avoiding the Dummy Variable Trap
bname_categorical.drop(bname_categorical.columns[0], axis=1, inplace=True)

# Merge datasets
building.drop(['bname'], axis=1, inplace=True)
building = pd.concat([bname_categorical, building], axis=1)

building.head(5)

Unnamed: 0,Entertainment,Facility,Food Service,Gas Station,General Medical,Hotel,Multifamily,Office,Other,Parking,Retail,build_year,floor_area,occupy,water_use,ws_ngas,ws_elec,site_eui
0,0,0,0,0,0,0,1,0,0,0,0,1903,412503.0,100,22061.0,0.0,3249755.2,32.0
1,0,0,0,0,1,0,0,0,0,0,0,1950,567517.0,100,9574.0,2.2,18295844.6,352.7
2,0,0,0,0,0,0,1,0,0,0,0,1939,321464.0,100,5915.6,0.0,1121931.8,76.2
3,0,0,0,0,0,0,1,0,0,0,0,1911,54913.0,100,231.9,0.0,262377.2,17.9
4,0,0,0,0,0,0,1,0,0,0,0,1964,535632.0,100,8984.2,1.2,1620.0,115.9


#### Define X, Y variables
___

In [20]:
X = building.drop('site_eui', axis=1)
Y = building['site_eui']

print (X.shape, Y.shape)

(6529, 17) (6529,)


#### Split dataset (TRAIN/TEST) data
____

In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 101)

#### Fitting model to the dataset
____

In [39]:
regressor = RandomForestRegressor(n_estimators = 100, random_state = 101)
regressor.fit(X, Y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=101, verbose=0, warm_start=False)

#### Fitting Simple Linear Regression to the Training Set
_____

In [40]:
y_prediction = regressor.predict(X_test)

#### Real x Predict
_____

In [41]:
pd.DataFrame({
    'Real' : Y_test,
    'Predict' : y_prediction

}).head(10)

Unnamed: 0,Real,Predict
1233,66.5,64.348
1439,57.7,61.094
4067,57.1,55.945
5867,44.1,54.15
1222,115.9,111.952
2678,108.4,96.069
4551,54.4,57.22
1722,78.0,79.06
6101,163.1,133.165
4206,84.2,81.704


#### Model Score
__________

In [42]:
print(f"Training Data Score: {regressor.score(X_train, Y_train)}")
print(f"Testing Data Score: {regressor.score(X_test, Y_test)}")

Training Data Score: 0.8589991468714256
Testing Data Score: 0.8375512173204313


In [49]:
print('MAE:', metrics.mean_absolute_error(Y_test, y_prediction))
print('MSE:', metrics.mean_squared_error(Y_test, y_prediction))
print('RMSE:', np.sqrt(metrics.mean_squared_error(Y_test, y_prediction)))

MAE: 22.981392802450248
MSE: 91731.90499623676
RMSE: 302.8727538030398


#### Applying K-Fold Cross Validation
_____

In [43]:
accuracies = cross_val_score(estimator = regressor, X = X_train, y = Y_train, cv = 10)

In [44]:
print (accuracies)

[-2.77032019e+03  8.78698768e-01 -5.18506448e-03  1.17489336e-02
 -3.47176321e+00  9.64564547e-01 -1.82023909e-03 -9.68410726e+01
  9.79105598e-01 -4.32094825e+00]


In [45]:
accuracies.mean()

-287.21268638033115

In [46]:
accuracies.std()

828.2013205724754