# Random Forest Regression
____

#### Libraries
____

In [54]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

import statsmodels.formula.api as sm
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import pickle


%matplotlib inline

#### Load dataset
___

In [74]:
dataset = 'data/com.csv'
# building = pd.read_csv(dataset)
# building.head(5)
columns = ['ptype', 'floora_plan', 'year', 'occ', 'site_EUI', 'n_site_gas','n_site_elec', 'water_use']
# columns = ['ptype', 'year', 'occ','site_EUI', 'n_site_gas','n_site_elec','water_use']
building = pd.read_csv(dataset, sep=',', usecols=columns)

building = building[(building != 0).all(1)]
building.head(5)

Unnamed: 0,ptype,floora_plan,year,occ,site_EUI,n_site_gas,n_site_elec,water_use
0,Multifamily,412503.0,1903,100,32.0,20472.0,3249755.2,22061.0
1,General Medical,567517.0,1950,100,352.7,1255514.1,18295844.6,9574.0
2,Multifamily,321464.0,1939,100,76.2,8906.3,1121931.8,5915.6
3,Multifamily,54913.0,1911,100,17.9,601.0,262377.2,231.9
4,Multifamily,535632.0,1964,100,115.9,631631.4,1620.0,8984.2


In [75]:
# building.info()

In [76]:
# building.describe()

#### Converting Categorical Features
___

In [77]:
bname_categorical = pd.get_dummies(building['ptype'])

# Avoiding the Dummy Variable Trap
bname_categorical.drop(bname_categorical.columns[0], axis=1, inplace=True)

# Merge datasets
building.drop(['ptype'], axis=1, inplace=True)
building = pd.concat([bname_categorical, building], axis=1)

building.head(5)

Unnamed: 0,Entertainment,Facility,Food Service,General Medical,Hotel,Multifamily,Office,Other,Parking,Retail,floora_plan,year,occ,site_EUI,n_site_gas,n_site_elec,water_use
0,0,0,0,0,0,1,0,0,0,0,412503.0,1903,100,32.0,20472.0,3249755.2,22061.0
1,0,0,0,1,0,0,0,0,0,0,567517.0,1950,100,352.7,1255514.1,18295844.6,9574.0
2,0,0,0,0,0,1,0,0,0,0,321464.0,1939,100,76.2,8906.3,1121931.8,5915.6
3,0,0,0,0,0,1,0,0,0,0,54913.0,1911,100,17.9,601.0,262377.2,231.9
4,0,0,0,0,0,1,0,0,0,0,535632.0,1964,100,115.9,631631.4,1620.0,8984.2


#### Define X, Y variables
___

In [78]:
X = building.drop('site_EUI', axis=1)
Y = building['site_EUI']

print (X.shape, Y.shape)

(6449, 16) (6449,)


#### Split dataset (TRAIN/TEST) data
____

In [79]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 101)

#### Fitting model to the dataset
____

In [80]:
regressor = RandomForestRegressor(n_estimators = 100, random_state = 101)
regressor.fit(X, Y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=101, verbose=0, warm_start=False)

#### Fitting Simple Linear Regression to the Training Set
_____

In [81]:
y_prediction = regressor.predict(X_test)

### weight
___

In [82]:
# We can sort the features by their importance
sorted(zip(regressor.feature_importances_, X_train.columns), reverse=True)

[(0.8448529924061761, 'n_site_gas'),
 (0.14211530807485698, 'n_site_elec'),
 (0.011018280341081323, 'floora_plan'),
 (0.001186546518710038, 'Hotel'),
 (0.00039083377010861586, 'water_use'),
 (0.0002678344404519974, 'year'),
 (7.075260920247117e-05, 'Multifamily'),
 (3.8709959060718786e-05, 'Facility'),
 (2.737649823195753e-05, 'Parking'),
 (1.271147918156903e-05, 'Office'),
 (9.21529171303661e-06, 'General Medical'),
 (3.0440195643457726e-06, 'occ'),
 (2.277295391964424e-06, 'Retail'),
 (2.051774663190544e-06, 'Other'),
 (1.954681110713953e-06, 'Entertainment'),
 (1.1084049520911949e-07, 'Food Service')]

In [83]:
final_df = pd.DataFrame({
    'Features' : X_train.columns,
    'Weight' : regressor.feature_importances_

})
final_df

Unnamed: 0,Features,Weight
0,Entertainment,1.954681e-06
1,Facility,3.870996e-05
2,Food Service,1.108405e-07
3,General Medical,9.215292e-06
4,Hotel,0.001186547
5,Multifamily,7.075261e-05
6,Office,1.271148e-05
7,Other,2.051775e-06
8,Parking,2.73765e-05
9,Retail,2.277295e-06


In [84]:
final_df.to_csv(r'FR_withFloorPlan_weight.csv',index=False)

#### Real x Predict
_____

pd.DataFrame({
    'Real' : Y_test,
    'Predict' : y_prediction

}).head(10)

#### Model Score
__________

In [85]:
print(f"Training Data Score: {regressor.score(X_train, Y_train)}")
print(f"Testing Data Score: {regressor.score(X_test, Y_test)}")

Training Data Score: 0.8652086215293374
Testing Data Score: 0.7721871320314099


In [86]:
print('MAE:', metrics.mean_absolute_error(Y_test, y_prediction))
print('MSE:', metrics.mean_squared_error(Y_test, y_prediction))
print('RMSE:', np.sqrt(metrics.mean_squared_error(Y_test, y_prediction)))

MAE: 32.655756589147266
MSE: 490477.7082378099
RMSE: 700.341137045233


#### Applying K-Fold Cross Validation
_____

In [68]:
accuracies = cross_val_score(estimator = regressor, X = X_train, y = Y_train, cv = 10)

In [69]:
print (accuracies)

[-3.39992104e+00  2.94699701e-02 -8.22267281e-01 -5.98284926e+01
 -2.23714111e+01 -1.28975911e+01  9.25914545e-01 -2.69965441e-03
  1.94658361e-02  3.61179962e-01]


In [70]:
accuracies.mean()

-9.798635247010147

In [71]:
accuracies.std()

18.164163890131377

#### Save model into disk
_____

In [72]:
pickle.dump(regressor, open('rfr_buildings.pkl','wb'))