# Support Vector Regression
____

#### Libraries
____

In [37]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split 
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

import statsmodels.formula.api as sm

import pickle

import warnings
# Silence every Python warning for the rest of the session; note that this
# also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

#### Load dataset
___

In [38]:
# Relative path of the input CSV file.
dataset = '../../../data/bpd/build_clean.csv'

# Read the whole file into a DataFrame and preview the first five rows.
building = pd.read_csv(filepath_or_buffer=dataset)
building.head()

Unnamed: 0,bname,build_year,floor_area,occupy,water_use,ws_ngas,ws_elec,site_eui
0,Multifamily,1903,412503.0,100,22061.0,0.0,3249755.2,32.0
1,General Medical,1950,567517.0,100,9574.0,2.2,18295844.6,352.7
2,Multifamily,1939,321464.0,100,5915.6,0.0,1121931.8,76.2
3,Multifamily,1911,54913.0,100,231.9,0.0,262377.2,17.9
4,Multifamily,1964,535632.0,100,8984.2,1.2,1620.0,115.9


In [39]:
# Column dtypes, non-null counts and memory footprint of the loaded frame.
building.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6529 entries, 0 to 6528
Data columns (total 8 columns):
bname         6529 non-null object
build_year    6529 non-null int64
floor_area    6529 non-null float64
occupy        6529 non-null int64
water_use     6529 non-null float64
ws_ngas       6529 non-null float64
ws_elec       6529 non-null float64
site_eui      6529 non-null float64
dtypes: float64(5), int64(2), object(1)
memory usage: 408.1+ KB


In [40]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
building.describe()

Unnamed: 0,build_year,floor_area,occupy,water_use,ws_ngas,ws_elec,site_eui
count,6529.0,6529.0,6529.0,6529.0,6529.0,6529.0,6529.0
mean,1949.651095,136859.9,99.152244,16030.62,1.977179,1476976.0,260.574575
std,30.901407,171146.0,5.532777,157196.0,116.266859,9362351.0,10897.697301
min,1649.0,500.0,0.0,0.0,0.0,0.0,0.0
25%,1927.0,64677.0,100.0,2688.4,0.1,287372.9,64.4
50%,1941.0,87036.0,100.0,4786.8,0.5,485308.3,79.2
75%,1967.0,138728.0,100.0,8001.2,0.7,1058213.0,96.8
max,2019.0,2376683.0,100.0,6594604.0,9393.0,452023600.0,869265.0


#### Filtering columns based on linear regression (OLS) analysis
_____

In [41]:
# Keep three predictors plus the target column (site_eui); everything else is dropped.
columns = ['floor_area', 'ws_ngas', 'ws_elec', 'site_eui']
building = building.loc[:, columns]
building.head()

Unnamed: 0,floor_area,ws_ngas,ws_elec,site_eui
0,412503.0,0.0,3249755.2,32.0
1,567517.0,2.2,18295844.6,352.7
2,321464.0,0.0,1121931.8,76.2
3,54913.0,0.0,262377.2,17.9
4,535632.0,1.2,1620.0,115.9


#### Feature Scaling
___

In [42]:
# Standardise the predictor columns only; the target (site_eui) keeps its
# original units.
features = building.drop(['site_eui'], axis=1)

# NOTE(review): the scaler is fitted on every row before the train/test split
# performed later in the notebook, so test-set statistics leak into the
# scaling. Consider fitting on the training rows only.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Carry over the source column labels AND index so that the later column-wise
# concat with site_eui lines up row-for-row even if the index is not a plain
# 0..n-1 range.
building_scaled = pd.DataFrame(
    scaled_features,
    columns=features.columns,
    index=features.index,
)

building_scaled.head(5)

Unnamed: 0,floor_area,ws_ngas,ws_elec
0,1.610696,-0.017007,0.189366
1,2.516507,0.001917,1.796574
2,1.078718,-0.017007,-0.037925
3,-0.478849,-0.017007,-0.129742
4,2.33019,-0.006685,-0.157596


#### Converting Categorical Features (disabled) and merging scaled features with the target
___

In [43]:
# One-hot encoding of `bname` is disabled: that column is no longer present
# after the column filter, so only the scaled numeric predictors are used.

# Re-attach the unscaled target next to the standardised predictors.
building = pd.concat(objs=[building_scaled, building['site_eui']], axis='columns')
building.head()

Unnamed: 0,floor_area,ws_ngas,ws_elec,site_eui
0,1.610696,-0.017007,0.189366,32.0
1,2.516507,0.001917,1.796574,352.7
2,1.078718,-0.017007,-0.037925,76.2
3,-0.478849,-0.017007,-0.129742,17.9
4,2.33019,-0.006685,-0.157596,115.9


#### Define X, Y variables
___

In [44]:
# Predictors: every column except the target.
X = building.drop(labels='site_eui', axis='columns')
# Target: the site_eui column.
Y = building.loc[:, 'site_eui']

print(X.shape, Y.shape)

(6529, 3) (6529,)


#### Split dataset into train/test sets
____

In [45]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=101,
)

#### Fitting SVR to the Training Set
____

In [47]:
# Linear-kernel support vector regressor (all other hyper-parameters left at
# their defaults), trained on the training split only.
regressor = SVR(kernel='linear').fit(X_train, Y_train)
regressor

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

#### Predicting the Test Set Results
_____

In [48]:
# Predict on the held-out rows. Only the predictors were standardised; the
# target was left unscaled, so the predictions are already in the original
# site_eui units and need no inverse transform.
y_prediction = regressor.predict(X=X_test)

#### Real x Predict
_____

In [49]:
# Show the observed targets next to the model's predictions (first ten test rows).
Y_test.to_frame(name='Real').assign(Predict=y_prediction).head(10)

Unnamed: 0,Real,Predict
1233,66.5,77.096534
1439,57.7,79.775724
4067,57.1,80.996613
5867,44.1,84.489829
1222,115.9,78.026933
2678,108.4,80.051496
4551,54.4,78.488649
1722,78.0,80.049275
6101,163.1,80.779785
4206,84.2,81.630618


#### Model Score
__________

In [50]:
# For a regressor, `score` returns the coefficient of determination (R^2).
train_score = regressor.score(X_train, Y_train)
test_score = regressor.score(X_test, Y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.015977529102492505
Testing Data Score: 0.16505680349191254


#### GridSearch
__________

In [51]:
# Only the linear kernel is searched, with C swept over five orders of magnitude.
# A wider grid (same C values for the poly, sigmoid and rbf kernels, the latter
# with gamma in [0.5, 0.1, 0.01, 0.001, 0.0001]) is intentionally left disabled.
c_values = [0.1, 1, 10, 100, 1000]
param_grid = [{'C': c_values, 'kernel': ['linear']}]

# 10-fold cross-validation; refit=True retrains the best configuration on the
# full training split once the search finishes.
grid = GridSearchCV(regressor, param_grid, cv=10, refit=True, verbose=0)

grid.fit(X_train, Y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

#### Best Results
__________

In [52]:
# Hyper-parameter combination with the best mean cross-validated score.
grid.best_params_

{'C': 100, 'kernel': 'linear'}

In [53]:
# Mean cross-validated score of the best combination; no `scoring` was passed
# to the search, so this is the estimator's own score (R^2 for SVR).
grid.best_score_

0.31172971703657565

#### Save model to disk
_____

In [54]:
# Use the estimator the grid search already refit on the full training split
# with the best hyper-parameters (refit=True). A freshly constructed
# SVR(kernel='linear', C=100) would be unfitted, and persisting it would save
# a model that cannot predict.
regressor = grid.best_estimator_

In [55]:
# Write the model inside a context manager so the file handle is flushed and
# closed even if pickling fails.
# NOTE(review): the model is trained on standardised predictors, but the
# StandardScaler is not saved here; anyone loading this pickle must apply the
# same scaling before predicting.
with open('svr_buildings.pkl', 'wb') as model_file:
    pickle.dump(regressor, model_file)