In [156]:
import numpy as np
import pandas as pd
import pickle

import matplotlib.pyplot as plt
import seaborn as sb


from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score



import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing import image
from keras.wrappers.scikit_learn import KerasClassifier

## Loading the dataframe
Load the DataFrame from the pickle file that was exported by the pre-processing stage.

In [132]:
# Restore the DataFrame that the pre-processing stage serialized to disk.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# produced by a trusted step of this pipeline.
with open("data/df.pkl", "rb") as pickle_in:
    df = pickle.load(pickle_in)

df

Unnamed: 0,Name,GFA,BType,Energy,WeekDay,Hour,TAVG,TMIN,TMAX,Sem
0,Piper Writing Center,4314.0,Academic,1.402795493602753,1,00:00:00,43.0,38,51,0
1,Piper Writing Center,4314.0,Academic,2.6063907146453857,1,01:00:00,43.0,38,51,0
2,Piper Writing Center,4314.0,Academic,0.8264167606830597,1,02:00:00,43.0,38,51,0
3,Piper Writing Center,4314.0,Academic,0.831238254904747,1,03:00:00,43.0,38,51,0
4,Piper Writing Center,4314.0,Academic,4.9978625774383545,1,04:00:00,43.0,38,51,0
...,...,...,...,...,...,...,...,...,...,...
602573,San Pablo,77484.0,Housing,51.58279991149902,1,19:00:00,54.0,45,59,0
602574,San Pablo,77484.0,Housing,52.42500019073486,1,20:00:00,54.0,45,59,0
602575,San Pablo,77484.0,Housing,50.46094989776611,1,21:00:00,54.0,45,59,0
602576,San Pablo,77484.0,Housing,51.54427433013916,1,22:00:00,54.0,45,59,0


# Data Understanding and Transformation

In [133]:
# Inspect the column dtypes right after loading; the output below reports
# Energy as `object`, which is why it is converted to float in the next step.
df.dtypes

Name        object
GFA        float64
BType       object
Energy      object
WeekDay      int64
Hour        object
TAVG       float64
TMIN         int64
TMAX         int64
Sem          int64
dtype: object

Convert the Energy column from string to float, rounded to one decimal place.

In [134]:
# Parse the string-typed Energy readings as floats with a single vectorized
# cast (rather than a per-row Python lambda), then keep one decimal place.
# Like float(x), astype(float) raises on a value that cannot be parsed.
df['Energy'] = df['Energy'].astype(float).round(1)

#### Checking for negative and missing values for energy

In [135]:
# Number of rows whose energy reading is below zero.
is_negative = df['Energy'] < 0
df.loc[is_negative, ['Energy']].count()

Energy    205
dtype: int64

In [136]:
# Number of distinct building names that have at least one negative reading.
negative_rows = df['Energy'] < 0
df.loc[negative_rows, ['Name']].nunique()

Name    2
dtype: int64

In [137]:
# Per-column count of missing (NaN) entries.
df.isna().sum()

Name           0
GFA        17520
BType      17520
Energy         0
WeekDay        0
Hour           0
TAVG           0
TMIN           0
TMAX           0
Sem            0
dtype: int64

#### Removing non-positive energy readings and rows with missing values

In [138]:
# Keep only strictly positive energy readings (zeros are dropped along with
# negatives), then discard every row that still has a missing value.
has_positive_energy = df['Energy'] > 0
df = df[has_positive_energy].dropna()

In [139]:
# Summary statistics of the cleaned Energy column (count, mean, quartiles, extremes).
df['Energy'].describe()

count    571373.000000
mean        109.248414
std         121.975879
min           0.100000
25%          34.900000
50%          66.100000
75%         137.100000
max         757.300000
Name: Energy, dtype: float64

In [140]:
# Display the cleaned Energy column. The original wrapped it in an identity
# `apply(lambda x: x)`, which looped over every row in Python only to return
# the same values.
df['Energy']

0          1.4
1          2.6
2          0.8
3          0.8
4          5.0
          ... 
602573    51.6
602574    52.4
602575    50.5
602576    51.5
602577    49.7
Name: Energy, Length: 571373, dtype: float64

In [141]:
# Persist the cleaned DataFrame so later stages can reload it without
# repeating the cleaning steps.
with open('Data/df_cleaned.pkl', 'wb') as pickle_out:
    pickle.dump(df, pickle_out)

In [142]:
# Display the cleaned DataFrame (Energy is now a float rounded to one decimal).
df

Unnamed: 0,Name,GFA,BType,Energy,WeekDay,Hour,TAVG,TMIN,TMAX,Sem
0,Piper Writing Center,4314.0,Academic,1.4,1,00:00:00,43.0,38,51,0
1,Piper Writing Center,4314.0,Academic,2.6,1,01:00:00,43.0,38,51,0
2,Piper Writing Center,4314.0,Academic,0.8,1,02:00:00,43.0,38,51,0
3,Piper Writing Center,4314.0,Academic,0.8,1,03:00:00,43.0,38,51,0
4,Piper Writing Center,4314.0,Academic,5.0,1,04:00:00,43.0,38,51,0
...,...,...,...,...,...,...,...,...,...,...
602573,San Pablo,77484.0,Housing,51.6,1,19:00:00,54.0,45,59,0
602574,San Pablo,77484.0,Housing,52.4,1,20:00:00,54.0,45,59,0
602575,San Pablo,77484.0,Housing,50.5,1,21:00:00,54.0,45,59,0
602576,San Pablo,77484.0,Housing,51.5,1,22:00:00,54.0,45,59,0


# One-hot encoding the categorical features

In [165]:
# Expand the two categorical columns into indicator columns
# (one BType_* column per building type, one Hour_* column per hour value).
BTypeDF = pd.get_dummies(df['BType'], prefix='BType')
HourDF = pd.get_dummies(df['Hour'], prefix='Hour')

# Append the indicator columns to the right of the existing columns.
# Re-running this cell on the already-encoded frame would append them again.
frames_to_join = [df, BTypeDF, HourDF]
df = pd.concat(frames_to_join, axis=1)
df

Unnamed: 0,Name,GFA,BType,Energy,WeekDay,Hour,TAVG,TMIN,TMAX,Sem,...,Hour_14:00:00,Hour_15:00:00,Hour_16:00:00,Hour_17:00:00,Hour_18:00:00,Hour_19:00:00,Hour_20:00:00,Hour_21:00:00,Hour_22:00:00,Hour_23:00:00
0,Piper Writing Center,4314.0,Academic,1.4,1,00:00:00,43.0,38,51,0,...,0,0,0,0,0,0,0,0,0,0
1,Piper Writing Center,4314.0,Academic,2.6,1,01:00:00,43.0,38,51,0,...,0,0,0,0,0,0,0,0,0,0
2,Piper Writing Center,4314.0,Academic,0.8,1,02:00:00,43.0,38,51,0,...,0,0,0,0,0,0,0,0,0,0
3,Piper Writing Center,4314.0,Academic,0.8,1,03:00:00,43.0,38,51,0,...,0,0,0,0,0,0,0,0,0,0
4,Piper Writing Center,4314.0,Academic,5.0,1,04:00:00,43.0,38,51,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602573,San Pablo,77484.0,Housing,51.6,1,19:00:00,54.0,45,59,0,...,0,0,0,0,0,1,0,0,0,0
602574,San Pablo,77484.0,Housing,52.4,1,20:00:00,54.0,45,59,0,...,0,0,0,0,0,0,1,0,0,0
602575,San Pablo,77484.0,Housing,50.5,1,21:00:00,54.0,45,59,0,...,0,0,0,0,0,0,0,1,0,0
602576,San Pablo,77484.0,Housing,51.5,1,22:00:00,54.0,45,59,0,...,0,0,0,0,0,0,0,0,1,0


# Prediction Modeling

## Simple Linear Regression

In [9]:
# Feature matrix for the linear model. In addition to the columns dropped
# originally, 'BType' is removed because it is a string column that
# MinMaxScaler cannot convert (the one-hot BType_* columns carry the same
# information), and 'Energy' is removed because it is the target, not a
# predictor.
values = df.drop(['Energy','TMIN','TMAX','Name','BType','Hour'],axis=1).values

#scaling

scaler = MinMaxScaler(feature_range = (0,1))
ScaledValues = pd.DataFrame(scaler.fit_transform(values))
X = ScaledValues

# Regression target consumed by LinReg.fit(X, y) below; it was never defined
# before that call. Left unscaled: R^2 of an ordinary least-squares fit with
# an intercept does not change under a linear rescaling of the target.
y = df['Energy'].values

In [None]:
#ran the model on EC2 instance for performance

# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, so passing
# it raises TypeError on current releases. It is dropped here: the inputs are
# already min-max scaled, and for ordinary least squares the option does not
# change the fitted predictions or the R^2 score.
LinReg = LinearRegression()
LinReg.fit(X,y)
# R^2 computed on the same rows that were used for fitting (no held-out data).
score  = LinReg.score(X,y)

In [146]:
# Load the R^2 score saved from the remote linear-regression run. The context
# manager closes the handle, which the original bare `open` left open.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open("Data/linear.pkl",'rb') as file:
    score = pickle.load(file)

score

0.1330667617808884

## Neural Networks

In [179]:
# Columns that must not reach the network: the target itself, the building
# identifier, and the two raw categoricals (already one-hot encoded).
non_feature_columns = ['Energy', 'Name', 'BType', 'Hour']
featureData = df.drop(columns=non_feature_columns).values

# Rescale every feature column into the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(featureData)

In [180]:
# (rows, features) of the scaled matrix; the feature count is what the
# network's first layer must accept as input_dim.
X.shape

(571373, 34)

In [177]:
# 20 evenly spaced edges over [0, 1000] define 19 equal-width bins.
bins = np.linspace(0, 1000, num=20)

# Turn the continuous readings into class labels: labels=False makes pd.cut
# return the integer index of the bin each reading falls into.
Energy = pd.cut(df['Energy'].values, bins, labels=False)

In [167]:
# Count how many distinct bin indices actually occur in the data; this is the
# number of classes the classifier has to separate.
EnergyDF =pd.DataFrame(Energy)
EnergyDF.nunique()

0    15
dtype: int64

In [178]:
# One-hot encode the bin indices into the target matrix for
# categorical cross-entropy. Only bins that occur in the data get a column.
y = pd.get_dummies(Energy,prefix='Energy')
y

Unnamed: 0,Energy_0,Energy_1,Energy_2,Energy_3,Energy_4,Energy_5,Energy_6,Energy_7,Energy_8,Energy_9,Energy_10,Energy_11,Energy_12,Energy_13,Energy_14
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571368,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
571369,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
571370,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
571371,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Initializing and parameter tuning

In [181]:
# Feed-forward classifier: two ReLU hidden layers, dropout, softmax output.
# Layer sizes that depend on the data are read from X and y rather than
# hard-coded (34 inputs / 15 classes), so the model stays consistent if the
# feature set or the populated energy bins change.
n_features = X.shape[1]
n_classes = y.shape[1]

model3 = Sequential()
model3.add(Dense(50,input_dim=n_features,activation='relu', name = 'layer_1'))
model3.add(Dense(20,activation='relu',name = 'layer_2'))
# Randomly drop half of layer_2's activations during training.
model3.add(Dropout(0.5))
model3.add(Dense(n_classes,activation='softmax',name = 'output_layer'))

model3.compile(loss='categorical_crossentropy', optimizer='SGD', metrics=['accuracy'])

In [182]:
# Train for 100 epochs, reshuffling the rows each epoch; verbose=2 prints one
# summary line per epoch.
model3.fit(x=X, y=y, epochs=100, shuffle=True, verbose=2)

# Loss and accuracy measured on the same X, y the model was trained on, so
# these figures describe training fit rather than held-out performance.
error_rate, accuracy = model3.evaluate(x=X, y=y, verbose=0)

Epoch 1/100
17856/17856 - 12s - loss: 1.5295 - accuracy: 0.4518
Epoch 2/100
17856/17856 - 11s - loss: 1.3775 - accuracy: 0.5010
Epoch 3/100
17856/17856 - 12s - loss: 1.3552 - accuracy: 0.5077
Epoch 4/100
17856/17856 - 12s - loss: 1.3388 - accuracy: 0.5148
Epoch 5/100
17856/17856 - 12s - loss: 1.3222 - accuracy: 0.5211
Epoch 6/100
17856/17856 - 12s - loss: 1.3057 - accuracy: 0.5243
Epoch 7/100
17856/17856 - 12s - loss: 1.2897 - accuracy: 0.5280
Epoch 8/100
17856/17856 - 11s - loss: 1.2788 - accuracy: 0.5303
Epoch 9/100
17856/17856 - 12s - loss: 1.2684 - accuracy: 0.5331
Epoch 10/100
17856/17856 - 12s - loss: 1.2592 - accuracy: 0.5360
Epoch 11/100
17856/17856 - 12s - loss: 1.2519 - accuracy: 0.5386
Epoch 12/100
17856/17856 - 12s - loss: 1.2454 - accuracy: 0.5399
Epoch 13/100
17856/17856 - 12s - loss: 1.2373 - accuracy: 0.5434
Epoch 14/100
17856/17856 - 12s - loss: 1.2319 - accuracy: 0.5447
Epoch 15/100
17856/17856 - 11s - loss: 1.2273 - accuracy: 0.5462
Epoch 16/100
17856/17856 - 12s - l

### Evaluating with K-fold cross-validation

In [None]:
#Building the model and evaluating using K fold cross validation
# The code below is kept disabled inside a string literal (this cell was never
# executed). Fixed so that it runs if re-enabled: the `modle` typo raised
# NameError, and the hard-coded input_dim=59 / 3 output units did not match
# the X (features) and y (one-hot classes) built in this notebook.
'''
RunName = 'model4'
def createmodel():
    # Small feed-forward classifier sized from the current X and y.
    model = Sequential()
    model.add(Dense(10,input_dim=X.shape[1],activation='relu'))
    model.add(Dense(5,activation='relu'))
    model.add(Dense(y.shape[1],activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

seed = 7
np.random.seed(seed)

model = KerasClassifier(build_fn = createmodel,epochs = 50, batch_size = 10, verbose = 0)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(model, X, y, cv=kfold)
print(results.mean())'''