In [None]:
#Imports Cell
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error
from statsmodels.tsa.arima_model import ARIMA
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import timeit
from math import floor

import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.utils import np_utils
from keras.layers import LSTM,GRU
from keras.layers import Dropout
from keras.losses import mean_absolute_percentage_error

import os # accessing directory structure

In [None]:
# Reading the file to a dataframe
def ReadAndParse(path='../input/household_power_consumption.txt'):
    """Load the household power consumption file into a datetime-indexed frame.

    The file is read once as a ';'-separated table in which 'nan' and '?'
    mark missing values. Missing values are forward-filled, the 'Voltage'
    column is dropped, and the 'Date' / 'Time' text columns are converted to
    datetimes (day/month/year and hour:minute:second respectively).

    Parameters
    ----------
    path : str, optional
        Location of the raw text file. Defaults to the path the notebook
        has always used.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the combined Date + Time timestamp (index name
        'dt') holding 'Date', 'Time' and every measurement column except
        'Voltage'.
    """
    raw = pd.read_csv(path, sep=';', na_values=['nan', '?'], low_memory=False)

    # Build the timestamp index from the text columns before they are converted.
    timestamps = pd.to_datetime(raw['Date'] + ' ' + raw['Time'],
                                format='%d/%m/%Y %H:%M:%S')

    frame = raw.ffill()
    frame.index = pd.DatetimeIndex(timestamps, name='dt')
    # Keyword form: the positional axis argument was removed in pandas 2.0.
    frame = frame.drop(columns='Voltage')
    frame['Date'] = pd.to_datetime(frame['Date'], format="%d/%m/%Y")
    frame['Time'] = pd.to_datetime(frame['Time'], format="%H:%M:%S")
    return frame


# Number of time steps in every input window built later in the notebook.
window_size = 24


### **Data Exploration**

In [None]:
# The exploration plots need a datetime-indexed frame that still contains
# 'Voltage'. No earlier cell leaves such a frame in scope, so this cell loads
# its own copy to stay runnable after a kernel restart.
explore_raw = pd.read_csv('../input/household_power_consumption.txt', sep=';',
                          na_values=['nan', '?'], low_memory=False)
explore_df = explore_raw.drop(columns=['Date', 'Time']).ffill()
explore_df.index = pd.DatetimeIndex(
    pd.to_datetime(explore_raw['Date'] + ' ' + explore_raw['Time'],
                   format='%d/%m/%Y %H:%M:%S'),
    name='dt')

# info() prints by itself; wrapping it in print() only adds a stray "None".
explore_df.info()
print ("\nshape: "+str(explore_df.shape))

# Monthly total of the active power.
explore_df.Global_active_power.resample('M').sum().plot(title='Global_active_power over month for sum', color = "yellow")
plt.tight_layout()
plt.show()

# Monthly mean of the active power.
explore_df.Global_active_power.resample('M').mean().plot(title='Global_active_power over month for mean', color='red')
plt.tight_layout()
plt.show()

# Quarterly mean of the active power.
explore_df['Global_active_power'].resample('Q').mean().plot(kind='bar',color="blue")
plt.xticks(rotation=60)
plt.ylabel('Global_active_power')
plt.title('Global_active_power per quarter (averaged over quarter)')
plt.show()

# Monthly mean of the voltage (the title now matches the aggregation used).
explore_df['Voltage'].resample('M').mean().plot(kind='bar', color='red')
plt.xticks(rotation=60)
plt.ylabel('Voltage')
plt.title('Voltage per month (averaged over month)')
plt.show()

# Daily means of selected columns, one stacked subplot per column.
cols = [0, 1, 2, 3, 5, 6]
daily_means = explore_df.resample('D').mean()
values = daily_means.values
plt.figure(figsize=(15, 10))
for i, group in enumerate(cols, start=1):
    plt.subplot(len(cols), 1, i)
    plt.plot(values[:, group])
    plt.title(daily_means.columns[group], y=0.75, loc='right')
plt.show()

### **Pre-Processing**

In [None]:
def pre_processing(df):
    """Turn the parsed frame into a standardized 6-minute feature matrix.

    'Date' is replaced by its month and 'Time' by its hour (both as
    float64). Every column is then z-scored with the column mean and
    standard deviation, except 'Global_active_power', which keeps its
    original values. Remaining gaps are forward-filled and the frame is
    averaged into 6-minute bins.

    The caller's frame is left untouched; the work is done on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Datetime-indexed frame with datetime 'Date' and 'Time' columns and
        a 'Global_active_power' column; all other columns must be numeric.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per 6-minute bin and one column per input
        column, in the input column order.
    """
    features = df.copy()
    # Vectorized datetime accessors instead of a Python-level apply per row.
    features['Date'] = features['Date'].dt.month.astype('float64')
    features['Time'] = features['Time'].dt.hour.astype('float64')

    target = features['Global_active_power']
    scaled = (features - features.mean()) / features.std()
    # The target column stays in its original units.
    scaled['Global_active_power'] = target

    scaled = scaled.ffill()
    return scaled.resample("6min").mean().values

In [None]:
# Settings read by later cells; `units` is the number of consecutive rows
# flattened into each window step by CreateWindow.
time_foward, units = 240, 10

# Load the raw file and convert it to the feature matrix, timing both steps
# together. Note that `df` ends up holding the array returned by pre_processing.
start = timeit.default_timer()
df = pre_processing(ReadAndParse())
stop = timeit.default_timer()

print("shape "+ str(df.shape))
print('Time: ', stop - start)

In [None]:
def CreateWindow(data=None, window=None, n_units=None, stride=10, tail_margin=45):
    """Build the 3-D tensor of sliding input windows.

    Window k starts at row ``k * stride``. Step j of that window is the
    flattened block of ``n_units`` consecutive rows starting at row
    ``k * stride + j``.

    Parameters
    ----------
    data : numpy.ndarray, optional
        2-D matrix (rows = time steps). Defaults to the notebook-level `df`.
    window : int, optional
        Steps per window. Defaults to the notebook-level `window_size`.
    n_units : int, optional
        Consecutive rows flattened into one step. Defaults to the
        notebook-level `units`.
    stride : int, optional
        Row offset between the starts of two consecutive windows.
    tail_margin : int, optional
        Number of windows trimmed from the end. The default of 45 is the
        value the notebook has always used; presumably it keeps the window
        count in line with the target vector - TODO confirm.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows, window, n_units * data.shape[1])``.

    Raises
    ------
    ValueError
        If the data is too short to produce a non-negative window count.
    """
    data = df if data is None else data
    window = window_size if window is None else window
    n_units = units if n_units is None else n_units

    n_rows, n_features = data.shape
    n_windows = floor((n_rows - window) / stride) - tail_margin
    print(n_windows)
    if n_windows < 0:
        raise ValueError(
            "not enough rows (%d) to build any window with window=%d, "
            "stride=%d, tail_margin=%d" % (n_rows, window, stride, tail_margin))

    xy = np.zeros((n_windows, window, n_units * n_features))
    # Iterate over exactly the windows that are kept, writing straight into
    # the output instead of filling a scratch array that may be thrown away.
    for k in range(n_windows):
        first_row = k * stride
        for j in range(window):
            xy[k, j, :] = data[first_row + j:first_row + j + n_units, :].ravel()
    return xy
# Build the windowed input tensor `x` and report its shape and the time taken.
start = timeit.default_timer()
x = CreateWindow()
stop = timeit.default_timer()

print("shape "+ str(x.shape))
print('Time: ', stop - start)

In [None]:
def CreatePreds(frame=None, offset=49):
    """Return the 'Global_active_power' target values from row `offset` on.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame with a 'Global_active_power' column. Defaults to the
        notebook-level `df`.
    offset : int, optional
        Number of leading rows to skip. 49 is the value the notebook has
        always used.

    Returns
    -------
    numpy.ndarray
        1-D array of the remaining 'Global_active_power' values.
    """
    frame = df if frame is None else frame
    # Select the column first so only one Series is sliced, not the whole frame.
    return frame["Global_active_power"].iloc[offset:].values

# Re-read the raw data and average it per hour. `df` is rebound here to the
# hourly frame from which the target vector is taken.
df = ReadAndParse().resample("H").mean()
preds = CreatePreds()

print ("x.shape: "+str(x.shape))
print ("preds.shape: "+str(preds.shape))

### **k-fold Cross validation**

In [None]:
def CreateKfold1():
  """Fold 1: train on the leading two thirds, validate on the final third.

  A gap of `window_size` samples is left on each side of the boundary at
  `2 * third`. Reads the notebook-level `train_x`, `train_y`, `third` and
  `window_size`.

  Returns [train_x_part, train_y_part, test_x_part, test_y_part].
  """
  boundary = third * 2
  train_part = slice(None, boundary - window_size)
  test_part = slice(boundary + window_size, None)
  return [train_x[train_part, :], train_y[train_part],
          train_x[test_part, :], train_y[test_part]]

def CreateKfold2():
  """Fold 2: validate on the first third, train on everything after it.

  A gap of `window_size` samples is left on each side of the boundary at
  `third`. Reads the notebook-level `train_x`, `train_y`, `third` and
  `window_size`.

  Returns [train_x_part, train_y_part, test_x_part, test_y_part].
  """
  train_part = slice(third + window_size, None)
  test_part = slice(None, third - window_size)
  return [train_x[train_part, :], train_y[train_part],
          train_x[test_part, :], train_y[test_part]]

def CreateKfold3():
  """Fold 3: validate on the middle third, train on the two outer thirds.

  Gaps of `window_size` samples are left around both boundaries (`third`
  and `2 * third`). Reads the notebook-level `train_x`, `train_y`, `third`
  and `window_size`.

  Returns [train_x_part, train_y_part, test_x_part, test_y_part].
  """
  head = slice(None, third - window_size)
  middle = slice(third + window_size, third*2 - window_size)
  tail = slice(third*2 + window_size, None)

  # The training data is the head and tail segments stitched together.
  stitched_x = np.concatenate((train_x[head, :], train_x[tail, :]), axis=0)
  stitched_y = np.concatenate((train_y[head], train_y[tail]), axis=0)
  return [stitched_x, stitched_y, train_x[middle, :], train_y[middle]]
def Create3Folds():
    """Build the three folds, in order, and return them as a 3-tuple."""
    return CreateKfold1(), CreateKfold2(), CreateKfold3()

### **Train-Test Split**

In [None]:
# Chronological split of the windows and their targets:
#   * the first 25% of the samples is not used,
#   * training runs from the 25% mark up to the 80% mark,
#   * testing uses everything after the 80% mark,
# with `window_size` samples left out on each side of the 80% boundary.
print(len(x))
n_train_time = floor(0.8 * len(x))

train_x, train_y = (
    x[floor(0.25 * len(x)):n_train_time - window_size, :],
    preds[floor(0.25 * len(x)):n_train_time - window_size],
)
test_x, test_y = (
    x[n_train_time + window_size:, :],
    preds[n_train_time + window_size:],
)
print (train_x.shape, train_y.shape, test_x.shape, test_y.shape)

# Fold boundaries are expressed in thirds of the training range.
third = floor(len(train_x) / 3)
fold1, fold2, fold3 = Create3Folds()
nb_epochs = 10

### **Naive Baseline**

In [None]:
# Naive baseline: predict one constant for every test step.
# The constant is the mean of the *training* targets only, so no information
# from the test period leaks into the baseline.
mean_Global_active_power = train_y.mean()
yhat = np.full(test_y.shape, mean_Global_active_power)

# scikit-learn metrics take (y_true, y_pred).
rmse = np.sqrt(mean_squared_error(test_y, yhat))
MAE = mean_absolute_error(test_y, yhat)
print('Test RMSE: %.3f' % rmse)
print('Test MAE: %.3f' % MAE)

# Plot at most the first 200 test steps; a shorter test set no longer breaks the plot.
aa = list(range(min(200, len(test_y))))
plt.plot(aa, test_y[:len(aa)], marker='.', label="actual")
plt.plot(aa, yhat[:len(aa)], 'r', label="prediction")
plt.ylabel('Global_active_power', size=15)
plt.xlabel('Time step', size=15)
plt.legend(fontsize=15)
plt.show()

### **Linear Regression**

In [None]:
# The linear model needs 2-D input, so each (steps, features) window is
# flattened into a single row. The test windows use the same row width as
# the training windows.
train_x_lr = train_x.reshape((len(train_x), np.prod(train_x.shape[1:])))
test_x_lr = test_x.reshape((len(test_x), np.prod(train_x.shape[1:])))

logreg = LinearRegression()
logreg.fit(train_x_lr, train_y)

In [None]:
# Score the fitted linear model on the held-out test windows.
y_pred = logreg.predict(test_x_lr)
rmse = np.sqrt(mean_squared_error(y_pred, test_y))
MAE = mean_absolute_error(y_pred, test_y)
print('LL RMSE: %.3f' % rmse)
print('LL MAE: %.3f' % MAE)

# Actual vs. predicted for the first 200 test steps.
aa = list(range(200))
plt.plot(aa, test_y[:200], marker='.', label="actual")
plt.plot(aa, y_pred[:200], 'r', label="prediction")
plt.xlabel('Time step', size=15)
plt.ylabel('Global_active_power', size=15)
plt.legend(fontsize=10)
plt.show()

### **NN model**

In [None]:
# Fold 1: a single-unit LSTM followed by a one-neuron dense output.
# Training uses the fold's training split with the fold's held-out split as
# validation data; the printed metrics are computed on test_x / test_y.
model = Sequential([
    LSTM(1, input_shape=fold1[0].shape[1:]),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

history = model.fit(
    fold1[0], fold1[1],
    validation_data=(fold1[2], fold1[3]),
    epochs=15,
    batch_size=5000,
    shuffle=True,
    verbose=1,
)

y_pred = model.predict(test_x, batch_size=5000)
rmse1 = np.sqrt(mean_squared_error(y_pred, test_y))
MAE1 = mean_absolute_error(y_pred, test_y)
print('Test RMSE: %.3f' % rmse1)
print('Test MAE: %.3f' % MAE1)

# Training vs. validation loss per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper right')
plt.show()

# Actual vs. predicted for the first 500 test steps.
aa = list(range(500))
plt.plot(aa, test_y[:500], marker='.', label="actual")
plt.plot(aa, y_pred[:500], 'r', label="prediction")
plt.xlabel('Time step', size=15)
plt.ylabel('Global_active_power', size=15)
plt.legend(fontsize=15)
plt.show()

In [None]:
# Fold 2: a single-unit LSTM followed by a one-neuron dense output.
# Training uses the fold's training split with the fold's held-out split as
# validation data; the printed metrics are computed on test_x / test_y.
model2 = Sequential([
    LSTM(1, input_shape=fold2[0].shape[1:]),
    Dense(1),
])
model2.compile(optimizer='adam', loss='mean_squared_error')
model2.summary()

history2 = model2.fit(
    fold2[0], fold2[1],
    validation_data=(fold2[2], fold2[3]),
    epochs=15,
    batch_size=5000,
    shuffle=True,
    verbose=1,
)

y_pred = model2.predict(test_x, batch_size=5000)
rmse2 = np.sqrt(mean_squared_error(y_pred, test_y))
MAE2 = mean_absolute_error(y_pred, test_y)
print('Test RMSE: %.3f' % rmse2)
print('Test MAE: %.3f' % MAE2)

# Training vs. validation loss per epoch.
plt.plot(history2.history['loss'])
plt.plot(history2.history['val_loss'])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper right')
plt.show()

# Actual vs. predicted for the first 500 test steps.
aa = list(range(500))
plt.plot(aa, test_y[:500], marker='.', label="actual")
plt.plot(aa, y_pred[:500], 'r', label="prediction")
plt.xlabel('Time step', size=15)
plt.ylabel('Global_active_power', size=15)
plt.legend(fontsize=15)
plt.show()

In [None]:
# Fold 3: a single-unit LSTM followed by a one-neuron dense output.
# Training uses the fold's training split with the fold's held-out split as
# validation data; the printed metrics are computed on test_x / test_y.
model3 = Sequential([
    LSTM(1, input_shape=fold3[0].shape[1:]),
    Dense(1),
])
model3.compile(optimizer='adam', loss='mean_squared_error')
model3.summary()

history3 = model3.fit(
    fold3[0], fold3[1],
    validation_data=(fold3[2], fold3[3]),
    epochs=15,
    batch_size=5000,
    shuffle=True,
    verbose=1,
)

y_pred = model3.predict(test_x, batch_size=5000)
rmse3 = np.sqrt(mean_squared_error(y_pred, test_y))
MAE3 = mean_absolute_error(y_pred, test_y)
print('Test RMSE: %.3f' % rmse3)
print('Test MAE: %.3f' % MAE3)

# Training vs. validation loss per epoch.
plt.plot(history3.history['loss'])
plt.plot(history3.history['val_loss'])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper right')
plt.show()

# Actual vs. predicted for the first 500 test steps.
aa = list(range(500))
plt.plot(aa, test_y[:500], marker='.', label="actual")
plt.plot(aa, y_pred[:500], 'r', label="prediction")
plt.xlabel('Time step', size=15)
plt.ylabel('Global_active_power', size=15)
plt.legend(fontsize=15)
plt.show()

In [None]:
# Average the test-set scores of the three fold models.
rmse_avg = sum((rmse1, rmse2, rmse3)) / 3
mae_avg = sum((MAE1, MAE2, MAE3)) / 3

print("RMSE avg for all 3 folds "+str(rmse_avg))
print("MAE avg for all 3 folds "+str(mae_avg))

In [None]:
# Two stacked single-unit LSTMs without activation, followed by a one-neuron
# dense output. The first LSTM returns the full sequence so the second one
# can consume it. This model trains on the whole training range and uses the
# test set as validation data.
model4 = Sequential([
    LSTM(1, activation=None, return_sequences=True,
         input_shape=(train_x.shape[1], train_x.shape[2])),
    LSTM(1, activation=None, input_shape=(train_x.shape[1], 1)),
    Dense(1),
])
model4.compile(optimizer='adam', loss='mean_squared_error')
model4.summary()

history4 = model4.fit(
    train_x, train_y,
    validation_data=(test_x, test_y),
    epochs=nb_epochs,
    shuffle=True,
    verbose=1,
)
y_pred = model4.predict(test_x, batch_size=5000)

In [None]:
# Score the stacked-LSTM predictions held in y_pred against the test targets.
rmse = np.sqrt(mean_squared_error(y_pred, test_y))
MAE = mean_absolute_error(y_pred, test_y)
print('Model 4 RMSE: %.3f' % rmse)
print('Model 4 MAE: %.3f' % MAE)

# Training vs. validation loss per epoch.
plt.plot(history4.history['loss'])
plt.plot(history4.history['val_loss'])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper right')
plt.show()

# Actual vs. predicted for the first 200 test steps.
aa = list(range(200))
plt.plot(aa, test_y[:200], marker='.', label="actual")
plt.plot(aa, y_pred[:200], 'r', label="prediction")
plt.xlabel('Time step', size=15)
plt.ylabel('Global_active_power', size=15)
plt.legend(fontsize=10)
plt.show()