# Time Series Prediction


In [1]:
# @ Lukman copyright 
# MIT Licence

In [None]:
# for data frame analysis
import pandas as pd 

# for mathematical operations
import numpy as np 

# imports below are for plotly 
import ipywidgets as widgets
import plotly as py
import plotly.graph_objs as go
py.offline.init_notebook_mode(connected=True)   # for offline mode use
import plotly.figure_factory as ff
import plotly.offline as offline


# matplotlib library for plotting
import matplotlib.pyplot as plt
%matplotlib inline

# For Normalizing data
from sklearn.preprocessing import StandardScaler

# For statistical test
import scipy.stats as stats

# Split data set into training and test set
from sklearn.model_selection import train_test_split as tts

# SVM (support vector machine) module
from sklearn import svm

# Kernel Functions used 
from sklearn.metrics.pairwise import rbf_kernel,laplacian_kernel

# module for chi square test
from scipy.stats import chisquare


# For dictionary 
from collections import defaultdict

# for use of tensorflow
import tensorflow as tf
#from tensorflow.nn.rnn import *
from tensorflow.python.ops  import *

# for scaling arrays
from sklearn.preprocessing import MaxAbsScaler,MinMaxScaler


# for random sampling of validation set
import random

# config file
import yaml
import os

# auto reload
%load_ext autoreload

%autoreload 2

In [1]:
################################################################################
# Load the project configuration (data directories and file names) from the
# YAML file located one directory above the current working directory.
try:
    with open("../config.yml", 'r') as ymlfile:
        cfg = yaml.safe_load(ymlfile)
except IOError:
    # Every later cell reads `cfg`; continuing without it only produces a
    # confusing NameError further down, so report the problem and stop here.
    print('config file is required. Put config.yml in the parent of the current directory')
    raise
################################################################################

NameError: name 'yaml' is not defined

In [71]:
# This is where the data is housed.
# We need to set the base directory in order to
# work from there.
cfg['rawdatapath']['TrainDir']

'Dataset/trainData/'

In [45]:
# Locate the project root, taken to be the parent of the working directory.
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
# Joining with an empty component appends a trailing path separator, so the
# relative paths from the config can be concatenated straight onto BASE_DIR.
BASE_DIR = os.path.join(parent_dir, '')

In [72]:
# Path to the training data.
# Several data sets can be kept in the training directory; the one that is
# actually used is selected through `trainDatasetName` in the config file.
raw_paths = cfg['rawdatapath']
tranDataPath = BASE_DIR + raw_paths['TrainDir'] + raw_paths['trainDatasetName']

In [95]:
# Path to the validation data.
# It is read from the same directory as the training data (`TrainDir`); the
# file is selected through `validationDataName` in the config file.
validDataPath = "".join([
    BASE_DIR,
    cfg['rawdatapath']['TrainDir'],
    cfg['rawdatapath']['validationDataName'],
])

'/home/bigdata-lap/Desktop/mypapers/time series/ashrae-energy-prediction/Energy_Prediction_Bot-master/Dataset/trainData/cleaned/'

# Data Preprocessing 

In [34]:
# Load the single training data set selected in the config file.
# NOTE(review): the original comment called this the "zero meter" data set,
# but the variable name and the sample rows shown below indicate meter 1 - confirm.
# Data sets for the other meter types can be loaded and analysed the same way.
meterOneDataLOaded = pd.read_csv(filepath_or_buffer=tranDataPath)

In [3]:
# Peek at the second and third rows (positions 1 and 2) of the raw data.
meterOneDataLOaded.iloc[1:3]

Unnamed: 0.1,Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
1,1,162,1,2016-01-01 00:00:00,0.0,2,Education,98829,1968.0,,15.6,6.0,-5.6,,1015.3,270.0,3.6
2,2,163,1,2016-01-01 00:00:00,4.5719,2,Education,72102,1970.0,,15.6,6.0,-5.6,,1015.3,270.0,3.6


In [48]:
# Drop the leftover 'Unnamed: 0' column (presumably an index that was written
# out when the CSV was saved - confirm). `errors='ignore'` keeps this cell
# re-runnable: a plain `del` raises KeyError the second time it is executed.
meterOneDataLOaded = meterOneDataLOaded.drop(columns=['Unnamed: 0'], errors='ignore')

In [49]:
# Names of the columns that remain in the data set.
list(meterOneDataLOaded.columns)

['building_id',
 'meter',
 'timestamp',
 'meter_reading',
 'site_id',
 'primary_use',
 'square_feet',
 'year_built',
 'floor_count',
 'air_temperature',
 'cloud_coverage',
 'dew_temperature',
 'precip_depth_1_hr',
 'sea_level_pressure',
 'wind_direction',
 'wind_speed']

In [50]:
# Keep the column names in a plain Python list for later use.
columns = list(meterOneDataLOaded.columns)

In [51]:
# Number of missing values in each column of the data set.
meterOneDataLOaded.isna().sum()

building_id                 0
meter                       0
timestamp                   0
meter_reading               0
site_id                     0
primary_use                 0
square_feet                 0
year_built            2819559
floor_count           3972549
air_temperature         23502
cloud_coverage        1742296
dew_temperature         24341
precip_depth_1_hr      541565
sea_level_pressure     105047
wind_direction         402544
wind_speed              37330
dtype: int64

In [52]:
# Convert the timestamp column from strings to pandas datetimes.
parsed_timestamps = pd.to_datetime(meterOneDataLOaded['timestamp'])
meterOneDataLOaded['timestamp'] = parsed_timestamps


In [53]:
# Which calendar years are present in the timestamps?
timestamp_index = pd.DatetimeIndex(meterOneDataLOaded['timestamp'])
timestamp_index.year.unique()

Int64Index([2016], dtype='int64', name='timestamp')

In [54]:
# Inspect the dtype of every column; after the conversion above,
# `timestamp` is expected to be datetime64[ns].
meterOneDataLOaded.dtypes

building_id                    int64
meter                          int64
timestamp             datetime64[ns]
meter_reading                float64
site_id                        int64
primary_use                   object
square_feet                    int64
year_built                   float64
floor_count                  float64
air_temperature              float64
cloud_coverage               float64
dew_temperature              float64
precip_depth_1_hr            float64
sea_level_pressure           float64
wind_direction               float64
wind_speed                   float64
dtype: object

In [55]:
# Re-inspect the null counts to decide which weather features to keep.
# NOTE(review): the original comment said wind speed was chosen for having
# fewer nulls, but the counts below show air_temperature and dew_temperature
# with the fewest, and those are the columns used in the model subset.
meterOneDataLOaded.isna().sum()

building_id                 0
meter                       0
timestamp                   0
meter_reading               0
site_id                     0
primary_use                 0
square_feet                 0
year_built            2819559
floor_count           3972549
air_temperature         23502
cloud_coverage        1742296
dew_temperature         24341
precip_depth_1_hr      541565
sea_level_pressure     105047
wind_direction         402544
wind_speed              37330
dtype: int64

In [56]:
# Use the timestamp column as the row index (modifies the frame in place).
meterOneDataLOaded.set_index(keys='timestamp', inplace=True)


In [57]:
# Show the second dew-temperature reading (position 1) with its timestamp.
meterOneDataLOaded['dew_temperature'].iloc[1:2]

timestamp
2016-01-01   -5.6
Name: dew_temperature, dtype: float64

#### Model 


In [74]:
# Work on an independent copy holding only the columns used by the model;
# this subset is also what the visualizer function is tested with.
# Use the full data set if memory is available and the system is fast.
model_columns = [
    'meter_reading',
    'square_feet',
    'air_temperature',
    'primary_use',
    'site_id',
    'dew_temperature',
]
train_test = meterOneDataLOaded[model_columns].copy()


In [75]:

# Move the timestamp out of the index and back into a regular column;
# the rows are then indexed by plain integers again.
train_test = train_test.reset_index(level=0)


In [76]:
# As before, the subset should only contain data from 2016.
subset_timestamps = pd.DatetimeIndex(train_test['timestamp'])
subset_timestamps.year.unique()

Int64Index([2016], dtype='int64', name='timestamp')

In [77]:
# Extract year, month, day and hour from the timestamp.
# The DatetimeIndex is built once and reused, rather than being rebuilt
# from the full column for each of the four features.
timestamp_parts = pd.DatetimeIndex(train_test['timestamp'])
train_test['year'] = timestamp_parts.year
train_test['month'] = timestamp_parts.month
train_test['day'] = timestamp_parts.day
train_test['hour'] = timestamp_parts.hour

In [78]:
# Missing values per column in the model subset.
train_test.isna().sum()

timestamp              0
meter_reading          0
square_feet            0
air_temperature    23502
primary_use            0
site_id                0
dew_temperature    24341
year                   0
month                  0
day                    0
hour                   0
dtype: int64

In [79]:
# Check the data types of the model subset, including the newly added
# year/month/day/hour columns.
train_test.dtypes

timestamp          datetime64[ns]
meter_reading             float64
square_feet                 int64
air_temperature           float64
primary_use                object
site_id                     int64
dew_temperature           float64
year                        int64
month                       int64
day                         int64
hour                        int64
dtype: object

In [103]:
# Forward-fill missing weather readings: the value from the previous row
# should ideally be close to the next one (temperature changes slowly).
# The result is assigned back explicitly. `fillna(method=..., inplace=True)`
# on a column selection is chained in-place assignment, which is deprecated
# and silently does nothing under pandas Copy-on-Write.
# NOTE(review): if rows from different sites are interleaved, the previous row
# may belong to another site - consider filling per `site_id`; confirm.
train_test['air_temperature'] = train_test['air_temperature'].ffill()
train_test['dew_temperature'] = train_test['dew_temperature'].ffill()

In [80]:
# Drop the year column; the year is not used in the model.
# `errors='ignore'` keeps the cell re-runnable (a plain `del` raises KeyError
# when the column has already been removed).
train_test = train_test.drop(columns=['year'], errors='ignore')

In [81]:
# One-hot encode the two categorical features: building usage and site.
categorical_columns = ["primary_use", "site_id"]
train_test = pd.get_dummies(train_test, columns=categorical_columns)

In [82]:
# Preview the first five rows of the encoded data.
train_test.head(n=5)

Unnamed: 0,timestamp,meter_reading,square_feet,air_temperature,dew_temperature,month,day,hour,primary_use_Education,primary_use_Entertainment/public assembly,...,site_id_0,site_id_2,site_id_6,site_id_7,site_id_9,site_id_10,site_id_11,site_id_13,site_id_14,site_id_15
0,2016-01-01,0.0,11329,15.6,-5.6,1,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0
1,2016-01-01,0.0,98829,15.6,-5.6,1,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0
2,2016-01-01,4.5719,72102,15.6,-5.6,1,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,2016-01-01,209.886,553210,15.6,-5.6,1,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,2016-01-01,0.0,86323,15.6,-5.6,1,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [83]:
# Number of rows whose meter reading is zero (or below).
train_test.loc[train_test['meter_reading'] <= 0., 'meter_reading'].count()

656504

In [85]:
# Keep only the rows with a non-zero meter reading.
# (To be changed later.)
nonzero_mask = train_test['meter_reading'] != 0
train_test = train_test[nonzero_mask]

In [24]:
# Make the timestamp the row index of the model data (in place).
train_test.set_index(keys='timestamp', inplace=True)


In [86]:
# Sanity check: no zero (or negative) readings should remain.
nonpositive_mask = train_test['meter_reading'] <= 0.
train_test.loc[nonpositive_mask, 'meter_reading'].count()

0

In [87]:
# Shape (rows, columns) of the data after the zero readings were removed.
train_test.shape

(3525936, 32)

In [97]:
# Directory that receives the cleaned data, taken from the config entry
# cleanedconfig -> cleanedDataV1 -> cleanedDir.
cleaned_v1_cfg = cfg['cleanedconfig']['cleanedDataV1']
cleanedpath = BASE_DIR + cleaned_v1_cfg['cleanedDir']

In [105]:
# Write the cleaned data to CSV under the file name given in the config.
cleaned_file = cleanedpath + cfg['cleanedconfig']['cleanedDataV1']['cleanedName']
train_test.to_csv(cleaned_file)

# Conclusion
LSTMs work well on time series, but care is needed so that things do not blow up.
On a very sophisticated machine, this model could work impeccably well.
ARIMA and SARIMA are also very powerful models, but they have less representational power than an LSTM.
