# Time Series Prediction


In [1]:
# @ Lukman copyright 
# MIT Licence

In [2]:
# for data frame analysis
import pandas as pd 

# for mathematical operations
import numpy as np 

# imports below are for plotly 
import ipywidgets as widgets
import plotly as py
import plotly.graph_objs as go
py.offline.init_notebook_mode(connected=True)   # for offline mode use
import plotly.figure_factory as ff
import plotly.offline as offline


# matplotlib library for plotting
import matplotlib.pyplot as plt
%matplotlib inline

# For Normalizing data
from sklearn.preprocessing import StandardScaler

# For statistical test
import scipy.stats as stats

# Split data set into training and test set
from sklearn.model_selection import train_test_split as tts

# SVM (support vector machine) module
from sklearn import svm

# Kernel Functions used 
from sklearn.metrics.pairwise import rbf_kernel,laplacian_kernel

# module for chi square test
from scipy.stats import chisquare


# For dictionary 
from collections import defaultdict

# for use of tensorflow
import tensorflow as tf
#from tensorflow.nn.rnn import *
from tensorflow.python.ops  import *

# for scaling arrays
from sklearn.preprocessing import MaxAbsScaler,MinMaxScaler


# for random sampling of validation set
import random

# config file
import yaml
import os

# auto reload
%load_ext autoreload

%autoreload 2

In [41]:
################################################################################
# Load the project configuration (data directories and file names) from the
# YAML file located one directory above the working directory.
CONFIG_PATH = "../config.yml"
try:
    with open(CONFIG_PATH, 'r') as ymlfile:
        cfg = yaml.safe_load(ymlfile)
except IOError:
    # Every later cell indexes into `cfg`. Swallowing the error here would only
    # postpone the failure to a confusing NameError, so report it and re-raise.
    print('config file is required. Put config.yml in the parent directory '
          'of the working directory (expected at ../config.yml)')
    raise
################################################################################

In [4]:
# This is where the data is housed.
# We need to set the base directory in order to
# work from there.
cfg['rawdatapath']['TrainDir']

'Dataset/trainData/'

In [5]:
# Directory the notebook kernel is running from.
cwd = os.getcwd()
# The base directory is the parent of the working directory. Joining with ''
# leaves a trailing path separator, so plain string concatenation works later.
parent_dir = os.path.dirname(cwd)
BASE_DIR = os.path.join(parent_dir, '')

In [6]:
# Path of the training data file: base directory + training directory + file
# name, the last two taken from the config. The config key used for the file
# name selects which training data set is loaded.
raw_paths = cfg['rawdatapath']
tranDataPath = BASE_DIR + raw_paths['TrainDir'] + raw_paths['trainDatasetName2']

In [8]:
# Validation data path (currently disabled).
# The validation file is expected in the same directory as the training data;
# uncomment the line below to build its path from the config.
# validDataPath = BASE_DIR + cfg['rawdatapath']['TrainDir']+cfg['rawdatapath']['validationDataName']

# Data Preprocessing 

In [9]:
# Load the training data set selected above from its feather file.
# Only the meter-0 data set is loaded here; the same syntax applies to the
# other data sets.
meterOneDataLOaded = pd.read_feather(tranDataPath)
# The data sets for the other meter types could be analysed in the same manner.

In [10]:
# Peek at the second and third rows (positions 1 and 2) of the loaded frame.
meterOneDataLOaded.iloc[1:3]

Unnamed: 0.1,Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
1,1,1,0,2016-01-01 00:00:00,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
2,2,2,0,2016-01-01 00:00:00,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.7,0.0,0.0


In [11]:
# Remove the leftover 'Unnamed: 0' column from the loaded frame (in place).
meterOneDataLOaded.drop(columns=['Unnamed: 0'], inplace=True)

In [12]:
# Names of the columns in the loaded frame.
list(meterOneDataLOaded.columns)

['building_id',
 'meter',
 'timestamp',
 'meter_reading',
 'site_id',
 'primary_use',
 'square_feet',
 'year_built',
 'floor_count',
 'air_temperature',
 'cloud_coverage',
 'dew_temperature',
 'precip_depth_1_hr',
 'sea_level_pressure',
 'wind_direction',
 'wind_speed']

In [13]:
# Keep the column names as a plain Python list.
columns = [*meterOneDataLOaded.columns]

In [14]:
# Number of missing values in each column of the data set.
meterOneDataLOaded.isna().sum()

building_id                 0
meter                       0
timestamp                   0
meter_reading               0
site_id                     0
primary_use                 0
square_feet                 0
year_built            6470035
floor_count           9096083
air_temperature         47325
cloud_coverage        5329652
dew_temperature         49091
precip_depth_1_hr     2513679
sea_level_pressure    1018383
wind_direction         678715
wind_speed              66795
dtype: int64

In [15]:
# Convert the timestamp column to pandas datetime values.
meterOneDataLOaded['timestamp'] =  pd.to_datetime(meterOneDataLOaded['timestamp'])


In [16]:
# List the distinct years that appear in the timestamp column.
stamp_index = pd.DatetimeIndex(meterOneDataLOaded['timestamp'])
stamp_index.year.unique()

Int64Index([2016], dtype='int64', name='timestamp')

In [17]:
# Data type of every column after the timestamp conversion.
meterOneDataLOaded.dtypes

building_id                    int64
meter                          int64
timestamp             datetime64[ns]
meter_reading                float64
site_id                        int64
primary_use                   object
square_feet                    int64
year_built                   float64
floor_count                  float64
air_temperature              float64
cloud_coverage               float64
dew_temperature              float64
precip_depth_1_hr            float64
sea_level_pressure           float64
wind_direction               float64
wind_speed                   float64
dtype: object

In [18]:
# Re-check the null counts per column to pick features with few missing values.
# NOTE(review): the original note named wind speed as having the fewest nulls,
# but air_temperature and dew_temperature have fewer and are the weather
# columns actually selected for the model below — confirm intent.
meterOneDataLOaded.isnull().sum()

building_id                 0
meter                       0
timestamp                   0
meter_reading               0
site_id                     0
primary_use                 0
square_feet                 0
year_built            6470035
floor_count           9096083
air_temperature         47325
cloud_coverage        5329652
dew_temperature         49091
precip_depth_1_hr     2513679
sea_level_pressure    1018383
wind_direction         678715
wind_speed              66795
dtype: int64

In [19]:
# Make the timestamp column the index of the frame (modifies the frame in place).
# The trailing ';' suppresses any cell output.
meterOneDataLOaded.set_index('timestamp',inplace=True);


In [20]:
# Show the second dew-temperature reading (position 1) together with its timestamp.
meterOneDataLOaded['dew_temperature'].iloc[1:2]

timestamp
2016-01-01    20.0
Name: dew_temperature, dtype: float64

#### Model 


In [59]:
# Work on an independent copy holding only the columns needed downstream.
# A subset is used to keep the frame small; use the full data set if memory
# is available and the system is fast.
model_columns = [
    'meter_reading',
    'square_feet',
    'air_temperature',
    'primary_use',
    'site_id',
    'dew_temperature',
]
train_test = meterOneDataLOaded[model_columns].copy()


In [60]:

# Move the timestamp index (level 0) back into an ordinary column, in place.
# The frame's index is then plain integers instead of time.
train_test.reset_index(level=0, inplace=True)


In [61]:
# Distinct years present in the working copy; only 2016 is expected, as in
# the full frame.
subset_stamps = pd.DatetimeIndex(train_test['timestamp'])
subset_stamps.year.unique()

Int64Index([2016], dtype='int64', name='timestamp')

In [62]:
# Extract the calendar features year, month, day and hour from the timestamp.
# The DatetimeIndex is built once and reused: converting the whole timestamp
# column separately for every feature repeats the same work four times.
timestamp_index = pd.DatetimeIndex(train_test['timestamp'])
train_test['year'] = timestamp_index.year
train_test['month'] = timestamp_index.month
train_test['day'] = timestamp_index.day
train_test['hour'] = timestamp_index.hour

In [63]:
# Count the missing values per column of the working copy.
train_test.isna().sum()

timestamp              0
meter_reading          0
square_feet            0
air_temperature    47325
primary_use            0
site_id                0
dew_temperature    49091
year                   0
month                  0
day                    0
hour                   0
dtype: int64

In [64]:
# Check the data type of every column in the working copy.
train_test.dtypes

timestamp          datetime64[ns]
meter_reading             float64
square_feet                 int64
air_temperature           float64
primary_use                object
site_id                     int64
dew_temperature           float64
year                        int64
month                       int64
day                         int64
hour                        int64
dtype: object

In [65]:
# Forward-fill missing weather readings: the value at the previous timestamp
# should be close to the missing one.
# Consecutive rows share a timestamp but belong to different buildings, so a
# plain row-wise forward fill copies the reading of whichever building precedes
# the gap (possibly at another site). Filling within each site keeps the value
# tied to that site's own previous reading.
# Assumes rows are in chronological order within each site — TODO confirm.
weather_columns = ['air_temperature', 'dew_temperature']
site_filled = train_test.groupby('site_id')[weather_columns].ffill()
# Gaps at the very start of a site's series have no earlier reading at that
# site; fall back to the row-wise fill so no additional nulls are left behind.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection, which is unreliable under pandas Copy-on-Write, and
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
train_test[weather_columns] = site_filled.ffill()

In [66]:
# The year is not used as a model feature, so remove that column (in place).
train_test.drop(columns=['year'], inplace=True)

In [67]:
# Store the building area as floating point instead of integer.
train_test = train_test.astype({'square_feet': float})


In [68]:
# One-hot encode the two categorical columns (building use and site).
categorical_columns = ["primary_use", "site_id"]
train_test = pd.get_dummies(train_test, columns=categorical_columns)

In [69]:
# Preview the first rows of the encoded frame.
train_test.head()

Unnamed: 0,timestamp,meter_reading,square_feet,air_temperature,dew_temperature,month,day,hour,primary_use_Education,primary_use_Entertainment/public assembly,...,site_id_6,site_id_7,site_id_8,site_id_9,site_id_10,site_id_11,site_id_12,site_id_13,site_id_14,site_id_15
0,2016-01-01,0.0,7432.0,25.0,20.0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2016-01-01,0.0,2720.0,25.0,20.0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2016-01-01,0.0,5376.0,25.0,20.0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2016-01-01,0.0,23685.0,25.0,20.0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2016-01-01,0.0,116607.0,25.0,20.0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Count the rows whose meter reading is zero or negative.
non_positive = train_test['meter_reading'] <= 0.
train_test.loc[non_positive, 'meter_reading'].count()

530169

In [70]:
# Frame shape (rows, columns) before the zero readings are removed.
train_test.shape

(12060910, 40)

In [71]:
# Keep only the rows with a non-zero meter reading.
# This filtering rule is to be changed later.
has_reading = train_test['meter_reading'].ne(0)
train_test = train_test[has_reading]

In [72]:
# Frame shape (rows, columns) after the zero readings have been removed.
train_test.shape

(11530741, 40)

In [78]:
# Set the timestamp column as the index (modifies the frame in place).
# The trailing ';' suppresses any cell output.
train_test.set_index('timestamp',inplace=True);


In [74]:
# Verify that no zero or negative meter readings remain in the data set.
still_non_positive = train_test['meter_reading'] <= 0.
train_test.loc[still_non_positive, 'meter_reading'].count()

0

In [80]:
# Move the timestamp index back into a column so the frame has a default
# integer index again (modifies the frame in place).
# NOTE(review): presumably needed because to_feather below requires a default
# index — confirm.
train_test.reset_index(inplace=True)

In [81]:
# Directory for the cleaned data: the configured directory placed under BASE_DIR.
cleaned_cfg = cfg['cleanedconfig']['cleanedDataV2']
cleanedpath = BASE_DIR + cleaned_cfg['cleanedDir']

In [None]:
# Preview the cleaned frame before saving. Only the first rows are shown,
# matching the earlier preview cell, rather than rendering the whole frame.
train_test.head()

In [82]:
# Write the cleaned frame to a feather file inside the cleaned-data directory;
# the file name comes from the config.
cleaned_file = cleanedpath + cfg['cleanedconfig']['cleanedDataV2']['cleanedName']
train_test.to_feather(cleaned_file)