In [2]:
# for data frame analysis
import pandas as pd 

# for mathematical operations
import numpy as np 

# imports below are for plotly 
import ipywidgets as widgets
import plotly as py
import plotly.graph_objs as go
py.offline.init_notebook_mode(connected=True)   # for offline mode use
import plotly.figure_factory as ff
import plotly.offline as offline


# matplotlib library for plotting
import matplotlib.pyplot as plt
%matplotlib inline

# For Normalizing data
from sklearn.preprocessing import StandardScaler

# For statistical test
import scipy.stats as stats

# Split data set into training and test set
from sklearn.model_selection import train_test_split as tts

# SVM module
from sklearn import svm

# Kernel Functions used 
from sklearn.metrics.pairwise import rbf_kernel,laplacian_kernel

# module for chi square test
from scipy.stats import chisquare


# For dictionary 
from collections import defaultdict

# for use of tensorflow
import tensorflow as tf
#from tensorflow.nn.rnn import *
from tensorflow.python.ops  import *

# for scaling arrays
from sklearn.preprocessing import MaxAbsScaler,MinMaxScaler

import random

# config file
import yaml
import os

# auto reload
%load_ext autoreload

%autoreload 2
# for random sampling of validation set
import random

In [67]:
################################################################################
# Load the YAML configuration; later cells read the output directory and the
# output file names from `cfg`. The file is opened one level above the
# working directory.
try:
    with open("../config.yml", 'r') as ymlfile:
        cfg = yaml.safe_load(ymlfile)
except IOError:
    # Without `cfg` every later cell that builds a path fails with a
    # NameError, so report the location that was tried and stop here.
    print('config file is required. Put config.yml in the parent of the current directory')
    raise
################################################################################

In [38]:
# working directory of the running kernel
cwd = os.getcwd()
# BASE_DIR is the parent of the working directory; joining with '' appends a
# trailing path separator so further names can be concatenated directly
parent_dir = os.path.dirname(cwd)
BASE_DIR = os.path.join(parent_dir, '')
# directory the cleaned test outputs are written to, taken from the config
target_dir = cfg['cleanedtest']['TargetDir']
testpath = BASE_DIR + target_dir

In [4]:
# Load the leak data set from a feather file in the working directory.
# Only meter 0 is analysed in this notebook (the rows are filtered in a later
# cell); the other meter types could be analysed in the same manner.
testdata= pd.read_feather('leak.feather')

In [9]:
testdata.head() # view the first rows

Unnamed: 0,building_id,meter,meter_reading,timestamp
0,0,0.0,0.0,2016-01-01
1,1,0.0,0.0,2016-01-01
2,2,0.0,0.0,2016-01-01
3,3,0.0,0.0,2016-01-01
4,4,0.0,0.0,2016-01-01


In [21]:
# dtypes of the columns in the loaded frame
testdata.dtypes

building_id               int64
meter                   float64
meter_reading           float64
timestamp        datetime64[ns]
year                      int64
dtype: object

In [5]:
# derive the calendar year of each reading; 2016 is dropped in a later cell
# because it is in the train data
testdata['year'] = testdata['timestamp'].dt.year

In [6]:
# keep only the readings from 2017 and 2018
in_test_years = testdata['year'].isin([2017, 2018])
cleantest = pd.DataFrame(testdata.loc[in_test_years])


In [7]:
# keep only the rows for meter 0
is_meter_zero = cleantest['meter'] == 0.0
cleantest2 = pd.DataFrame(cleantest.loc[is_meter_zero])

In [8]:
# the intermediate frame is no longer needed (cleantest2 holds the filtered rows); free it
del cleantest

In [9]:
# confirm that only 2017 and 2018 remain
cleantest2.year.unique()

array([2017, 2018])

In [10]:
# confirm that only meter 0 is present
cleantest2.meter.unique()

array([0.])

In [11]:
# read the building metadata (joined to the readings on building_id in the next cell)
building = pd.read_csv('building_metadata.csv')

In [12]:
# attach the building metadata to every reading; a left join keeps all readings
cleantest2 = pd.merge(cleantest2, building, how='left', on='building_id')

In [13]:
# weather observations that accompany the test data
weathertest = pd.read_csv('weather_test.csv')
# the timestamp column is read as type object; convert it to datetime so it
# can be used as a join key against the readings
parsed_timestamps = pd.to_datetime(weathertest['timestamp'])
weathertest = weathertest.assign(timestamp=parsed_timestamps)

In [14]:
# attach the weather observation of the same site and timestamp to every
# reading; a left join keeps readings that have no matching observation
weather_keys = ['site_id', 'timestamp']
cleantest2 = pd.merge(cleantest2, weathertest, how='left', on=weather_keys)

In [15]:
# list the columns available after both merges
cleantest2.columns.tolist()

['building_id',
 'meter',
 'meter_reading',
 'timestamp',
 'year',
 'site_id',
 'primary_use',
 'square_feet',
 'year_built',
 'floor_count',
 'air_temperature',
 'cloud_coverage',
 'dew_temperature',
 'precip_depth_1_hr',
 'sea_level_pressure',
 'wind_direction',
 'wind_speed']

In [16]:
# keep the column names as a plain Python list
columns = cleantest2.columns.tolist()

In [17]:
# check which years are present in the timestamp column
pd.DatetimeIndex(cleantest2['timestamp']).year.unique()

Int64Index([2017, 2018], dtype='int64', name='timestamp')

In [35]:
# count the missing values in every column; among the weather columns,
# air_temperature, wind_speed and dew_temperature have the fewest gaps
# (see the output below)
# don't rerun this, it takes time
cleantest2.isnull().sum()

building_id                 0
meter                       0
meter_reading               0
timestamp                   0
year                        0
site_id                     0
primary_use                 0
square_feet                 0
year_built            1046356
floor_count           5217517
air_temperature         32489
cloud_coverage        3472356
dew_temperature         33058
precip_depth_1_hr     1690572
sea_level_pressure     136913
wind_direction         282431
wind_speed              32923
dtype: int64

In [37]:
# spot-check a single dew_temperature value (slice 1:2 selects the second row)
cleantest2['dew_temperature'][1:2]

timestamp
2017-01-01    11.7
Name: dew_temperature, dtype: float64

In [18]:
# columns carried forward into the final test set
selected_columns = [
    'timestamp', 'building_id', 'meter_reading', 'square_feet',
    'air_temperature', 'primary_use', 'site_id', 'dew_temperature',
]
# copy so that finaltest is independent of cleantest2, which is deleted next
finaltest = cleantest2[selected_columns].copy()



In [19]:
# cleantest2 is no longer needed (finaltest holds a copy of the selected columns); free it
del cleantest2

In [45]:

# reset the index so that time is no longer the index
# the index is now  numbers 
#finaltest.reset_index(level=0, inplace=True)



In [20]:
# derive month, day-of-month and hour features from the reading timestamp
timestamp_parts = finaltest['timestamp'].dt
finaltest['month'] = timestamp_parts.month
finaltest['day'] = timestamp_parts.day
finaltest['hour'] = timestamp_parts.hour

In [21]:
# count the missing values per column of the final test set
finaltest.isnull().sum()

timestamp              0
building_id            0
meter_reading          0
square_feet            0
air_temperature    32489
primary_use            0
site_id                0
dew_temperature    33058
month                  0
day                    0
hour                   0
dtype: int64

In [22]:
# check the dtype of every column
finaltest.dtypes

timestamp          datetime64[ns]
building_id                 int64
meter_reading             float64
square_feet                 int64
air_temperature           float64
primary_use                object
site_id                     int64
dew_temperature           float64
month                       int64
day                         int64
hour                        int64
dtype: object

##### Data Artefacts Added

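Some site ids are missing from this data set, so one-hot encoding would
produce fewer feature columns.
To keep the full set of columns, fake (placeholder) rows are added for the missing values;
they are removed again before the data is saved, so they are skipped during prediction
of values. That is why they have been assigned an impossible square_feet value of -4000000.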


In [24]:
# Site ids that do not occur in this data set. One placeholder ("artefact")
# row is added per id so that one-hot encoding of site_id creates a column
# for each of them.
sites_id_not_in_data =  [3,5,6,7,8,9,10,11,12,13,14]

# timestamp components for the placeholder rows; `day` advances by one per row
year = 2019
month = 1  # january
day = 1
hour = 2

# Collect the rows first and insert them in one step. Enlarging the frame
# with `.loc[-1]`, shifting the index and re-sorting inside the loop sorts
# the whole frame once per added row.
artefact_rows = []
for i in sites_id_not_in_data:
    # Values follow the column order of finaltest: timestamp, building_id,
    # meter_reading, square_feet, air_temperature, primary_use, site_id,
    # dew_temperature, month, day, hour.
    # square_feet = -4000000. is the marker used later to remove these rows.
    # NOTE(review): the timestamp is built with hour 12 while the `hour`
    # column receives 2; harmless because the rows are removed before saving.
    artefact_rows.append(
        [pd.Timestamp(year, month, day, 12), 2, 0., -4000000., 0., 'Education', i, 0., month, day, hour]
    )
    day += 1

# Same layout as the row-by-row insertion: placeholder rows ahead of the real
# data, most recently added row first, index renumbered from 0.
# (finaltest carries a 0..n-1 index at this point, so renumbering is equivalent
# to the former shift-by-one per row.)
artefacts = pd.DataFrame(artefact_rows[::-1], columns=finaltest.columns)
finaltest = pd.concat([artefacts, finaltest], ignore_index=True)

In [25]:
# shape after adding the placeholder rows
finaltest.shape

(7512486, 11)

In [26]:
# every site id should now be present
finaltest.site_id.unique()

array([14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  3,  0,  1,  2,  4, 15])

In [16]:
#finaltest = pd.read_csv('zerodaytest.csv')

In [27]:
# list the primary_use categories present; 'Manufacturing/industrial' and
# 'Services' do not occur in this data set
finaltest.primary_use.unique()

array(['Education', 'Lodging/residential', 'Office',
       'Entertainment/public assembly', 'Other', 'Retail', 'Parking',
       'Public services', 'Warehouse/storage', 'Food sales and service',
       'Religious worship', 'Healthcare', 'Utility', 'Technology/science'],
      dtype=object)

In [29]:
# Primary-use categories that do not occur in this data set. One placeholder
# ("artefact") row is added per category so that one-hot encoding of
# primary_use creates a column for each of them.
pry_id_use_in_data =  ['Manufacturing/industrial','Services']

# timestamp components for the placeholder rows; `day` advances by one per row
year = 2019
month = 1  # january
day = 19
hour = 2

# Collect the rows first and insert them in one step. Enlarging the frame
# with `.loc[-1]`, shifting the index and re-sorting inside the loop sorts
# the whole frame once per added row.
artefact_rows = []
for i in pry_id_use_in_data:
    # Values follow the column order of finaltest: timestamp, building_id,
    # meter_reading, square_feet, air_temperature, primary_use, site_id,
    # dew_temperature, month, day, hour.
    # square_feet = -4000000. is the marker used later to remove these rows.
    # NOTE(review): the timestamp is built with hour 12 while the `hour`
    # column receives 2; harmless because the rows are removed before saving.
    artefact_rows.append(
        [pd.Timestamp(year, month, day, 12), 2, 0., -4000000., 0., i, 15, 0., month, day, hour]
    )
    day += 1

# Same layout as the row-by-row insertion: placeholder rows ahead of the
# existing rows, most recently added row first, index renumbered from 0.
# (finaltest carries a 0..n-1 index at this point, so renumbering is equivalent
# to the former shift-by-one per row.)
artefacts = pd.DataFrame(artefact_rows[::-1], columns=finaltest.columns)
finaltest = pd.concat([artefacts, finaltest], ignore_index=True)

In [52]:
# set time as index
#finaltest.set_index('timestamp',inplace=True);

In [54]:
# save as csv
#finaltest.to_csv('zerodaytest.csv')


In [30]:
# columns before one-hot encoding
finaltest.columns

Index(['timestamp', 'building_id', 'meter_reading', 'square_feet',
       'air_temperature', 'primary_use', 'site_id', 'dew_temperature', 'month',
       'day', 'hour'],
      dtype='object')

In [31]:
# one-hot encode the two categorical columns; each is replaced by one
# indicator column per distinct value
finaltest = pd.get_dummies(finaltest, columns=["primary_use","site_id"])

In [57]:
# check the shape after one-hot encoding
# (meter_reading is the target; it is removed in a later cell)
finaltest.shape

(7512488, 38)

## Final data Storage

In [48]:
# Forward-fill missing temperatures: a gap takes the value of the preceding
# row, on the assumption that consecutive readings have similar temperatures.
# The result is assigned back to the column. Calling
# `fillna(..., inplace=True)` on a selected column is a chained in-place
# update that does not modify the frame under pandas Copy-on-Write, and
# `fillna(method=...)` is deprecated in favour of `ffill()`.
# NOTE(review): the fill follows row order and is not grouped by site or
# building, and the placeholder rows (temperature 0.) are still present at
# this point - confirm that this is acceptable.
finaltest['air_temperature'] = finaltest['air_temperature'].ffill()
finaltest['dew_temperature'] = finaltest['dew_temperature'].ffill()

In [49]:
# drop the placeholder rows again; they carry the square_feet marker value
is_artefact = finaltest['square_feet'] == -4000000.
finaltest = finaltest[~is_artefact]

In [50]:
# keep the true meter readings together with their building id and timestamp
reading_columns = ['meter_reading', 'building_id', 'timestamp']
test_readings = finaltest[reading_columns].copy()


In [51]:
# write the true readings as csv; the target is testpath concatenated with
# the file name from the config
test_readings.to_csv(testpath +  cfg['cleanedtest']['testDatasetName'])

In [56]:
# test_readings has been written to disk; free it
del test_readings

In [57]:
# remove the building_id column from finaltest
del finaltest['building_id']

In [61]:
# renumber the rows from 0 after the placeholder rows were removed
# NOTE(review): without drop=True the previous index is kept as a new column
# named 'index'. That column is written to the feather file below and, as
# only timestamp and meter_reading are deleted afterwards, it is also part of
# the array passed to Standardize - confirm that this is intended.
# Re-running this cell adds a further index column.
finaltest.reset_index(inplace=True)

In [62]:
# save the test feature set in feather format; the target is testpath
# concatenated with the file name from the config
finaltest.to_feather(testpath +  cfg['cleanedtest']['testFeatureSet'])

# Special cleaning for LSTM Model

In [63]:
# the LSTM input keeps neither the timestamp nor the meter readings
lstm_excluded = ['timestamp', 'meter_reading']
finaltest = finaltest.drop(columns=lstm_excluded)

In [64]:
def Standardize(array):
    '''
    Scale each column (feature) of ``array`` with a MaxAbsScaler.

    The scaler is fitted on ``array`` itself and then applied to it, i.e.
    every column is divided by its own maximum absolute value.

    Returns the scaled data as a numpy array.
    '''
    scaled = MaxAbsScaler().fit_transform(array)
    return np.array(scaled)

In [65]:
# scale the remaining feature columns with Standardize (MaxAbsScaler);
# the result is a numpy array
lstmtest =  Standardize(finaltest)




In [71]:

np.save(testpath + cfg['cleanedtest']['LSTMtestFeatureSet'] , lstmtest) # save the scaled array in numpy format
# to load it again: new_num_arr = np.load('data.npy')