In [1]:
import numpy as np
import pandas as pd

import keras
from keras.preprocessing.sequence import TimeseriesGenerator

import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
# Location of the training CSV; named so the path is easy to find and repoint.
train_csv_uri = "gs://123test_bucket/train.csv"
train = pd.read_csv(train_csv_uri)

#### Scaling by np.log1p

In [3]:
# Apply log1p to the target column.
# The untransformed readings are stashed in 'meter_reading_raw' the first time
# this cell runs, and the transform always starts from that copy, so running
# the cell again does not apply log1p a second time.
if 'meter_reading_raw' not in train.columns:
    train['meter_reading_raw'] = train['meter_reading']
train['meter_reading'] = np.log1p(train['meter_reading_raw'])

#### Applying TimeseriesGenerator to the ASHRAE training data

One difference between the example above and our situation is that we have multiple time series: one for each building and each meter in the building. So, we will need to modify the code a little bit.

Below, we check how many meters exist in the dataset.

In [4]:
# Number of distinct (building_id, meter) pairs, i.e. how many separate series there are.
meter_pairs = train.loc[:, ['building_id', 'meter']].drop_duplicates()
meter_pairs.shape[0]

2380

The code below has been taken from this stackoverflow answer with some modifications:
https://stackoverflow.com/questions/55116638/use-keras-timeseriesgenerator-function-to-generate-squence-group-by-some-id/55118459#55118459

The modification is that, after the data is subset by building ID, each subset is further split by meter type, so every (building, meter) pair gets its own series.

Further reading about modifying keras generator classes can be found below:
https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly

In [5]:
# https://stackoverflow.com/questions/55116638/use-keras-timeseriesgenerator-function-to-generate-squence-group-by-some-id/55118459#55118459
# https://keras.io/preprocessing/sequence/
class DataGenerator(keras.utils.Sequence):
    """Serve windowed batches from many independent meter series as one Sequence.

    One ``TimeseriesGenerator`` is built per (building_id, meter) pair so that
    no window spans two different series. The batches of all per-series
    generators are then exposed under a single flat batch index.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must contain the columns ``building_id``, ``meter`` and
        ``meter_reading``. Rows are used in the order given; presumably they
        are already in time order within each series -- verify against the
        caller.
    length : int
        Number of consecutive readings in each input window.
    batch_size : int
        Number of windows per batch, passed through to ``TimeseriesGenerator``.

    Notes
    -----
    Series with ``length`` readings or fewer cannot produce a single
    (window, target) pair and are skipped instead of raising.
    """

    def __init__(self, dt, length = 168, batch_size = 10):
        self.tgs = list()
        # Group on the ids that are actually present instead of filtering the
        # whole frame once per integer in [min, max]. Buildings are visited in
        # ascending id order and meters in order of first appearance, which is
        # the same order the explicit loops used.
        for _, building_rows in dt.groupby('building_id', sort=True):
            for _, meter_rows in building_rows.groupby('meter', sort=False):
                readings = meter_rows['meter_reading'].values
                # TimeseriesGenerator needs at least one reading after a full
                # window and raises ValueError otherwise, so skip short series.
                if len(readings) <= length:
                    continue
                self.tgs.append(
                    TimeseriesGenerator(readings, readings, length, batch_size=batch_size)
                )

        # Flat lookup tables: global batch k lives in generator idx_j[k]
        # at local batch position idx_i[k].
        self.idx_i = list()
        self.idx_j = list()
        for j, tg in enumerate(self.tgs):
            n_batches = len(tg)
            self.idx_i.extend(range(n_batches))
            self.idx_j.extend([j] * n_batches)
        self.len = len(self.idx_i)

    def __len__(self):
        """Return the total number of batches across all per-series generators."""
        return self.len

    def __getitem__(self, index):
        """Return batch ``index`` by delegating to the generator that owns it."""
        return self.tgs[self.idx_j[index]][self.idx_i[index]]



Based on a manual check it was found that there were 12 unique meters in the train_sub dataset.

In our case we want to use 24*7 = 168 timesteps, representing 7 days of 24 hourly readings. We can experiment with the batch size, but we use 10 here for a short example.

#### Training and Validation Generators

Using 30% of the sampled building IDs as validation data. If more data is needed, we should consider adding more randomly selected buildings.

In [6]:
# Distinct building ids, in order of first appearance in the training data.
ids = pd.unique(train['building_id'])

In [43]:
# keeping only 5% of the ids
# A fixed seed makes the sampled subset identical on every Restart & Run All.
subset_rng = np.random.RandomState(42)
len_sub = round(len(ids)*0.05)
ids_sub = subset_rng.choice(ids, len_sub, replace = False)

In [44]:
# Share of the sampled building ids that is held out for validation.
val_fraction = 0.3
len_val = round(val_fraction * len(ids_sub))

In [45]:
# Seeded so the train/validation split is the same on every run.
val_rng = np.random.RandomState(43)
ids_val = val_rng.choice(ids_sub, len_val, replace = False)

In [46]:
# Training ids are whatever remains of the sample once the validation ids are removed.
ids_train = np.setdiff1d(ar1=ids_sub, ar2=ids_val)

In [47]:
# The two splits together must account for every sampled id.
n_split = len(ids_train) + len(ids_val)
assert n_split == len(ids_sub)

In [48]:
# Build one generator per split from the rows of the buildings in that split.
length = 7 * 24  # window size: 7 days of 24 readings
batch_size = 10

in_train = train['building_id'].isin(ids_train)
in_val = train['building_id'].isin(ids_val)

train_gen = DataGenerator(train.loc[in_train], length=length, batch_size=batch_size)
val_gen = DataGenerator(train.loc[in_val], length=length, batch_size=batch_size)
                                

### Model Creation (dense network; the Conv1D layers imported below are not used)

In [49]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

Following the tutorial here:
https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/

In [50]:
# define the keras model
# The input width is tied to `length` (the window size the generators were
# built with) instead of a separate literal 168, so the two cannot drift apart.
model = Sequential()
model.add(Dense(12, input_dim=length, activation='relu'))
model.add(Dense(8, activation='relu'))
# Single linear unit: the network predicts one continuous value per window.
model.add(Dense(1, activation='linear'))

In [51]:
# compile the keras model
# 'accuracy' is a classification metric and is not meaningful for a continuous
# target trained with mean squared error; report mean absolute error instead.
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])

In [55]:
# Train model on dataset
# No epochs argument is passed, so Keras uses its default number of passes.
fit_options = {
    'generator': train_gen,
    'validation_data': val_gen,
    'use_multiprocessing': True,
    'workers': 6,
}
model.fit_generator(**fit_options)


Epoch 1/1


<keras.callbacks.callbacks.History at 0x7f1ab9561518>