In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

#import dask.dataframe as dd

from datetime import datetime

In [2]:
import keras 

from keras.preprocessing import sequence
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Activation, LeakyReLU
from keras.layers import Embedding
from keras.layers import Concatenate, Flatten, Reshape, Lambda
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, LSTM
from keras.utils import plot_model
from keras import backend as K

from keras.datasets import imdb

Using TensorFlow backend.


In [3]:
train = pd.read_csv("gs://123test_bucket/train.csv")

In [4]:
reading = train['meter_reading']
train['meter_reading'] = np.log1p(train['meter_reading'])

scaler1 = MinMaxScaler()
train['meter_reading'] = scaler1.fit_transform(X=np.reshape(train['meter_reading'].values, (-1, 1))).reshape(len(train),)

In [5]:
weather_train = pd.read_csv("gs://123test_bucket/weather_train.csv")
meta = pd.read_csv("gs://123test_bucket/building_metadata.csv")

train.timestamp = pd.to_datetime(train.timestamp)
weather_train.timestamp = pd.to_datetime(weather_train.timestamp)

weather_train['month'] = weather_train.timestamp.dt.month
weather_train['dayofweek'] = weather_train.timestamp.dt.dayofweek
weather_train['hour'] = weather_train.timestamp.dt.hour

train_meta_w = pd.merge(weather_train, meta, on='site_id')

train_meta_w = train_meta_w.fillna(0)

train = pd.merge(train, train_meta_w, on=['building_id', 'timestamp'], how='inner') # we have enough training data
train_meta_w = None

In [6]:
enc = OneHotEncoder(drop='first', sparse=False)
cat_cols = ['site_id', 'hour', 'dayofweek', 'month', 'primary_use', 'year_built']
other_cols = ['building_id', 'meter']
enc.fit(train[cat_cols + other_cols])

OneHotEncoder(categorical_features=None, categories=None, drop='first',
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=False)

In [7]:
enc.categories_

[array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23]),
 array([0, 1, 2, 3, 4, 5, 6]),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12]),
 array(['Education', 'Entertainment/public assembly',
        'Food sales and service', 'Healthcare', 'Lodging/residential',
        'Manufacturing/industrial', 'Office', 'Other', 'Parking',
        'Public services', 'Religious worship', 'Retail', 'Services',
        'Technology/science', 'Utility', 'Warehouse/storage'], dtype=object),
 array([   0., 1900., 1902., 1903., 1904., 1905., 1906., 1907., 1908.,
        1909., 1910., 1911., 1912., 1913., 1914., 1915., 1916., 1917.,
        1918., 1919., 1920., 1921., 1922., 1923., 1924., 1925., 1926.,
        1927., 1928., 1929., 1930., 1931., 1932., 1933., 1934., 1935.,
        1936., 1937., 1938., 1939., 1940., 1941., 1942., 1944., 1945.,
        1946., 1947., 1948., 

In [17]:
class DataGenerator(keras.utils.Sequence):
    def __init__(self, dt, num_cols, cat_names, new_cat_names, batch_size = 10):
        self.batch_size = batch_size
        self.len = len(dt)//batch_size
        self.dt = dt[num_cols + cat_names + ['meter_reading']]
        self.num_cols = num_cols
        self.cat_names = cat_names
        self.new_cat_names = new_cat_names

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        index_start = index*self.batch_size
        index_end = index_start+self.batch_size
        adf = self.dt[index_start:index_end]
        cat_vals = enc.transform(adf[self.cat_names])
        return np.concatenate([adf[self.num_cols].values, cat_vals], axis =1), adf['meter_reading']

In [9]:
num_cols = ['air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'square_feet', 'floor_count']

cat_names = cat_cols + other_cols
new_cat_names = list(enc.get_feature_names(cat_names))
col_names = num_cols + new_cat_names


In [10]:
np.concatenate([train.head()[num_cols].values, train.head()[cat_names].values], axis=1) 

array([[25.0, 6.0, 20.0, 0.0, 1019.7, 0.0, 0.0, 7432.0, 0.0, 0, 0, 4, 1,
        'Education', 2008.0, 0, 0],
       [25.0, 6.0, 20.0, 0.0, 1019.7, 0.0, 0.0, 2720.0, 0.0, 0, 0, 4, 1,
        'Education', 2004.0, 1, 0],
       [25.0, 6.0, 20.0, 0.0, 1019.7, 0.0, 0.0, 5376.0, 0.0, 0, 0, 4, 1,
        'Education', 1991.0, 2, 0],
       [25.0, 6.0, 20.0, 0.0, 1019.7, 0.0, 0.0, 23685.0, 0.0, 0, 0, 4, 1,
        'Education', 2002.0, 3, 0],
       [25.0, 6.0, 20.0, 0.0, 1019.7, 0.0, 0.0, 116607.0, 0.0, 0, 0, 4,
        1, 'Education', 1975.0, 4, 0]], dtype=object)

In [11]:
ids = train['building_id'].unique()
len_val = round(len(ids)*0.3)
ids_val = np.random.choice(ids, len_val, replace = False)

In [18]:
batch_size = 1024#1024+512
train_gen = DataGenerator(train, 
                          num_cols=num_cols, 
                          cat_names=cat_names,
                          new_cat_names = new_cat_names,
                          batch_size=batch_size)
val_gen = DataGenerator(train[train['building_id'].isin(ids_val)], 
                          num_cols=num_cols, 
                          cat_names=cat_names,
                          new_cat_names = new_cat_names,
                          batch_size=batch_size)

In [20]:
train_gen[0][0].shape

(1024, 1646)

In [21]:
dum = train.head(20)

In [22]:
enc.transform(dum[cat_names])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [23]:
enc.get_feature_names(cat_names)

array(['site_id_1', 'site_id_2', 'site_id_3', ..., 'meter_1', 'meter_2',
       'meter_3'], dtype=object)

In [24]:
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 

In [34]:
#model = Sequential()
filters = 10
ncols = train_gen[0][0].shape[1] # categorical columns
states = 20
act = 'linear'

input_shape1 = Input(shape=(ncols,))

m1 = Dense(200, activation=act)(input_shape1)
m1 = Dropout(0.4)(m1)
m1 = Dense(10, activation=act)(m1)
m1 = LeakyReLU(alpha=0.1)(m1)
m1 = Dropout(0.2)(m1)
m1 = Dense(1, activation='sigmoid')(m1)

model2 = Model(inputs = input_shape1, outputs = m1)
model2._make_predict_function()

opt = keras.optimizers.RMSprop(clipnorm=1.)

model2.compile(loss=root_mean_squared_error, optimizer=opt, metrics=['mse', 'mae', 'mape'])

In [35]:
# Train model on dataset
epochs = 1#TODO: change it back to 5
workers = 10
model2.fit_generator(generator=train_gen,
                    validation_data=val_gen, epochs=epochs, 
                    use_multiprocessing=True, workers = workers)

Epoch 1/1


<keras.callbacks.callbacks.History at 0x7f56042feb38>

In [None]:
# One hot encoding: No improvment
# First layer with 200 nodes, epoch1=0.2754
# First layer with 400 nodes, epoch1=0.2753
# First layer with 200 nodes followed by Dropout(0.4), epoch1=0.2756
# 
