In [1]:
import itertools
import numpy as np
import pandas as pd
import sys
import os
from datetime import timedelta

from preprocess import read_and_preprocess_data

In [2]:
# Mini-batch size handed to the preprocessing step and reused by the model cells below.
BATCH_SIZE = 16
# CSV file that is read and preprocessed for training and validation.
in_file = "challenge/data/device_activations.csv"

In [3]:
# Read the input file and turn it into feature/label arrays plus the device list.
feature_batch, label_batch, device_list = read_and_preprocess_data(
    in_file, batch_size=BATCH_SIZE
)

# Peek at the first, second and last entries, then report the overall array shapes.
for _sample_index in (0, 1, -1):
    print(feature_batch[_sample_index])
for _caption, _batch in (("Feature batch: ", feature_batch), ("Label batch: ", label_batch)):
    print(_caption, _batch.shape)

File challenge/data/device_activations.csv has 1477 timesteps (hours) until now
initial features shape:  (1477, 9)
Full sequence length:  1400
Sequence 0 has start index 0 and end index 1400
(1400, 9)
Sequence 1 has start index 4 and end index 1404
(1400, 9)
Sequence 2 has start index 8 and end index 1408
(1400, 9)
Sequence 3 has start index 12 and end index 1412
(1400, 9)
Sequence 4 has start index 16 and end index 1416
(1400, 9)
Sequence 5 has start index 20 and end index 1420
(1400, 9)
Sequence 6 has start index 24 and end index 1424
(1400, 9)
Sequence 7 has start index 28 and end index 1428
(1400, 9)
Sequence 8 has start index 32 and end index 1432
(1400, 9)
Sequence 9 has start index 36 and end index 1436
(1400, 9)
Sequence 10 has start index 40 and end index 1440
(1400, 9)
Sequence 11 has start index 44 and end index 1444
(1400, 9)
Sequence 12 has start index 48 and end index 1448
(1400, 9)
Sequence 13 has start index 52 and end index 1452
(1400, 9)
Sequence 14 has start index 56

In [4]:
# Keep the leading part of the data for training and the tail for validation.
# The training length is rounded down to a whole number of mini-batches.
train_ratio = 0.9
n_train_batches = int(train_ratio * len(feature_batch) // BATCH_SIZE)
train_len = n_train_batches * BATCH_SIZE
print("Train len: ", train_len)

train_feature_batch, test_feature_batch = feature_batch[:train_len], feature_batch[train_len:]
train_label_batch, test_label_batch = label_batch[:train_len], label_batch[train_len:]

for _split in (train_feature_batch, test_feature_batch):
    print(_split.shape)

Train len:  1008
(1008, 20, 9)
(112, 20, 9)


In [5]:
def calc_ratio_positive_outputs_per_device(labels):
    """Print and return, per column, the column sum divided by the number of rows.

    Args:
        labels: array whose first axis indexes samples; the notebook passes a
            2-D array with one column per device.

    Returns:
        numpy array with one ratio per column of ``labels``.
    """
    n_rows = labels.shape[0]
    positives_per_device = np.sum(labels, axis=0)
    ratio_per_device = positives_per_device / n_rows
    print("Percentage of positive outputs per device: ", ratio_per_device)
    return np.array(ratio_per_device)
# Collapse every leading axis so each row is one label vector, then measure how
# often each output column is positive across the whole dataset.
ratio_positive_outputs_per_device = calc_ratio_positive_outputs_per_device(
    label_batch.reshape(-1, label_batch.shape[-1])
)

Percentage of positive outputs per device:  [0.1409375  0.25339286 0.14901786 0.17897321 0.19816964 0.32875
 0.046875  ]


In [6]:
import matplotlib.pyplot as plt
import math
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras import objectives
from keras import backend as K

from sklearn.metrics import mean_squared_error
from math import sqrt

from bayes_opt import BayesianOptimization

Using TensorFlow backend.


In [7]:
# Copied into training_params['use_weighted_loss']: True compiles the model with the
# custom weighted_loss, False with plain 'binary_crossentropy'.
USE_WEIGHTED_LOSS: bool = True

In [8]:
# Custom loss that counteracts the label imbalance between devices.
def weighted_loss(y_true, y_pred):
    """Binary cross-entropy whose positive term is up-weighted per device.

    The positive-label term is divided by ``ratio_positive_outputs_per_device``,
    so a device that is rarely positive contributes more for each positive
    label. A small constant is added inside both logarithms to avoid log(0).
    The result is averaged over the last axis.
    """
    eps = 1e-5
    positive_term = y_true * K.log(y_pred + eps) / ratio_positive_outputs_per_device
    negative_term = (1.0 - y_true) * K.log(1.0 - y_pred + eps)
    return K.mean(-(positive_term + negative_term), axis=-1)

In [9]:
def create_model(params):
    """Build and compile the stateful LSTM classifier described by ``params``.

    Args:
        params: dict providing 'lstm_units', 'batch_size', 'n_features',
            'dropout', 'n_outputs', 'use_weighted_loss' and 'optimizer'.

    Returns:
        The compiled Sequential model: LSTM -> Dropout -> sigmoid Dense.
    """
    # The batch size is fixed in the input shape because the LSTM is stateful;
    # the time axis is left as None.
    input_shape = (params['batch_size'], None, params['n_features'])
    if params['use_weighted_loss']:
        loss_fn = weighted_loss
    else:
        loss_fn = 'binary_crossentropy'

    model = Sequential()
    layers = [
        LSTM(params['lstm_units'], batch_input_shape=input_shape, return_sequences=True, stateful=True),
        Dropout(params['dropout']),
        Dense(params['n_outputs'], activation='sigmoid'),
    ]
    for layer in layers:
        model.add(layer)
    model.compile(loss=loss_fn, optimizer=params['optimizer'])
    return model

# Hyper-parameters read by create_model() and eval_model_params().
training_params = dict(
    optimizer='adam',
    use_weighted_loss=USE_WEIGHTED_LOSS,
    batch_size=BATCH_SIZE,
    dropout=0.0,
    epochs=50,
    n_outputs=len(device_list),
    n_features=feature_batch.shape[-1],
    lstm_units=32,
)

In [10]:
# Merge all leading axes so that every row is a single feature / label vector.
test_feature_batch_flattened = test_feature_batch.reshape(-1, test_feature_batch.shape[-1])
test_label_batch_flattened = test_label_batch.reshape(-1, test_label_batch.shape[-1])
# reshape(-1, n) always produces a 2-D array, so the earlier `len(shape) == 3`
# check could never be true. Insert the length-1 middle axis unconditionally to
# get the 3-D input the model's predict() call is given.
test_feature_batch_expanded = np.expand_dims(test_feature_batch_flattened, axis=1)

In [11]:
def eval_model_params(params, train_X, train_Y, test_X, test_Y):
    """Train a fresh model with ``params`` and report its final validation loss.

    Args:
        params: hyper-parameter dict; forwarded to create_model() and also read
            here for 'epochs' and 'batch_size'.
        train_X: training features passed to ``fit``.
        train_Y: training labels passed to ``fit``.
        test_X: held-out features used as validation data.
        test_Y: held-out labels used as validation data.

    Returns:
        Tuple ``(model, val_loss)`` where ``val_loss`` is the last entry of the
        'val_loss' history, i.e. the validation loss after the final epoch.
    """
    # Build from the caller's params rather than the notebook-level
    # training_params, so different parameter sets yield different models.
    model = create_model(params)
    history = model.fit(
        train_X,
        train_Y,
        validation_data=(test_X, test_Y),
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        verbose=1,
        shuffle=False,  # keep the samples in their given order
    )
    return model, history.history['val_loss'][-1]


# Train once with training_params and report the final validation loss.
print(training_params['batch_size'])
print(train_feature_batch.shape)
model, result = eval_model_params(
    training_params,
    train_X=train_feature_batch,
    train_Y=train_label_batch,
    test_X=test_feature_batch,
    test_Y=test_label_batch,
)
print(result)

16
(1008, 20, 9)
Train on 1008 samples, validate on 112 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
0.2594319986445563


In [12]:
# Clear the recurrent state left over from training before scoring the held-out data.
model.reset_states()
print(test_feature_batch_expanded.shape)

# Predict, then drop the length-1 axes from the result.
predictions = np.squeeze(
    model.predict(test_feature_batch_expanded, batch_size=BATCH_SIZE)
)

for _arr in (predictions, test_label_batch_flattened):
    print(_arr.shape)

# Show the rounded predictions next to the labels, leaving out the last 24 rows.
print(np.round(predictions)[:-24])
print(test_label_batch_flattened[:-24])

(2240, 1, 9)
(2240, 7)
(2240, 7)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]]
[[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 1. 1. ... 1. 1. 0.]
 [1. 1. 1. ... 1. 1. 0.]
 [0. 1. 1. ... 1. 1. 0.]]


In [13]:
# Round the model outputs to 0/1 and compare them element-wise with the labels.
rounded_predictions = np.round(predictions)
prediction_matches = rounded_predictions == test_label_batch_flattened

print("Val accuracy: ", np.sum(prediction_matches) / predictions.size)
print("Val accuracy per device:: ", np.sum(prediction_matches, axis=0) / predictions.shape[0])

# Share of positive entries in the predictions versus in the labels.
print("% of 1 prediction outputs", np.sum(rounded_predictions) / predictions.size)
print("% of 1 label outputs", np.sum(np.round(test_label_batch_flattened)) / test_label_batch_flattened.size)

Val accuracy:  0.8635204081632653
Val accuracy per device::  [0.81116071 0.78035714 0.80446429 0.90625    0.92410714 0.84732143
 0.97098214]
% of 1 prediction outputs 0.15803571428571428
% of 1 label outputs 0.17308673469387756


In [14]:
import json

# Persist the trained network together with the hyper-parameters it was built from.
model.save('model.h5')
with open('params.json', 'w') as params_file:
    json.dump(training_params, params_file)

In [15]:
import keras.losses

# Make the custom loss reachable as keras.losses.weighted_loss.
keras.losses.weighted_loss = weighted_loss

# Same hyper-parameters as for training, except for a batch size of one.
test_model_params = {**training_params, 'batch_size': 1}

# create_model() fixes the batch size in the input shape, so a separate
# batch-size-1 model is built and the saved weights are loaded into it.
test_model = create_model(test_model_params)
test_model.load_weights('model.h5')

In [16]:
def test(model, in_file):
    test_features, test_labels, device_list = read_and_preprocess_data(in_file, batch_size=1)
    print(feature_batch.shape)
    print(label_batch.shape)
    predictions = np.concatenate(model.predict(feature_batch, batch_size=1), axis=0)
    print(np.round(predictions))
    print(label_batch)

In [17]:
# Sanity check: run the batch-size-1 copy of the model over the training file.
test(test_model, in_file)

File challenge/data/device_activations.csv has 1477 timesteps (hours) until now
(1120, 20, 9)
(1120, 20, 7)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 1. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 1. 1. ... 0. 0. 0.]
  [0. 1. 1. ... 0. 0. 0.]
  [0. 1. 1. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 1. 1. ... 0. 0. 0.]
  [0. 1. 1. ... 0. 0. 0.]
  [0. 1. 1. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 1. 1. ... 1. 1. 0.]
  [1. 1. 1. ... 1. 1. 0.]
  [0. 1. 1. ... 1. 1. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 

In [18]:
def _advance_one_hour(time_features):
    """Return a new ``[day, hour]`` array moved forward by one hour.

    The hour wraps modulo 24 and the day modulo 7 (presumably day-of-week and
    hour-of-day, judging by these moduli).
    """
    day, hour = time_features
    if hour == 23:
        day = (day + 1) % 7
    return np.array([day, (hour + 1) % 24], dtype=float)


def predict_next_24h(model, in_file, hours=24):
    """Forecast the hours following the data in ``in_file`` autoregressively.

    The model is first run over the whole recorded sequence. Its last output is
    then fed back as the device part of the next input, hour after hour, while
    the two leading time features are advanced by one hour per step.

    Args:
        model: stateful model exposing ``reset_states`` and ``predict``, built
            for a batch size of 1.
        in_file: path handed to read_and_preprocess_data().
        hours: number of future steps to forecast (defaults to 24).

    Returns:
        Rounded predictions, one ``predict`` result per forecast step,
        concatenated along the first axis.

    Raises:
        ValueError: if ``hours`` is smaller than 1.
    """
    if hours < 1:
        raise ValueError("hours must be at least 1")

    feature_batch, label_batch, _ = read_and_preprocess_data(in_file, batch_size=1)
    print(feature_batch.shape)
    print(label_batch.shape)

    # A stateful model keeps its recurrent state between predict() calls, so
    # clear whatever earlier calls left behind before replaying this sequence.
    model.reset_states()
    predictions = np.concatenate(model.predict(feature_batch, batch_size=1), axis=0)

    # The first two features are the time features; the remaining ones are
    # replaced by the model's own outputs. Copy so feature_batch is never modified.
    time_features = np.array(feature_batch[-1, -1][:2], dtype=float)
    device_inputs = predictions[-1]

    all_predictions = []
    for _ in range(hours):
        # Advance the clock BEFORE predicting: this input is the hour after the
        # previous one, so it must not reuse the previous hour's time features.
        time_features = _advance_one_hour(time_features)
        step_input = np.concatenate([time_features, device_inputs])
        step_prediction = model.predict(
            np.reshape(step_input, [1, 1, len(step_input)]), batch_size=1
        )
        all_predictions.append(step_prediction)
        device_inputs = step_prediction[0, 0]

    return np.round(np.concatenate(all_predictions))

In [19]:
# NOTE(review): in the recorded run this call raised a ValueError: this file produced
# 8 input features while the model was built for 9, so the feature counts must match.
test_file = "challenge/data/device_activations_medium.csv"
future_predictions = predict_next_24h(test_model, test_file)

File challenge/data/device_activations_medium.csv has 316 timesteps (hours) until now
(1, 316, 8)
(1, 316, 6)


ValueError: Error when checking input: expected lstm_2_input to have shape (None, 9) but got array with shape (316, 8)

In [20]:
label_file = "challenge/data/device_activations_small.csv"
test_model.reset_states()
# NOTE(review): this rebinds the notebook-level feature_batch, label_batch and
# device_list, so cells above see different data if they are re-run afterwards.
feature_batch, label_batch, device_list = read_and_preprocess_data(label_file, batch_size=1)
label_batch = np.squeeze(label_batch)
print(label_batch.shape)
print(future_predictions.shape)
# Drop the length-1 axes and convert to integers for display.
future_predictions = np.squeeze(future_predictions.astype(np.int64))
print(future_predictions)
# The last 24 label rows are what the forecast is compared against.
future_labels = label_batch[-24:]
print(future_labels)

File challenge/data/device_activations_small.csv has 125 timesteps (hours) until now
(125, 6)


NameError: name 'future_predictions' is not defined

In [None]:
for _arr in (future_predictions, future_labels):
    print(_arr.shape)

# Element-wise agreement between the rounded forecast and the labels.
future_matches = np.round(future_predictions) == future_labels
print("Test accuracy: ", np.sum(future_matches) / future_labels.size)
print("Test accuracy per device:: ", np.sum(future_matches, axis=0) / future_labels.shape[0])


In [None]:
# Sandbox: interleave several parallel sequences into consecutive mini-batches.
n = 128
n_sequences = 4
sequence_length = 16
a = np.arange(256)
n_minibatches = n // sequence_length // n_sequences
b = sequences = a.reshape(n_sequences, n_minibatches, sequence_length, 2)
print(b)
print(b.shape)
sequence_shift = n // sequence_length

mini_batch_features_arr_shape = [n_minibatches * n_sequences, sequence_length, 2]
mini_batch_labels_arr_shape = [n_minibatches * n_sequences, sequence_length, 2]
mini_batch_features = np.zeros(mini_batch_features_arr_shape)
mini_batch_labels = np.zeros(mini_batch_labels_arr_shape)
# Row i * n_sequences + j receives chunk i of sequence j, so each run of
# n_sequences rows holds the same chunk position of every sequence.
for i, j in itertools.product(range(n_minibatches), range(n_sequences)):
    row = i * n_sequences + j
    mini_batch_features[row] = b[j, i]
    mini_batch_labels[row] = b[j, i]
    print(row)

print(mini_batch_features)

In [None]:
# NOTE(review): mini_batch_count and batch_size are not defined anywhere in the
# visible notebook, so this cell only runs if they are left over in the kernel.
# Builds, for every mini-batch x and sequence i, the run of sequence_length
# consecutive indices starting at i * sequence_shift + x * sequence_length.
indexes = []
for x in range(mini_batch_count):
    for i in range(batch_size):
        for j in range(sequence_length):
            indexes.append(i * sequence_shift + x * sequence_length + j)
print(np.reshape(indexes, [mini_batch_count, batch_size, sequence_length]))