In [1]:
import itertools
import numpy as np
import pandas as pd
import sys
import os
from datetime import timedelta

from preprocess import read_and_preprocess_data

In [2]:
# Path of the device-activation CSV that is handed to read_and_preprocess_data.
in_file = "challenge/data/device_activations_small.csv"
# Batch size passed to the preprocessing step and used as the model's fixed
# batch dimension (via training_params['batch_size']).
BATCH_SIZE = 16

In [3]:
# Read the CSV and obtain the feature batches, label batches and device list.
feature_batch, label_batch, device_list = read_and_preprocess_data(
    in_file,
    batch_size=BATCH_SIZE,
)

# Peek at the first two entries and the final entry of the feature batches.
for sample_idx in (0, 1, -1):
    print(feature_batch[sample_idx])

print("Feature batch: ", feature_batch.shape)
print("Label batch: ", label_batch.shape)

aaaaaaaaa 2016-07-06 09:58:22
File challenge/data/device_activations_small.csv has 125 timesteps (hours) until now
initial features shape:  (125, 8)
Full sequence length:  -400
Sequence 0 has start index 0 and end index -350
(0, 8)
Sequence 1 has start index 30 and end index -320
(0, 8)
Sequence 2 has start index 60 and end index -290
(0, 8)
Sequence 3 has start index 90 and end index -260
(0, 8)
Sequence 4 has start index 120 and end index -230
(0, 8)
Sequence 5 has start index 150 and end index -200
(0, 8)
Sequence 6 has start index 180 and end index -170
(0, 8)
Sequence 7 has start index 210 and end index -140
(0, 8)
Sequence 8 has start index 240 and end index -110
(0, 8)
Sequence 9 has start index 270 and end index -80
(0, 8)
Sequence 10 has start index 300 and end index -50
(0, 8)
Sequence 11 has start index 330 and end index -20
(0, 8)
Sequence 12 has start index 360 and end index 10
(0, 8)
Sequence 13 has start index 390 and end index 40
(0, 8)
Sequence 14 has start index 420 a

ValueError: negative dimensions are not allowed

In [None]:
# Split into a leading training part and a trailing held-out part; the
# training length is rounded down to a whole multiple of BATCH_SIZE.
train_ratio = 0.9
n_train_groups = int(train_ratio * len(feature_batch) // BATCH_SIZE)
train_len = n_train_groups * BATCH_SIZE
print("Train len: ", train_len)

train_feature_batch, test_feature_batch = feature_batch[:train_len], feature_batch[train_len:]
train_label_batch, test_label_batch = label_batch[:train_len], label_batch[train_len:]

for split_features in (train_feature_batch, test_feature_batch):
    print(split_features.shape)

In [None]:
def calc_ratio_positive_outputs_per_device(labels):
    """Compute, per column, the sum of ``labels`` divided by the number of rows.

    For 0/1 labels this is the fraction of positive rows in each column.
    The result is also printed.

    Args:
        labels: array whose first axis indexes the rows to average over.

    Returns:
        ``np.ndarray`` holding one ratio per column.
    """
    n_rows = labels.shape[0]
    column_totals = np.sum(labels, axis=0)
    positive_fraction = column_totals / n_rows
    print("Percentage of positive outputs per device: ", positive_fraction)
    return np.array(positive_fraction)
# Flatten every leading axis of label_batch (keeping the last axis) and
# measure the share of positive labels per column; weighted_loss reads this.
ratio_positive_outputs_per_device = calc_ratio_positive_outputs_per_device(
    label_batch.reshape((-1, label_batch.shape[-1]))
)

In [None]:
import matplotlib.pyplot as plt
import math
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras import objectives
from keras import backend as K

from sklearn.metrics import mean_squared_error
from math import sqrt

from bayes_opt import BayesianOptimization

In [None]:
# Switch copied into training_params['use_weighted_loss']: when True,
# create_model compiles with weighted_loss, otherwise with 'binary_crossentropy'.
USE_WEIGHTED_LOSS = True

In [None]:
# Create our own weighted loss to combat label imbalance
def weighted_loss(y_true, y_pred):
    """Cross-entropy-style loss whose positive term is re-weighted per output.

    The positive-class log term of every output is divided by the matching
    entry of the global ``ratio_positive_outputs_per_device``, so outputs that
    are rarely positive weigh more. A small constant (1e-5) is added inside
    both logarithms to avoid ``log(0)``.

    Args:
        y_true: target tensor (expected to hold 0/1 values).
        y_pred: prediction tensor with the same shape as ``y_true``.

    Returns:
        The loss averaged over the last axis.
    """
    # A column without any positive label has ratio 0; dividing by it yields
    # 0 / 0 = NaN and poisons the whole loss. Clamp the divisor to a tiny
    # floor: any non-zero ratio (at least 1 / n_rows) is far above it, so the
    # weights of all other columns are unchanged.
    safe_ratio = np.maximum(ratio_positive_outputs_per_device, 1e-7)
    positive_term = y_true * K.log(y_pred + 1e-5) / safe_ratio
    negative_term = (1.0 - y_true) * K.log(1.0 - y_pred + 1e-5)
    return K.mean(-(positive_term + negative_term), axis=-1)

In [None]:
def create_model(params):
    """Build and compile the stateful LSTM network described by ``params``.

    Args:
        params: dict providing 'lstm_units', 'batch_size', 'n_features',
            'dropout', 'n_outputs', 'use_weighted_loss' and 'optimizer'.

    Returns:
        The compiled ``Sequential`` model: LSTM -> Dropout -> sigmoid Dense.
    """
    network = Sequential()

    # Fixed batch size, open-ended time axis (None), n_features per step.
    recurrent = LSTM(
        params['lstm_units'],
        batch_input_shape=(params['batch_size'], None, params['n_features']),
        return_sequences=True,
        stateful=True,
    )
    network.add(recurrent)
    network.add(Dropout(params['dropout']))
    network.add(Dense(params['n_outputs'], activation='sigmoid'))

    if params['use_weighted_loss']:
        loss_fn = weighted_loss
    else:
        loss_fn = 'binary_crossentropy'
    network.compile(loss=loss_fn, optimizer=params['optimizer'])
    return network

# Hyper-parameters consumed by create_model and eval_model_params; the same
# dict is written to params.json further down in the notebook.
training_params = dict(
    optimizer='adam',
    use_weighted_loss=USE_WEIGHTED_LOSS,
    batch_size=BATCH_SIZE,
    dropout=0.5,
    epochs=50,
    n_outputs=len(device_list),
    n_features=feature_batch.shape[-1],
    lstm_units=256,
    devices=device_list,
)

In [None]:
# Collapse all leading axes so that every row is one feature / label vector.
test_feature_batch_flattened = test_feature_batch.reshape([-1, test_feature_batch.shape[-1]])
test_label_batch_flattened = test_label_batch.reshape([-1, test_label_batch.shape[-1]])
# The flattened features are always 2-D (the reshape above fixes the rank), so
# the former "already 3-D" check could never be true; insert a length-1 axis at
# position 1 unconditionally to obtain the 3-D array fed to model.predict.
test_feature_batch_expanded = np.expand_dims(test_feature_batch_flattened, axis=1)

In [None]:
def eval_model_params(params, train_X, train_Y, test_X, test_Y):
    """Train a fresh model configured by ``params`` and report its final validation loss.

    Args:
        params: hyper-parameter dict; forwarded to ``create_model`` and read
            here for 'epochs' and 'batch_size'.
        train_X: training inputs passed to ``model.fit``.
        train_Y: training targets passed to ``model.fit``.
        test_X: validation inputs.
        test_Y: validation targets.

    Returns:
        Tuple ``(model, val_loss)`` where ``val_loss`` is the 'val_loss' entry
        of the last epoch in the fit history.
    """
    # Build from the argument rather than the global training_params, so a
    # caller that passes a different configuration actually gets it.
    model = create_model(params)
    # shuffle=False keeps the samples in their given order during fitting.
    history = model.fit(
        train_X,
        train_Y,
        validation_data=(test_X, test_Y),
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        verbose=1,
        shuffle=False,
    )
    return model, history.history['val_loss'][-1]


# Train once with training_params and show the last-epoch validation loss.
print(training_params['batch_size'])
print(train_feature_batch.shape)
model, result = eval_model_params(
    training_params,
    train_X=train_feature_batch,
    train_Y=train_label_batch,
    test_X=test_feature_batch,
    test_Y=test_label_batch,
)
print(result)

In [None]:
# Clear the LSTM state left over from training before scoring held-out data.
model.reset_states()
print(test_feature_batch_expanded.shape)

# Predict, then drop all length-1 axes from the result.
predictions = np.squeeze(
    model.predict(test_feature_batch_expanded, batch_size=BATCH_SIZE)
)

print(predictions.shape)
print(test_label_batch_flattened.shape)

# Rounded predictions and labels, both without their last 24 rows.
print(np.round(predictions[:-24]))
print(test_label_batch_flattened[:-24])

In [None]:
# Round the predictions to 0/1 once and compare them with the flattened labels.
val_pred_rounded = np.round(predictions)
val_hits = val_pred_rounded == test_label_batch_flattened

print("Val accuracy: ", np.sum(val_hits) / predictions.size)
print("Val accuracy per device:: ", np.sum(val_hits, axis=0) / predictions.shape[0])

# Share of ones among the predictions and among the labels.
print("% of 1 prediction outputs", np.sum(val_pred_rounded) / predictions.size)
print("% of 1 label outputs", np.sum(np.round(test_label_batch_flattened)) / test_label_batch_flattened.size)

In [None]:
# Persist the trained network, then the hyper-parameter dict as JSON.
model.save('model.h5')
import json

serialized_params = json.dumps(training_params)
with open('params.json', 'w', encoding='utf-8') as params_file:
    params_file.write(serialized_params)

In [None]:
import keras.losses
# Attach the custom loss to keras.losses (presumably so that load_model can
# resolve it by name - confirm before relying on it).
keras.losses.weighted_loss = weighted_loss

# Same hyper-parameters as for training, except for a batch size of 1.
test_model_params = {**training_params, 'batch_size': 1}

# Rebuild the architecture with the new batch size and read the weights from
# the saved file. Alternative left by the author: load_model("model.h5").
test_model = create_model(test_model_params)
test_model.load_weights('model.h5')

In [None]:
def test(model, in_file):
    """Run ``model`` over the data in ``in_file`` and print predictions next to labels.

    Prints the feature and label shapes, the rounded predictions and the
    labels; nothing is returned.

    Args:
        model: network built by ``create_model`` with a batch size of 1.
        in_file: path handed to ``read_and_preprocess_data``.
    """
    test_features, test_labels, _ = read_and_preprocess_data(in_file, batch_size=1)
    # Use the data just loaded from in_file, not the notebook-level
    # feature_batch / label_batch globals.
    print(test_features.shape)
    print(test_labels.shape)
    # The LSTM is stateful: clear whatever state a previous predict/fit left
    # behind so the sequence is evaluated from a clean start.
    model.reset_states()
    predictions = np.concatenate(model.predict(test_features, batch_size=1), axis=0)
    print(np.round(predictions))
    print(test_labels)

In [None]:
# Run the batch-size-1 model over in_file (the file loaded at the top of the notebook).
test(test_model, in_file)

In [None]:
def predict_next_24h(model, in_file):
    """Roll the model forward for 24 steps beyond the data in ``in_file``.

    The model is first run over the whole file, then repeatedly fed its own
    latest prediction together with two leading time features that are
    advanced by one hour per step.

    Args:
        model: stateful network built by ``create_model`` with batch size 1.
        in_file: path handed to ``read_and_preprocess_data``.

    Returns:
        Rounded predictions of the 24 roll-out steps, concatenated along the
        first axis.
    """
    # Start from a clean LSTM state; otherwise state left by an earlier
    # predict call (e.g. on another file) would leak into this forecast.
    model.reset_states()

    feature_batch, label_batch, _ = read_and_preprocess_data(in_file, batch_size=1)
    print(feature_batch.shape)
    print(label_batch.shape)
    # Replaying the full history also leaves the LSTM state at the last step.
    predictions = np.concatenate(model.predict(feature_batch, batch_size=1), axis=0)

    all_predictions = []

    # Seed the roll-out: the first two entries of the last feature vector
    # followed by the last prediction.
    # NOTE(review): the two time entries are not advanced before the first
    # roll-out step - confirm this matches how the features are built.
    last_features = feature_batch[-1, -1]
    last_predictions = predictions[-1]
    tmp_features = np.concatenate([last_features[:2], last_predictions])
    for _ in range(24):
        print(tmp_features)
        tmp_prediction = model.predict(np.reshape(tmp_features, [1, 1, len(tmp_features)]))
        tmp_features = np.concatenate([tmp_features[:2], tmp_prediction[0, 0]])

        # Increment time features: entry 1 wraps at 24 and entry 0 wraps at 7
        # (presumably hour of day and day of week).
        if tmp_features[1] == 23:
            tmp_features[0] = (tmp_features[0] + 1) % 7
        tmp_features[1] = (tmp_features[1] + 1) % 24
        all_predictions.append(tmp_prediction)

    return np.round(np.concatenate(all_predictions))

In [None]:
# Roll the batch-size-1 model 24 steps past the data in the medium file.
test_file = "challenge/data/device_activations_medium.csv"
future_predictions = predict_next_24h(test_model, test_file)

In [None]:
# Reload labels with batch size 1 and keep the last 24 rows as the reference.
# NOTE(review): the forecast above was made from the medium file while these
# labels come from the small file - confirm that the two line up in time.
label_file = "challenge/data/device_activations_small.csv"
test_model.reset_states()
feature_batch, label_batch, device_list = read_and_preprocess_data(label_file, batch_size=1)
label_batch = np.squeeze(label_batch)
print(label_batch.shape)
print(future_predictions.shape)
# Cast to integers and drop length-1 axes.
future_predictions = future_predictions.astype(np.int64).squeeze()
print(future_predictions)
future_labels = label_batch[-24:]
print(future_labels)

In [None]:
print(future_predictions.shape)
print(future_labels.shape)

# Element-wise agreement between the rounded forecast and the reference labels.
future_hits = np.round(future_predictions) == future_labels
print("Test accuracy: ", np.sum(future_hits) / future_labels.size)
print("Test accuracy per device:: ", np.sum(future_hits, axis=0) / future_labels.shape[0])


In [None]:
# Sandbox: toy experiment that copies b[j, i] into row i * n_sequences + j,
# i.e. interleaves the per-sequence chunks mini-batch by mini-batch.
n, n_sequences, sequence_length = 128, 4, 16
a = np.arange(256)
n_minibatches = n // sequence_length // n_sequences
sequences = a.reshape(n_sequences, n_minibatches, sequence_length, 2)
b = sequences
print(b)
print(b.shape)
sequence_shift = n // sequence_length

# Feature and label buffers share the same shape.
mini_batch_features_arr_shape = [n_minibatches * n_sequences, sequence_length, 2]
mini_batch_labels_arr_shape = list(mini_batch_features_arr_shape)
mini_batch_features = np.zeros(mini_batch_features_arr_shape)
mini_batch_labels = np.zeros(mini_batch_labels_arr_shape)
for i, j in itertools.product(range(n_minibatches), range(n_sequences)):
    row = i * n_sequences + j
    mini_batch_features[row] = b[j, i]
    mini_batch_labels[row] = b[j, i]
    print(row)

print(mini_batch_features)

In [None]:
# Sandbox: enumerate i * sequence_shift + x * sequence_length + j for every
# (mini-batch x, batch entry i, step j) and show it as a 3-D table.
# NOTE(review): mini_batch_count and batch_size are not defined in any cell
# visible here - define them before running this on a fresh kernel.
indexes = []
for x, i, j in itertools.product(range(mini_batch_count), range(batch_size), range(sequence_length)):
    indexes.append(i * sequence_shift + x * sequence_length + j)
print(np.reshape(indexes, [mini_batch_count, batch_size, sequence_length]))