In [1]:
import itertools
import numpy as np
import pandas as pd
import sys
import os
from datetime import timedelta

from preprocess import read_and_preprocess_data

In [2]:
!python -m pip install pandas



In [3]:
in_file = "challenge/data/device_activations_medium.csv"
BATCH_SIZE = 16

In [4]:
feature_batch, label_batch, device_list = read_and_preprocess_data(in_file, batch_size=BATCH_SIZE)
print("Feature batch: ", feature_batch.shape)
print("Label batch: ", label_batch.shape)

File challenge/data/device_activations_medium.csv has 316 timesteps (hours)
initial features shape:  (316, 8)
Full sequence length:  240
Sequence 0 has start index 0 and end index 240
(240, 8)
Sequence 1 has start index 4 and end index 244
(240, 8)
Sequence 2 has start index 8 and end index 248
(240, 8)
Sequence 3 has start index 12 and end index 252
(240, 8)
Sequence 4 has start index 16 and end index 256
(240, 8)
Sequence 5 has start index 20 and end index 260
(240, 8)
Sequence 6 has start index 24 and end index 264
(240, 8)
Sequence 7 has start index 28 and end index 268
(240, 8)
Sequence 8 has start index 32 and end index 272
(240, 8)
Sequence 9 has start index 36 and end index 276
(240, 8)
Sequence 10 has start index 40 and end index 280
(240, 8)
Sequence 11 has start index 44 and end index 284
(240, 8)
Sequence 12 has start index 48 and end index 288
(240, 8)
Sequence 13 has start index 52 and end index 292
(240, 8)
Sequence 14 has start index 56 and end index 296
(240, 8)
Sequen

In [5]:
train_ratio = 0.9
train_len = int(train_ratio * len(feature_batch) // BATCH_SIZE) * BATCH_SIZE
print("Train len: ", train_len)

train_feature_batch = feature_batch[:train_len]
test_feature_batch = feature_batch[train_len:]
train_label_batch = label_batch[:train_len]
test_label_batch = label_batch[train_len:]

print(train_feature_batch.shape)
print(test_feature_batch.shape)

Train len:  160
(160, 20, 8)
(32, 20, 8)


In [6]:
def calc_ratio_positive_outputs_per_device(labels):
    ratio_per_device = np.sum(labels, axis=0) / labels.shape[0]
    print("Percentage of positive outputs per device: ", ratio_per_device)
    return np.array(ratio_per_device)
ratio_positive_outputs_per_device = calc_ratio_positive_outputs_per_device(label_batch.reshape([-1, label_batch.shape[-1]]))

Percentage of positive outputs per device:  [0.26848958 0.19713542 0.24817708 0.22890625 0.31614583 0.1       ]


In [7]:
import matplotlib.pyplot as plt
import math
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras import objectives
from keras import backend as K

from sklearn.metrics import mean_squared_error
from math import sqrt

from bayes_opt import BayesianOptimization

Using TensorFlow backend.


In [8]:
USE_WEIGHTED_LOSS = True

In [9]:
# Create our own weighted loss to combat label imbalance
def weighted_loss(y_true, y_pred):
    out = -(y_true * K.log(y_pred + 1e-5) / ratio_positive_outputs_per_device + (1.0 - y_true) * K.log(1.0 - y_pred + 1e-5))
    return K.mean(out, axis=-1)

In [10]:
def create_model(params):
    n_outputs = len(device_list)
    model = Sequential()
    model.add(LSTM(8, batch_input_shape=(params['batch_size'], None, feature_batch.shape[-1]), return_sequences=True, stateful=True))
    model.add(Dropout(params['dropout']))
    model.add(Dense(n_outputs, activation='sigmoid'))
    model.compile(loss=weighted_loss if params['use_weighted_loss'] else 'binary_crossentropy', optimizer=params['optimizer'])
    return model

training_params = {'optimizer': 'adam', 
                   'use_weighted_loss': USE_WEIGHTED_LOSS,
                   'batch_size': BATCH_SIZE,
                   'dropout': 0.0,
                   'epochs': 25}

#model = create_model(training_params)
#model.fit(train_feature_batch, train_label_batch, epochs=training_params['epochs'], batch_size=training_params['batch_size'], verbose=1, shuffle=False)

In [11]:
test_feature_batch_flattened = test_feature_batch.reshape([-1, *test_feature_batch.shape[-1:]])
test_label_batch_flattened = test_label_batch.reshape([-1, *test_label_batch.shape[-1:]])
test_feature_batch_expanded = test_feature_batch_flattened if len(test_feature_batch_flattened.shape) == 3 else np.expand_dims(test_feature_batch_flattened, axis=1)

In [12]:
def eval_model_params(params, train_X, train_Y, test_X, test_Y):
    model = create_model(training_params)
    history = model.fit(train_X, train_Y, validation_data=(test_X, test_Y), epochs=params['epochs'], batch_size=params['batch_size'], verbose=1, shuffle=False)
    return model, history.history['val_loss'][-1]

model, result = eval_model_params(training_params, train_feature_batch, train_label_batch, test_feature_batch, test_label_batch)
print(result)

Train on 160 samples, validate on 32 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
0.8844384253025055


In [14]:
predictions = model.predict(test_feature_batch_expanded, batch_size=BATCH_SIZE)
predictions = np.squeeze(predictions)

print(predictions.shape)
print(test_label_batch_flattened.shape)


(640, 6)
(640, 6)


In [15]:
print("Val accuracy: ", np.sum(np.round(predictions) == test_label_batch_flattened) / predictions.size) 
print("Val accuracy per device:: ", np.sum(np.round(predictions) == test_label_batch_flattened, axis=0) / predictions.shape[0]) 

print("% of 1 prediction outputs", np.sum(np.round(predictions)) / predictions.size) 
print("% of 1 label outputs", np.sum(np.round(test_label_batch_flattened)) / test_label_batch_flattened.size)

Val accuracy:  0.434375
Val accuracy per device::  [0.3765625 0.35      0.4578125 0.665625  0.3953125 0.3609375]
% of 1 prediction outputs 0.7942708333333334
% of 1 label outputs 0.23125


In [16]:
model.save('model.h5')

In [28]:
import keras.losses
keras.losses.weighted_loss = weighted_loss

test_model_params = dict(training_params)
test_model_params['batch_size'] = 1

#test_model = load_model("model.h5)
test_model = create_model(test_model_params)
test_model.load_weights('model.h5')

In [29]:
def test(model, in_file):
    test_features, test_labels, device_list = read_and_preprocess_data(in_file, batch_size=1)
    print(feature_batch.shape)
    print(label_batch.shape)
    predictions = np.concatenate(model.predict(feature_batch, batch_size=1), axis=0)
    print(np.round(predictions))
    print(np.concatenate(label_batch))

In [30]:
test(test_model, in_file)

File challenge/data/device_activations_medium.csv has 316 timesteps (hours)
(192, 20, 8)
(192, 20, 6)
[[1. 0. 0. 0. 1. 1.]
 [1. 0. 0. 0. 1. 0.]
 [1. 1. 1. 0. 1. 0.]
 ...
 [1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1.]]
[[0 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 1 0 1 0]
 ...
 [1 0 0 0 0 0]
 [1 0 1 1 1 0]
 [1 0 1 1 1 0]]


In [34]:
def predict_next_24h(model, in_file):
    feature_batch, label_batch, device_list = read_and_preprocess_data(in_file, batch_size=1)
    print(feature_batch.shape)
    print(label_batch.shape)
    predictions = np.concatenate(model.predict(feature_batch, batch_size=1), axis=0)
    
    all_predictions = []
    
    last_features = feature_batch[-1, -1]
    last_predictions = tmp_prediction = predictions[-1]
    
    tmp_features = np.array(last_features)
    tmp_features = np.concatenate([tmp_features[:2], last_predictions])
    for i in range(24):
        print(tmp_features)
        #print(tmp_prediction)
        tmp_prediction = model.predict(np.reshape(tmp_features, [1, 1, len(tmp_features)]))
        tmp_features = np.concatenate([tmp_features[:2], tmp_prediction[0, 0]])
        
        # Increment time features
        if tmp_features[1] == 23:
            tmp_features[0] = (tmp_features[0] + 1) % 7
        tmp_features[1] = (tmp_features[1] + 1) % 24
        all_predictions += [tmp_prediction]
        
    return np.round(np.concatenate(all_predictions))

In [35]:
test_file = "challenge/data/device_activations_smaller.csv"
future_predictions = predict_next_24h(test_model, test_file)

File challenge/data/device_activations_smaller.csv has 101 timesteps (hours)
(1, 101, 8)
(1, 101, 6)
[1.         8.         0.55332178 0.69843179 0.76699567 0.70377851
 0.76194447 0.6802941 ]
[1.         9.         0.57329249 0.73396206 0.78556907 0.73067397
 0.7740925  0.70703465]
[ 1.         10.          0.57637829  0.74729389  0.79342312  0.74379557
  0.7784043   0.7187081 ]
[ 1.         11.          0.57657576  0.75145435  0.79409629  0.74891824
  0.777798    0.72364622]
[ 1.         12.          0.57666266  0.75197977  0.79097897  0.75042915
  0.77467412  0.72589231]
[ 1.         13.          0.5773052   0.75040632  0.78513825  0.75009733
  0.76955068  0.72684294]
[ 1.         14.          0.58032584  0.74715793  0.77442849  0.74717814
  0.76080287  0.72820425]
[ 1.         15.          0.58450484  0.74134254  0.75970739  0.74305487
  0.7490412   0.72863734]
[ 1.         16.          0.58981699  0.73251843  0.74017614  0.73743552
  0.73362011  0.72811431]
[ 1.         17.        

In [39]:
label_file = "challenge/data/device_activations_small.csv"
test_model.reset_states()
feature_batch, label_batch, device_list = read_and_preprocess_data(label_file, batch_size=1)
label_batch = label_batch.squeeze()
print(label_batch.shape)
print(future_predictions.shape)
future_predictions = np.squeeze(future_predictions.astype(np.int64))
print(future_predictions)
print(label_batch[-24:])
future_labels = label_batch[-24:]

File challenge/data/device_activations_small.csv has 125 timesteps (hours)
(125, 6)
(24, 1, 6)
[[1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]
[[0 0 1 1 1 0]
 [0 0 1 1 1 0]
 [0 0 1 1 1 0]
 [0 0 1 1 1 0]
 [0 1 1 1 1 0]
 [1 1 1 1 1 0]
 [1 1 1 1 1 0]
 [1 1 1 1 1 1]
 [0 1 0 0 1 0]
 [0 1 0 0 0 0]
 [0 1 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [1 0 0 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]


In [40]:
print(future_predictions.shape)
print(future_labels.shape)

print("Test accuracy: ", np.sum(np.round(future_predictions) == future_labels) / future_labels.size) 
print("Test accuracy per device:: ", np.sum(np.round(future_predictions) == future_labels, axis=0) / future_labels.shape[0]) 


(24, 6)
(24, 6)
Test accuracy:  0.375
Test accuracy per device::  [0.29166667 0.41666667 0.45833333 0.41666667 0.54166667 0.125     ]


In [41]:
# Sandbox
a = np