In [1]:
import itertools
import numpy as np
import pandas as pd
import sys
import os
from datetime import timedelta

from preprocess import read_and_preprocess_data

In [2]:
!python -m pip install pandas



In [3]:
in_file = "challenge/data/device_activations.csv"

In [4]:
feature_batch, label_batch, device_list = read_and_preprocess_data(in_file)
print("Feature batch: ", feature_batch.shape)
print("Label batch: ", label_batch.shape)

n_devices =  6
Feature batch:  (6, 20, 8)
Label batch:  (6, 20, 6)


In [5]:
train_ratio = 0.8
train_len = int(train_ratio * len(feature_batch))
print("Train len: ", train_len)

train_feature_batch = feature_batch[:train_len]
test_feature_batch = feature_batch[train_len:]
train_label_batch = label_batch[:train_len]
test_label_batch = label_batch[train_len:]

print(train_feature_batch.shape)
print(test_feature_batch.shape)

Train len:  4
(4, 20, 8)
(2, 20, 8)


In [6]:
def calc_ratio_positive_outputs_per_device(labels):
    ratio_per_device = np.sum(labels, axis=0) / labels.shape[0]
    print("Percentage of positive outputs per device: ", ratio_per_device)
    return np.array(ratio_per_device)
ratio_positive_outputs_per_device = calc_ratio_positive_outputs_per_device(label_batch.reshape([-1, label_batch.shape[-1]]))

Percentage of positive outputs per device:  [0.21666667 0.23333333 0.15833333 0.13333333 0.18333333 0.05      ]


In [7]:
import matplotlib.pyplot as plt
import math
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras import objectives
from keras import backend as K

from sklearn.metrics import mean_squared_error
from math import sqrt

from bayes_opt import BayesianOptimization

Using TensorFlow backend.


In [8]:
USE_WEIGHTED_LOSS = True

In [9]:
# Create our own weighted loss to combat label imbalance
def weighted_loss(y_true, y_pred):
    out = -(y_true * K.log(y_pred + 1e-5) / ratio_positive_outputs_per_device + (1.0 - y_true) * K.log(1.0 - y_pred + 1e-5))
    return K.mean(out, axis=-1)

In [10]:
BATCH_SIZE=1
def create_model(params):
    n_outputs = len(device_list)
    model = Sequential()
    model.add(LSTM(32, batch_input_shape=(params['batch_size'], None, feature_batch.shape[-1]), return_sequences=True, stateful=True))
    model.add(Dropout(params['dropout']))
    model.add(Dense(n_outputs, activation='sigmoid'))
    model.compile(loss=weighted_loss if params['use_weighted_loss'] else 'binary_crossentropy', optimizer=params['optimizer'])
    return model

training_params = {'optimizer': 'adam', 
                   'use_weighted_loss': USE_WEIGHTED_LOSS,
                   'batch_size': BATCH_SIZE,
                   'dropout': 0.0,
                   'epochs': 500}

#model = create_model(training_params)
#model.fit(train_feature_batch, train_label_batch, epochs=training_params['epochs'], batch_size=training_params['batch_size'], verbose=1, shuffle=False)

In [11]:
test_feature_batch_flattened = test_feature_batch.reshape([-1, *test_feature_batch.shape[-1:]])
test_label_batch_flattened = test_label_batch.reshape([-1, *test_label_batch.shape[-1:]])
test_feature_batch_expanded = test_feature_batch_flattened if len(test_feature_batch_flattened.shape) == 3 else np.expand_dims(test_feature_batch_flattened, axis=1)

In [12]:
def eval_model_params(params, train_X, train_Y, test_X, test_Y):
    model = create_model(training_params)
    history = model.fit(train_X, train_Y, validation_data=(test_X, np.expand_dims(test_Y, 1)), epochs=params['epochs'], batch_size=params['batch_size'], verbose=1, shuffle=False)
    predictions = model.predict(test_X, batch_size=1)
    return model, history.history['val_loss'][-1]

print(test_feature_batch_expanded.shape)
model, result = eval_model_params(training_params, train_feature_batch, train_label_batch, test_feature_batch_expanded,test_label_batch_flattened)
print(result)

(40, 1, 8)
Train on 4 samples, validate on 40 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
2.1450266569852827


In [13]:
print(test_feature_batch_expanded.shape)
predictions = model.predict(test_feature_batch_expanded, batch_size=1)[:, 0, :]
#print(np.round(predictions, 1))

print(predictions.shape)
print(test_label_batch_flattened.shape)


(40, 1, 8)
(40, 6)
(40, 6)


In [14]:
print("Training accuracy: ", np.sum(np.round(predictions) == test_label_batch_flattened) / predictions.size) 
print("Training accuracy per device:: ", np.sum(np.round(predictions) == test_label_batch_flattened, axis=0) / predictions.shape[0]) 

print("% of 1 prediction outputs", np.sum(np.round(predictions)) / predictions.size) 
print("% of 1 label outputs", np.sum(np.round(test_label_batch_flattened)) / test_label_batch_flattened.size)

Training accuracy:  0.5583333333333333
Training accuracy per device::  [0.2   0.55  0.525 0.7   0.525 0.85 ]
% of 1 prediction outputs 0.5958333333333333
% of 1 label outputs 0.32083333333333336


In [15]:
print(test_feature_batch_expanded.shape)
print(np.expand_dims(test_label_batch_flattened, 0).shape)
model.evaluate(np.expand_dims(test_feature_batch_flattened, 1), np.expand_dims(test_label_batch_flattened, 1))

(40, 1, 8)
(1, 40, 6)


ValueError: In a stateful network, you should only pass inputs with a number of samples that can be divided by the batch size. Found: 40 samples

In [16]:
model.save('model.h5')

In [17]:
def test(model, in_file):
    feature_batch, label_batch, device_list = read_and_preprocess_data(in_file)
    print(feature_batch.shape)
    print()
    predictions = np.concatenate(model.predict(feature_batch, batch_size=1), axis=0)
    print(np.round(predictions))
    print(np.concatenate(label_batch))

In [18]:
test(model, in_file)

n_devices =  6
(6, 20, 8)

[[1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0. 0.]
 [1. 1. 0. 1. 0. 0.]
 [1. 1. 0. 1. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0.

In [41]:
def predict_next_24h(model, in_file):
    feature_batch, label_batch, device_list = read_and_preprocess_data(in_file, batch=False)
    print(feature_batch.shape)
    predictions = np.concatenate(model.predict(feature_batch, batch_size=1), axis=0)
    
    all_predictions = []
    
    last_features = feature_batch[-1, -1]
    last_predictions = tmp_prediction = predictions[-1]
    
    tmp_features = np.array(last_features)
    tmp_features = np.concatenate([tmp_features[:2], last_predictions])
    for i in range(24):
        print(tmp_features)
        #print(tmp_prediction)
        tmp_prediction = model.predict(np.reshape(tmp_features, [1, 1, len(tmp_features)]))
        tmp_features = np.concatenate([tmp_features[:2], tmp_prediction[0, 0]])
        
        # Increment time features
        if tmp_features[1] == 23:
            tmp_features[0] = (tmp_features[0] + 1) % 7
        tmp_features[1] = (tmp_features[1] + 1) % 24
        all_predictions += [tmp_prediction]
        
    return np.round(np.concatenate(all_predictions))

In [44]:
test_file = "challenge/data/device_activations_smaller.csv"
future_predictions = predict_next_24h(model, in_file)

n_devices =  6
(126, 8)
(1, 125, 8)
(1, 125, 8)
[2.         8.         0.91190243 0.88213348 0.59595782 0.7012167
 0.62391579 0.29530293]
[2.         9.         0.91102493 0.89612156 0.55159348 0.68520343
 0.57022959 0.29278094]
[ 2.         10.          0.90740961  0.90560627  0.48719582  0.66345352
  0.51060152  0.2800338 ]
[ 2.         11.          0.90172833  0.90972906  0.42170647  0.62879521
  0.44854927  0.25896266]
[ 2.         12.          0.89519471  0.90910804  0.3573429   0.58177882
  0.39012527  0.23086329]
[ 2.         13.          0.88727748  0.90266097  0.30780712  0.5283283
  0.34144971  0.19592878]
[ 2.         14.          0.87746763  0.89020646  0.26501456  0.46236995
  0.30031362  0.16245091]
[ 2.         15.          0.86451501  0.87246168  0.22832969  0.39337948
  0.26824841  0.13266958]
[ 2.         16.          0.84727764  0.84849769  0.19901721  0.32680047
  0.24366005  0.10763361]
[ 2.         17.          0.82445002  0.81692815  0.17837429  0.26699057
  0.22

In [45]:
label_file = "challenge/data/device_activations_small.csv"
model.reset_states()
feature_batch, label_batch, device_list = read_and_preprocess_data(label_file, batch=False)
print("a")
print(future_predictions.shape)
future_predictions = future_predictions.astype(np.int64)[:, 0]
print(future_predictions)
print(label_batch[0, -24:])
future_labels = label_batch[0, -24:]

n_devices =  6
(126, 8)
(1, 125, 8)
a
(24, 1, 6)
[[1 1 1 1 1 0]
 [1 1 0 1 1 0]
 [1 1 0 1 0 0]
 [1 1 0 1 0 0]
 [1 1 0 1 0 0]
 [1 1 0 0 0 0]
 [1 1 0 0 0 0]
 [1 1 0 0 0 0]
 [1 1 0 0 0 0]
 [1 1 0 0 0 0]
 [1 1 0 0 0 0]
 [1 1 0 0 0 0]
 [1 1 0 0 0 0]
 [1 1 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 1 0 0 0 0]
 [1 1 0 0 0 0]
 [1 1 0 0 0 0]
 [1 1 0 0 0 0]
 [1 1 0 1 0 0]
 [1 1 0 1 0 0]
 [1 1 0 1 0 0]
 [1 1 0 1 0 0]]
[[0 0 1 1 1 0]
 [0 0 1 1 1 0]
 [0 0 1 1 1 0]
 [0 0 1 1 1 0]
 [0 1 1 1 1 0]
 [1 1 1 1 1 0]
 [1 1 1 1 1 0]
 [1 1 1 1 1 1]
 [0 1 0 0 1 0]
 [0 1 0 0 0 0]
 [0 1 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [1 0 0 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]


In [46]:
print("Test accuracy: ", np.sum(np.round(future_predictions) == future_labels) / future_labels.size) 
print("Test accuracy per device:: ", np.sum(np.round(future_predictions) == future_labels, axis=0) / future_labels.shape[0]) 

print("% of 1 prediction outputs", np.sum(np.round(predictions)) / predictions.size) 
print("% of 1 label outputs", np.sum(np.round(test_label_batch_flattened)) / test_label_batch_flattened.size)

Test accuracy:  0.5972222222222222
Test accuracy per device::  [0.29166667 0.5        0.58333333 0.79166667 0.54166667 0.875     ]
% of 1 prediction outputs 0.5958333333333333
% of 1 label outputs 0.32083333333333336
