In [1]:
import itertools
import numpy as np
import pandas as pd
import sys
import os
from datetime import timedelta

from preprocess import read_and_preprocess_data

In [2]:
!python -m pip install pandas



In [3]:
in_file = "challenge/data/device_activations_small.csv"

In [4]:
feature_batch, label_batch, device_list = read_and_preprocess_data(in_file)
print("Feature batch: ", feature_batch.shape)
print("Label batch: ", label_batch.shape)

File challenge/data/device_activations_small.csv has 125 timesteps (hours)
Feature batch:  (6, 20, 8)
Label batch:  (6, 20, 6)


In [5]:
train_ratio = 0.9
train_len = int(train_ratio * len(feature_batch))
print("Train len: ", train_len)

train_feature_batch = feature_batch[:train_len]
test_feature_batch = feature_batch[train_len:]
train_label_batch = label_batch[:train_len]
test_label_batch = label_batch[train_len:]

print(train_feature_batch.shape)
print(test_feature_batch.shape)

Train len:  5
(5, 20, 8)
(1, 20, 8)


In [6]:
def calc_ratio_positive_outputs_per_device(labels):
    ratio_per_device = np.sum(labels, axis=0) / labels.shape[0]
    print("Percentage of positive outputs per device: ", ratio_per_device)
    return np.array(ratio_per_device)
ratio_positive_outputs_per_device = calc_ratio_positive_outputs_per_device(label_batch.reshape([-1, label_batch.shape[-1]]))

Percentage of positive outputs per device:  [0.21666667 0.23333333 0.15833333 0.13333333 0.18333333 0.05      ]


In [7]:
import matplotlib.pyplot as plt
import math
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras import objectives
from keras import backend as K

from sklearn.metrics import mean_squared_error
from math import sqrt

from bayes_opt import BayesianOptimization

Using TensorFlow backend.


In [8]:
USE_WEIGHTED_LOSS = True

In [9]:
# Create our own weighted loss to combat label imbalance
def weighted_loss(y_true, y_pred):
    out = -(y_true * K.log(y_pred + 1e-5) / ratio_positive_outputs_per_device + (1.0 - y_true) * K.log(1.0 - y_pred + 1e-5))
    return K.mean(out, axis=-1)

In [10]:
BATCH_SIZE=1
def create_model(params):
    n_outputs = len(device_list)
    model = Sequential()
    model.add(LSTM(4, batch_input_shape=(params['batch_size'], None, feature_batch.shape[-1]), return_sequences=True, stateful=True))
    model.add(Dropout(params['dropout']))
    model.add(Dense(n_outputs, activation='sigmoid'))
    model.compile(loss=weighted_loss if params['use_weighted_loss'] else 'binary_crossentropy', optimizer=params['optimizer'])
    return model

training_params = {'optimizer': 'adam', 
                   'use_weighted_loss': USE_WEIGHTED_LOSS,
                   'batch_size': BATCH_SIZE,
                   'dropout': 0.0,
                   'epochs': 50}

#model = create_model(training_params)
#model.fit(train_feature_batch, train_label_batch, epochs=training_params['epochs'], batch_size=training_params['batch_size'], verbose=1, shuffle=False)

In [11]:
test_feature_batch_flattened = test_feature_batch.reshape([-1, *test_feature_batch.shape[-1:]])
test_label_batch_flattened = test_label_batch.reshape([-1, *test_label_batch.shape[-1:]])
test_feature_batch_expanded = test_feature_batch_flattened if len(test_feature_batch_flattened.shape) == 3 else np.expand_dims(test_feature_batch_flattened, axis=1)

In [12]:
def eval_model_params(params, train_X, train_Y, test_X, test_Y):
    model = create_model(training_params)
    history = model.fit(train_X, train_Y, validation_data=(test_X, np.expand_dims(test_Y, 1)), epochs=params['epochs'], batch_size=params['batch_size'], verbose=1, shuffle=False)
    return model, history.history['val_loss'][-1]

print(test_feature_batch_expanded.shape)
model, result = eval_model_params(training_params, train_feature_batch, train_label_batch, test_feature_batch_expanded,test_label_batch_flattened)
print(result)

(20, 1, 8)
Train on 5 samples, validate on 20 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
1.6637468039989471


In [13]:
predictions = model.predict(test_feature_batch_expanded, batch_size=1)
predictions = np.squeeze(predictions)

print(predictions.shape)
print(test_label_batch_flattened.shape)


(20, 6)
(20, 6)


In [14]:
print("Training accuracy: ", np.sum(np.round(predictions) == test_label_batch_flattened) / predictions.size) 
print("Training accuracy per device:: ", np.sum(np.round(predictions) == test_label_batch_flattened, axis=0) / predictions.shape[0]) 

print("% of 1 prediction outputs", np.sum(np.round(predictions)) / predictions.size) 
print("% of 1 label outputs", np.sum(np.round(test_label_batch_flattened)) / test_label_batch_flattened.size)

Training accuracy:  0.65
Training accuracy per device::  [0.4  0.7  0.9  0.55 0.55 0.8 ]
% of 1 prediction outputs 0.39166666666666666
% of 1 label outputs 0.325


In [15]:
model.save('model.h5')

In [16]:
def test(model, in_file):
    feature_batch, label_batch, device_list = read_and_preprocess_data(in_file)
    predictions = np.concatenate(model.predict(feature_batch, batch_size=1), axis=0)
    print(np.round(predictions))
    print(np.concatenate(label_batch))

In [17]:
test(model, in_file)

File challenge/data/device_activations_small.csv has 125 timesteps (hours)
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 1. 0.]
 [1. 1. 1. 0. 1. 0.]
 [1. 1. 0. 0. 1. 0.]
 [1. 1. 0. 0. 1. 0.]
 [1. 1. 0. 0. 1. 0.]
 [1. 1. 0. 0. 1. 0.]
 [1. 1. 0. 0. 1. 0.]
 [1. 1. 0. 0. 1. 0.]
 [1. 0. 0. 0. 1. 0.]
 [1. 0. 1. 0. 1. 0.]
 [1. 0. 1. 0. 1. 0.]
 [1. 0. 1. 0. 1. 0.]
 [1. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 

In [18]:
def predict_next_24h(model, in_file):
    feature_batch, label_batch, device_list = read_and_preprocess_data(in_file, batch=False)
    print(feature_batch.shape)
    print(label_batch.shape)
    predictions = np.concatenate(model.predict(feature_batch, batch_size=1), axis=0)
    
    all_predictions = []
    
    last_features = feature_batch[-1, -1]
    last_predictions = tmp_prediction = predictions[-1]
    
    tmp_features = np.array(last_features)
    tmp_features = np.concatenate([tmp_features[:2], last_predictions])
    for i in range(24):
        print(tmp_features)
        #print(tmp_prediction)
        tmp_prediction = model.predict(np.reshape(tmp_features, [1, 1, len(tmp_features)]))
        tmp_features = np.concatenate([tmp_features[:2], tmp_prediction[0, 0]])
        
        # Increment time features
        if tmp_features[1] == 23:
            tmp_features[0] = (tmp_features[0] + 1) % 7
        tmp_features[1] = (tmp_features[1] + 1) % 24
        all_predictions += [tmp_prediction]
        
    return np.round(np.concatenate(all_predictions))

In [22]:
test_file = "challenge/data/device_activations_smaller.csv"
future_predictions = predict_next_24h(model, test_file)

File challenge/data/device_activations_smaller.csv has 101 timesteps (hours)
(1, 101, 8)
(1, 101, 6)
[1.         8.         0.77860647 0.68049347 0.67319858 0.48214319
 0.50550455 0.47616789]
[1.         9.         0.76215994 0.67184883 0.61911422 0.46631485
 0.550753   0.45250982]
[ 1.         10.          0.77873981  0.69111705  0.62019163  0.46910295
  0.55646479  0.44288379]
[ 1.         11.          0.78449249  0.70032597  0.62080002  0.47093201
  0.55876052  0.44107732]
[ 1.         12.          0.78311628  0.7005477   0.62543631  0.47316068
  0.55331606  0.44604701]
[ 1.         13.          0.7778784   0.69623584  0.62634516  0.47462222
  0.54749453  0.45005   ]
[ 1.         14.          0.76835316  0.68731016  0.62264574  0.47530323
  0.54161876  0.45259631]
[ 1.         15.          0.75672191  0.67616898  0.61688381  0.47636336
  0.53408718  0.45377833]
[ 1.         16.          0.74262661  0.66273099  0.60850155  0.47766167
  0.52539051  0.45347774]
[ 1.         17.        

In [24]:
label_file = "challenge/data/device_activations_small.csv"
model.reset_states()
feature_batch, label_batch, device_list = read_and_preprocess_data(label_file, batch=False)
label_batch = label_batch.squeeze()
print(label_batch.shape)
print(future_predictions.shape)
future_predictions = np.squeeze(future_predictions.astype(np.int64))
print(future_predictions)
print(label_batch[-24:])
future_labels = label_batch[-24:]

File challenge/data/device_activations_small.csv has 125 timesteps (hours)
(125, 6)
(24, 6)
[[1 1 1 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 0 0 0]
 [1 1 1 0 0 0]
 [1 1 1 0 0 0]
 [1 1 1 0 0 0]
 [1 1 1 0 0 0]
 [1 1 1 0 0 0]
 [0 1 0 0 0 0]
 [0 1 0 0 1 0]
 [1 1 0 0 1 0]
 [1 1 0 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 0 1 0]]
[[0 0 1 1 1 0]
 [0 0 1 1 1 0]
 [0 0 1 1 1 0]
 [0 0 1 1 1 0]
 [0 1 1 1 1 0]
 [1 1 1 1 1 0]
 [1 1 1 1 1 0]
 [1 1 1 1 1 1]
 [0 1 0 0 1 0]
 [0 1 0 0 0 0]
 [0 1 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [1 0 0 0 1 0]
 [1 1 1 0 1 0]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]


In [25]:
print(future_predictions.shape)
print(future_labels.shape)

print("Test accuracy: ", np.sum(np.round(future_predictions) == future_labels) / future_labels.size) 
print("Test accuracy per device:: ", np.sum(np.round(future_predictions) == future_labels, axis=0) / future_labels.shape[0]) 


(24, 6)
(24, 6)
Test accuracy:  0.6180555555555556
Test accuracy per device::  [0.375      0.41666667 0.625      0.58333333 0.83333333 0.875     ]
