In [1]:
import itertools
import numpy as np
import pandas as pd
import sys
import os
from datetime import timedelta

from preprocess import read_and_preprocess_data

In [2]:
in_file = "challenge/data/device_activations_train.csv"
val_in_file = "challenge/data/device_activations_val.csv"
test_in_file = "challenge/data/device_activations_test.csv"
BATCH_SIZE = 16
device_list=['device_1', 'device_2', 'device_3', 'device_4', 'device_5', 'device_6', 'device_7']

In [3]:
feature_batch, label_batch, device_list = read_and_preprocess_data(in_file, batch_size=BATCH_SIZE, device_list=device_list, sequence_start_shift=30)
print(feature_batch[0])
print(feature_batch[1])
print(feature_batch[-1])
print("Feature batch: ", feature_batch.shape)
print("Label batch: ", label_batch.shape)

File challenge/data/device_activations_train.csv has 1013 timesteps (hours) until now
initial features shape:  (1013, 9)
Full sequence length:  525
Sequence 0 has start index 0 and end index 525
(525, 9)
Sequence 1 has start index 30 and end index 555
(525, 9)
Sequence 2 has start index 60 and end index 585
(525, 9)
Sequence 3 has start index 90 and end index 615
(525, 9)
Sequence 4 has start index 120 and end index 645
(525, 9)
Sequence 5 has start index 150 and end index 675
(525, 9)
Sequence 6 has start index 180 and end index 705
(525, 9)
Sequence 7 has start index 210 and end index 735
(525, 9)
Sequence 8 has start index 240 and end index 765
(525, 9)
Sequence 9 has start index 270 and end index 795
(525, 9)
Sequence 10 has start index 300 and end index 825
(525, 9)
Sequence 11 has start index 330 and end index 855
(525, 9)
Sequence 12 has start index 360 and end index 885
(525, 9)
Sequence 13 has start index 390 and end index 915
(525, 9)
Sequence 14 has start index 420 and end i

In [4]:
val_features, val_labels, _ = read_and_preprocess_data(val_in_file, batch_size=1, device_list=device_list)
test_features, test_labels, _ = read_and_preprocess_data(test_in_file, batch_size=1, device_list=device_list)

File challenge/data/device_activations_val.csv has 287 timesteps (hours) until now
File challenge/data/device_activations_test.csv has 175 timesteps (hours) until now


In [5]:
def calc_ratio_positive_outputs_per_device(labels):
    ratio_per_device = np.sum(labels, axis=0) / labels.shape[0]
    print("Percentage of positive outputs per device: ", ratio_per_device)
    return np.array(ratio_per_device)
ratio_positive_outputs_per_device = calc_ratio_positive_outputs_per_device(label_batch.reshape([-1, label_batch.shape[-1]]))

Percentage of positive outputs per device:  [0.0777381  0.2475     0.16202381 0.24214286 0.23559524 0.34678571
 0.03559524]


In [101]:
import matplotlib.pyplot as plt
import math
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.regularizers import l2
from keras import objectives
from keras import backend as K
from keras import optimizers

from sklearn.metrics import mean_squared_error
from math import sqrt

from bayes_opt import BayesianOptimization

In [102]:
USE_WEIGHTED_LOSS = True

In [103]:
# Create our own weighted loss to combat label imbalance
def weighted_loss(y_true, y_pred):
    out = -(y_true * K.log(y_pred + 1e-5) / ratio_positive_outputs_per_device + (1.0 - y_true) * K.log(1.0 - y_pred + 1e-5))
    return K.mean(out, axis=-1)

In [104]:
def create_model(params):
    model = Sequential()
    model.add(LSTM(params['lstm_units'], batch_input_shape=(params['batch_size'], None, params['n_features']), return_sequences=True, stateful=True, kernel_regularizer=l2(params['reg'])))
    model.add(Dropout(params['dropout']))
    model.add(Dense(params['n_outputs'], activation='sigmoid'))
    
    adam = optimizers.Adam(lr=params['lr'])
    model.compile(loss=weighted_loss if params['use_weighted_loss'] else 'binary_crossentropy', optimizer=adam)
    return model

training_params = {'optimizer': 'adam', 
                   'use_weighted_loss': USE_WEIGHTED_LOSS,
                   'batch_size': BATCH_SIZE,
                   'dropout': 0.2,
                   'epochs': 250,
                   'n_outputs': len(device_list),
                   'n_features': feature_batch.shape[-1],
                   'lstm_units': 8,
                   'devices': device_list}

In [108]:
def predict_24h(model, features, labels):
    predictions = np.squeeze(model.predict(np.expand_dims(features, 0), batch_size=1))  # (n_timesteps, n_outputs)

    all_predictions = []

    last_features = np.squeeze(features)[-1]
    last_predictions = tmp_prediction = predictions[-1]

    tmp_features = np.array(last_features)
    tmp_features = np.concatenate([tmp_features[:2], last_predictions])
    for i in range(24):
        # print(tmp_prediction)
        tmp_prediction = np.round(model.predict(np.reshape(tmp_features, [1, 1, len(tmp_features)])))
        tmp_features = np.concatenate([tmp_features[:2], tmp_prediction[0, 0]])
        
        # Increment time features
        if tmp_features[1] == 23:
            tmp_features[0] = (tmp_features[0] + 1) % 7
        tmp_features[1] = (tmp_features[1] + 1) % 24
        all_predictions += [tmp_prediction]

    return np.concatenate(all_predictions)

def calc_accuracy(model, params, test_X, test_Y):
    # Hack around Keras batch size restriction (to have same for training/test)
    model.save('tmp_model.h5')
    test_params = dict(params)
    test_params['batch_size'] = 1
    test_model = create_model(test_params)
    test_model.load_weights('tmp_model.h5')
    os.remove('tmp_model.h5')
    n = test_X.shape[1]
    acc_accumulated = 0.0
    for i in range(48, n - 24):
        predictions = np.squeeze(predict_24h(test_model, test_X[0, :i], test_Y[0, :i]))
        true_labels = test_Y[0, i:i+24]
        acc = np.sum(np.round(predictions) == true_labels) / predictions.size
        acc_accumulated += acc
        #print("Val accuracy: ", np.sum(np.round(predictions) == true_labels) / predictions.size) 
        #print("Val accuracy per device:: ", np.sum(np.round(predictions) == true_labels, axis=0) / predictions.shape[0]) 

    return acc_accumulated / (n-26)


In [109]:
def train_model_with_params(params, train_X, train_Y):
    model = create_model(params)
    history = model.fit(train_X, train_Y, epochs=params['epochs'], batch_size=params['batch_size'], verbose=0, shuffle=False)
    return model

def eval_model_params(params, train_X, train_Y, val_X, val_Y):
    model = train_model_with_params(params, feature_batch, label_batch)
    val_acc = calc_accuracy(model, params, val_features, val_labels)
    return model, val_acc


In [112]:
gs_results = []
for lr in [1e-3, 1e-4, 1e-5]:
    for do in [0.0, 0.2, 0.5]:
        for units in [16, 32, 64, 128, 256, 512]:
            for reg in [0.0, 0.01, 0.1]:
                for n_epochs in [250]:
                    K.clear_session()
                    tmp_params = {'lr': lr, 
                                   'use_weighted_loss': USE_WEIGHTED_LOSS,
                                   'batch_size': BATCH_SIZE,
                                   'dropout': do,
                                   'epochs': n_epochs,
                                   'n_outputs': len(device_list),
                                   'n_features': feature_batch.shape[-1],
                                   'lstm_units': units,
                                   'devices': device_list,
                                   'reg': reg}
                    model, val_acc = eval_model_params(tmp_params, feature_batch, label_batch, val_features, val_labels)
                    gs_results += [(val_acc, tmp_params)]
                    print("Tmp result: ", val_acc, tmp_params)
                
for x in sorted(gs_results, key=lambda x: x[0], reverse=True):
    print("Acc: {0}, params: {1}".format(x[0], x[1]))

Tmp result:  0.6199142492245947 {'lr': 0.001, 'use_weighted_loss': True, 'batch_size': 16, 'dropout': 0.0, 'epochs': 250, 'n_outputs': 7, 'n_features': 9, 'lstm_units': 16, 'devices': ['device_1', 'device_2', 'device_3', 'device_4', 'device_5', 'device_6', 'device_7'], 'reg': 0.0}
Tmp result:  0.6267560664112389 {'lr': 0.001, 'use_weighted_loss': True, 'batch_size': 16, 'dropout': 0.0, 'epochs': 250, 'n_outputs': 7, 'n_features': 9, 'lstm_units': 16, 'devices': ['device_1', 'device_2', 'device_3', 'device_4', 'device_5', 'device_6', 'device_7'], 'reg': 0.01}
Tmp result:  0.6368363437328957 {'lr': 0.001, 'use_weighted_loss': True, 'batch_size': 16, 'dropout': 0.0, 'epochs': 250, 'n_outputs': 7, 'n_features': 9, 'lstm_units': 16, 'devices': ['device_1', 'device_2', 'device_3', 'device_4', 'device_5', 'device_6', 'device_7'], 'reg': 0.1}
Tmp result:  0.6607598978288644 {'lr': 0.001, 'use_weighted_loss': True, 'batch_size': 16, 'dropout': 0.0, 'epochs': 250, 'n_outputs': 7, 'n_features': 9

Tmp result:  0.6614896916621068 {'lr': 0.001, 'use_weighted_loss': True, 'batch_size': 16, 'dropout': 0.5, 'epochs': 250, 'n_outputs': 7, 'n_features': 9, 'lstm_units': 64, 'devices': ['device_1', 'device_2', 'device_3', 'device_4', 'device_5', 'device_6', 'device_7'], 'reg': 0.0}
Tmp result:  0.6448640758985581 {'lr': 0.001, 'use_weighted_loss': True, 'batch_size': 16, 'dropout': 0.5, 'epochs': 250, 'n_outputs': 7, 'n_features': 9, 'lstm_units': 64, 'devices': ['device_1', 'device_2', 'device_3', 'device_4', 'device_5', 'device_6', 'device_7'], 'reg': 0.01}
Tmp result:  0.6470762634555747 {'lr': 0.001, 'use_weighted_loss': True, 'batch_size': 16, 'dropout': 0.5, 'epochs': 250, 'n_outputs': 7, 'n_features': 9, 'lstm_units': 64, 'devices': ['device_1', 'device_2', 'device_3', 'device_4', 'device_5', 'device_6', 'device_7'], 'reg': 0.1}
Tmp result:  0.666324575807335 {'lr': 0.001, 'use_weighted_loss': True, 'batch_size': 16, 'dropout': 0.5, 'epochs': 250, 'n_outputs': 7, 'n_features': 9,

Tmp result:  0.6470306513409962 {'lr': 0.0001, 'use_weighted_loss': True, 'batch_size': 16, 'dropout': 0.2, 'epochs': 250, 'n_outputs': 7, 'n_features': 9, 'lstm_units': 128, 'devices': ['device_1', 'device_2', 'device_3', 'device_4', 'device_5', 'device_6', 'device_7'], 'reg': 0.1}
Tmp result:  0.5475962415617592 {'lr': 0.0001, 'use_weighted_loss': True, 'batch_size': 16, 'dropout': 0.5, 'epochs': 250, 'n_outputs': 7, 'n_features': 9, 'lstm_units': 16, 'devices': ['device_1', 'device_2', 'device_3', 'device_4', 'device_5', 'device_6', 'device_7'], 'reg': 0.0}
Tmp result:  0.614395183360701 {'lr': 0.0001, 'use_weighted_loss': True, 'batch_size': 16, 'dropout': 0.5, 'epochs': 250, 'n_outputs': 7, 'n_features': 9, 'lstm_units': 16, 'devices': ['device_1', 'device_2', 'device_3', 'device_4', 'device_5', 'device_6', 'device_7'], 'reg': 0.01}
Tmp result:  0.6250456121145779 {'lr': 0.0001, 'use_weighted_loss': True, 'batch_size': 16, 'dropout': 0.5, 'epochs': 250, 'n_outputs': 7, 'n_features

In [113]:
#Acc: 0.6819011129355954, params: {'lr': 0.001, 'use_weighted_loss': True, 'batch_size': 16, 'dropout': 0.0, 'epochs': 250, 'n_outputs': 7, 'n_features': 9, 'lstm_units': 128, 'devices': ['device_1', 'device_2', 'device_3', 'device_4', 'device_5', 'device_6', 'device_7'], 'reg': 0.01}

best_params = {'lr': 0.001, 
               'use_weighted_loss': True,
               'batch_size': BATCH_SIZE,
               'dropout': 0.0,
               'epochs': 250,
               'n_outputs': len(device_list),
               'n_features': feature_batch.shape[-1],
               'lstm_units': 512,
               'devices': device_list,
               'reg': 0.01}

model, val_acc = eval_model_params(best_params, feature_batch, label_batch, val_features, val_labels)
print("Val acc for model", val_acc)

Val acc for model 0.6841133004926107


In [115]:
val_acc = calc_accuracy(model, best_params, val_features, val_labels)
print("Val acc: ", val_acc)
test_acc = calc_accuracy(model, best_params, test_features, test_labels)
print("Test acc: ", test_acc)

Val acc:  0.6841133004926107
Test acc:  0.6369447107702141


In [13]:
model.reset_states()
print(test_feature_batch_expanded.shape)

predictions = model.predict(test_feature_batch_expanded, batch_size=BATCH_SIZE)
predictions = np.squeeze(predictions)

print(predictions.shape)
print(test_label_batch_flattened.shape)

print(np.round(predictions)[:-24])
print(test_label_batch_flattened[:-24])

NameError: name 'test_feature_batch_expanded' is not defined

In [None]:
print("Val accuracy: ", np.sum(np.round(predictions) == test_label_batch_flattened) / predictions.size) 
print("Val accuracy per device:: ", np.sum(np.round(predictions) == test_label_batch_flattened, axis=0) / predictions.shape[0]) 

print("% of 1 prediction outputs", np.sum(np.round(predictions)) / predictions.size) 
print("% of 1 label outputs", np.sum(np.round(test_label_batch_flattened)) / test_label_batch_flattened.size)

In [None]:
model.save('model.h5')
import json

with open('params.json', 'w', encoding='utf-8') as fp:
    json.dump(training_params, fp)