In [1]:
# Shell escape: run the preprocessing script with data.csv as input (-i) and
# dataset.data as output (-o). dataset.data is the file loaded further below.
!python preprocess_data.py -i data.csv -o dataset.data

Final dataset with size: | train (69600L, 10L) | test (17400L, 10L) | 


In [2]:
import numpy as np
import cPickle as cp

### Load the sensor data

In [3]:
def load_dataset(filename):
    """Load the pickled train/test split stored in `filename`.

    The pickle is expected to hold a two-element sequence:
    ``data[0] == (X_train, y_train)`` and ``data[1] == (X_test, y_test)``.

    Parameters
    ----------
    filename : str
        Path to the pickled dataset file.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` with the feature arrays cast
        to float32 and the label arrays cast to uint8.
    """
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by a trusted preprocessing run.
    # `open` + `with` replaces the Python-2-only `file()` builtin and makes
    # sure the handle is closed even if unpickling fails.
    with open(filename, 'rb') as f:
        data = cp.load(f)

    X_train, y_train = data[0]
    X_test, y_test = data[1]

    print(" ..from file {}".format(filename))
    print(" ..reading instances: train {0}, test {1}".format(X_train.shape, X_test.shape))

    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)

    # The targets are cast to uint8 (unsigned 8-bit) for GPU compatibility.
    y_train = y_train.astype(np.uint8)
    y_test = y_test.astype(np.uint8)

    return X_train, y_train, X_test, y_test

In [4]:
# Read the pickled dataset and unpack it into train/test features and labels.
print("Loading data...")
X_train, y_train, X_test, y_test = load_dataset('dataset.data')

Loading data...
 ..from file dataset.data
 ..reading instances: train (69600L, 10L), test (17400L, 10L)


In [5]:
# Turn every run of WINDOW_SIZE consecutive samples into one flat feature
# vector (WINDOW_SIZE * n_channels values) so the MLP sees a whole window at
# once. A single reshape replaces the repeated np.vstack, which re-copied the
# growing result on every iteration.
# WINDOW_SIZE must stay equal to the window length used for the labels and
# for the test data.
WINDOW_SIZE = 15
n_windows = X_train.shape[0] // WINDOW_SIZE
# Trailing samples that do not fill a complete window are dropped.
X_train_mlp = X_train[:n_windows * WINDOW_SIZE].reshape(
    n_windows, WINDOW_SIZE * X_train.shape[1])

In [6]:
# One label per window of WINDOW_SIZE consecutive samples, as an
# (n_windows, 1) column. The label is the smallest value present in the
# window (what np.unique(window)[0] returned, since np.unique sorts).
# NOTE(review): presumably every window holds a single label; if windows can
# mix labels, the smallest one wins rather than the most frequent - confirm.
# WINDOW_SIZE must stay equal to the window length used for the features.
WINDOW_SIZE = 15
n_windows = y_train.shape[0] // WINDOW_SIZE
# Trailing samples that do not fill a complete window are dropped.
y_train_mlp = y_train[:n_windows * WINDOW_SIZE].reshape(
    n_windows, WINDOW_SIZE).min(axis=1, keepdims=True)

In [7]:
# Window the test features exactly like the training features: every run of
# WINDOW_SIZE consecutive samples becomes one flat row of
# WINDOW_SIZE * n_channels values. A single reshape replaces the repeated
# np.vstack, which re-copied the growing result on every iteration.
# WINDOW_SIZE must stay equal to the window length used for the training data.
WINDOW_SIZE = 15
n_windows = X_test.shape[0] // WINDOW_SIZE
# Trailing samples that do not fill a complete window are dropped.
X_test_mlp = X_test[:n_windows * WINDOW_SIZE].reshape(
    n_windows, WINDOW_SIZE * X_test.shape[1])

In [8]:
# One label per test window of WINDOW_SIZE consecutive samples, as an
# (n_windows, 1) column. The label is the smallest value present in the
# window (what np.unique(window)[0] returned, since np.unique sorts).
# NOTE(review): presumably every window holds a single label; if windows can
# mix labels, the smallest one wins rather than the most frequent - confirm.
# WINDOW_SIZE must stay equal to the window length used for the features.
WINDOW_SIZE = 15
n_windows = y_test.shape[0] // WINDOW_SIZE
# Trailing samples that do not fill a complete window are dropped.
y_test_mlp = y_test[:n_windows * WINDOW_SIZE].reshape(
    n_windows, WINDOW_SIZE).min(axis=1, keepdims=True)

In [9]:
# Sanity check: there must be exactly one label row per training window.
y_train_mlp.shape[0] == X_train_mlp.shape[0]

True

In [10]:
# Sanity check: there must be exactly one label row per test window.
y_test_mlp.shape[0] == X_test_mlp.shape[0]

True

In [11]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [12]:
# Multi-layer perceptron with three hidden layers of 100 units, trained with
# SGD for at most 300 iterations. random_state is fixed for reproducibility.
# NOTE(review): tol is tiny, presumably so that training is not stopped early
# on small loss improvements - confirm that this is intended.
clf = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100),
    solver='sgd',
    alpha=0.0001,
    max_iter=300,
    tol=1e-9,
    random_state=21,
    verbose=1
)

In [13]:
# y_train_mlp is an (n_windows, 1) column; flatten() turns it into the 1-D
# label vector handed to fit().
clf.fit(X_train_mlp, y_train_mlp.flatten())

Iteration 1, loss = 1.39514149
Iteration 2, loss = 1.23505313
Iteration 3, loss = 1.14674317
Iteration 4, loss = 1.07030619
Iteration 5, loss = 1.00199055
Iteration 6, loss = 0.93749833
Iteration 7, loss = 0.87684962
Iteration 8, loss = 0.82123448
Iteration 9, loss = 0.77208114
Iteration 10, loss = 0.73010963
Iteration 11, loss = 0.69006401
Iteration 12, loss = 0.65411497
Iteration 13, loss = 0.62000622
Iteration 14, loss = 0.58724910
Iteration 15, loss = 0.55512142
Iteration 16, loss = 0.52527449
Iteration 17, loss = 0.49703764
Iteration 18, loss = 0.46928826
Iteration 19, loss = 0.44337329
Iteration 20, loss = 0.41912620
Iteration 21, loss = 0.39701737
Iteration 22, loss = 0.37745815
Iteration 23, loss = 0.35888171
Iteration 24, loss = 0.34178157
Iteration 25, loss = 0.32681061
Iteration 26, loss = 0.31210744
Iteration 27, loss = 0.29941361
Iteration 28, loss = 0.28816790
Iteration 29, loss = 0.27739736
Iteration 30, loss = 0.26771825
Iteration 31, loss = 0.26033256
Iteration 32, los

Iteration 253, loss = 0.03150856
Iteration 254, loss = 0.03074371
Iteration 255, loss = 0.03118407
Iteration 256, loss = 0.03056706
Iteration 257, loss = 0.03035672
Iteration 258, loss = 0.03020003
Iteration 259, loss = 0.02985763
Iteration 260, loss = 0.02988319
Iteration 261, loss = 0.02933314
Iteration 262, loss = 0.02943504
Iteration 263, loss = 0.02926089
Iteration 264, loss = 0.02911493
Iteration 265, loss = 0.02923029
Iteration 266, loss = 0.03045211
Iteration 267, loss = 0.02873045
Iteration 268, loss = 0.02856891
Iteration 269, loss = 0.02813155
Iteration 270, loss = 0.02795313
Iteration 271, loss = 0.02831821
Iteration 272, loss = 0.02768791
Iteration 273, loss = 0.02756695
Iteration 274, loss = 0.02749160
Iteration 275, loss = 0.02701242
Iteration 276, loss = 0.02717625
Iteration 277, loss = 0.02696973
Iteration 278, loss = 0.02666495
Iteration 279, loss = 0.02669303
Iteration 280, loss = 0.02681484
Iteration 281, loss = 0.02630378
Iteration 282, loss = 0.02596486
Iteration 



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=300, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=21, shuffle=True,
       solver='sgd', tol=1e-09, validation_fraction=0.1, verbose=1,
       warm_start=False)

In [14]:
# Predict one label per held-out test window.
y_pred_mlp = clf.predict(X_test_mlp)

In [15]:
# Fraction of test windows whose predicted label equals the true label.
accuracy_score(y_test_mlp.flatten(), y_pred_mlp)


0.9844827586206897

### Test Dataset

In [16]:
import os
import zipfile
import argparse
import numpy as np
import cPickle as cp
from io import BytesIO
from pandas import Series
import random

In [17]:
# Parse the comma-separated test file into a float array; genfromtxt stores
# missing or unparseable fields as NaN (handled later by interpolation).
data = np.genfromtxt('testdata.csv', delimiter=',')

In [18]:
# Number of sensor channels per sample.
# NOTE(review): not referenced by the cells visible here, which hard-code
# 10 columns (2:12) instead - confirm whether it should be used there.
NB_SENSOR_CHANNELS = 10

# Per-channel upper bounds passed to normalize() (one entry per channel).
NORM_MAX_THRESHOLDS = [200, 200, 200, 250000, 250000, 250000, 100, 100, 100, 100]

# Per-channel lower bounds passed to normalize() (one entry per channel).
NORM_MIN_THRESHOLDS = [-200, -200, -200, -250000, -250000, -250000, -100, -100, -100, -100]

In [19]:
def normalize(data, max_list, min_list):
    """Min-max scale each column of `data` in place and clamp the result.

    Column ``i`` is mapped to ``(value - min_list[i]) / (max_list[i] - min_list[i])``.
    Afterwards every value above 1 is replaced by 0.99 and every value below 0
    by 0.00. A value of exactly 1 is left untouched.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of samples x channels. It is modified in place.
    max_list, min_list : sequence of numbers
        Upper and lower bound for each column of `data`.

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in as `data`.
    """
    upper = np.array(max_list)
    lower = np.array(min_list)
    span = upper - lower

    n_columns = data.shape[1]
    for col in range(n_columns):
        shifted = data[:, col] - lower[col]
        data[:, col] = shifted / span[col]

    # Out-of-range values are clamped. Note that the upper clamp value is
    # 0.99, not 1.
    data[data > 1] = 0.99
    data[data < 0] = 0.00
    return data

In [20]:
def process_dataset_file(dataset):
    """Extract, clean and normalize the sensor columns of a raw data array.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array of raw rows; columns 2 through 11 hold the sensor channels.

    Returns
    -------
    numpy.ndarray
        Array with one row per input row and 10 columns, scaled by
        `normalize` with the module-level threshold lists.
    """
    # Only the ten sensor columns are used as features.
    features = dataset[:, 2:12]

    # Fill gaps in each channel by interpolating along the rows. Building the
    # result with np.array creates a new array, so `dataset` is not modified
    # by the steps below.
    interpolated = [Series(channel).interpolate() for channel in features.T]
    features = np.array(interpolated).T

    # Whatever is still missing after interpolation becomes zero.
    features[np.isnan(features)] = 0

    # Scale every sensor channel with its per-channel thresholds.
    return normalize(features, NORM_MAX_THRESHOLDS, NORM_MIN_THRESHOLDS)

In [21]:
def generate_data(data):
    """Run `process_dataset_file` on `data` and return the result as a new array.

    Parameters
    ----------
    data : numpy.ndarray
        Raw 2-D array as accepted by `process_dataset_file`.

    Returns
    -------
    numpy.ndarray
        The processed samples, one row per input row and 10 columns.
    """
    processed = process_dataset_file(data)
    # Stacking onto an empty (0, 10) array yields a fresh array and fails if
    # the processed data does not have exactly 10 columns.
    return np.vstack((np.empty((0, 10)), processed))

In [22]:
# Clean and normalize the rows loaded from testdata.csv.
# NOTE(review): this should mirror what preprocess_data.py did to the
# training data; that script is not visible here - confirm.
test_data = generate_data(data)

In [23]:
# Window the new data exactly like the training features: every run of
# WINDOW_SIZE consecutive samples becomes one flat row of
# WINDOW_SIZE * n_channels values. A single reshape replaces the repeated
# np.vstack, which re-copied the growing result on every iteration.
# WINDOW_SIZE must stay equal to the window length the classifier was
# trained with.
WINDOW_SIZE = 15
n_windows = test_data.shape[0] // WINDOW_SIZE
# Trailing samples that do not fill a complete window are dropped.
test_data_mlp = test_data[:n_windows * WINDOW_SIZE].reshape(
    n_windows, WINDOW_SIZE * test_data.shape[1])

In [24]:
# Predicted label for each window of the new data.
clf.predict(test_data_mlp).flatten()

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      dtype=uint8)