In [1]:
!python preprocess_data.py -i data.dat -o dataset.data

Final dataset with size: | train (50400L, 10L) | test (12600L, 10L) | 


In [2]:
import numpy as np
import cPickle as cp

### Load the sensor data

In [3]:
def load_dataset(filename):
    """Load a pickled train/test split and cast it to compact dtypes.

    The pickle must hold an indexable object whose item 0 unpacks to
    ``(X_train, y_train)`` and whose item 1 unpacks to ``(X_test, y_test)``.

    Parameters
    ----------
    filename : str
        Path of the pickle file to read.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` with the feature arrays cast
        to float32 and the target arrays cast to uint8.
    """
    # SECURITY: unpickling can execute arbitrary code - only load trusted files.
    # `open` inside `with` replaces the Python-2-only `file()` builtin and
    # guarantees the handle is closed even if unpickling raises.
    with open(filename, 'rb') as f:
        data = cp.load(f)

    X_train, y_train = data[0]
    X_test, y_test = data[1]

    print(" ..from file {}".format(filename))
    print(" ..reading instances: train {0}, test {1}".format(X_train.shape, X_test.shape))

    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)

    # The targets are cast to uint8 (unsigned 8-bit), so label values must fit in 0..255.
    y_train = y_train.astype(np.uint8)
    y_test = y_test.astype(np.uint8)

    return X_train, y_train, X_test, y_test

In [4]:
# Unpickle dataset.data and unpack the four arrays returned by load_dataset.
print("Loading data...")
X_train, y_train, X_test, y_test = load_dataset('dataset.data')

Loading data...
 ..from file dataset.data
 ..reading instances: train (50400L, 10L), test (12600L, 10L)


In [5]:
# Flatten every 15 consecutive rows of X_train into one feature vector
# (15 rows x n_columns -> one row of 15 * n_columns values, in row order).
# A single reshape replaces growing the result with np.vstack inside a loop,
# which re-copied the whole accumulated array on every iteration, and it
# always yields a 2-D array even when there is only one window.
# Raises ValueError when the row count is not a multiple of 15.
# The result may share memory with X_train, so do not modify either in place.
X_train_kat = X_train.reshape(-1, 15 * X_train.shape[1])

In [6]:
# One label per 15-sample window, stored as a column vector of shape (n_windows, 1).
# np.unique returns sorted values, so [0] picks the smallest label in the window.
# NOTE(review): presumably every window holds a single label - confirm upstream.
# Building the array once avoids the repeated np.vstack copies and gives a
# proper (1, 1) array, not a bare scalar, when there is only one window.
y_train_kat = np.array(
    [np.unique(y_train[start:start + 15])[0]
     for start in range(0, y_train.shape[0], 15)]
).reshape(-1, 1)

In [7]:
# Flatten every 15 consecutive rows of X_test into one feature vector
# (15 rows x n_columns -> one row of 15 * n_columns values, in row order).
# A single reshape replaces growing the result with np.vstack inside a loop,
# which re-copied the whole accumulated array on every iteration, and it
# always yields a 2-D array even when there is only one window.
# Raises ValueError when the row count is not a multiple of 15.
# The result may share memory with X_test, so do not modify either in place.
X_test_kat = X_test.reshape(-1, 15 * X_test.shape[1])

In [8]:
# One label per 15-sample window, stored as a column vector of shape (n_windows, 1).
# np.unique returns sorted values, so [0] picks the smallest label in the window.
# NOTE(review): presumably every window holds a single label - confirm upstream.
# Building the array once avoids the repeated np.vstack copies and gives a
# proper (1, 1) array, not a bare scalar, when there is only one window.
y_test_kat = np.array(
    [np.unique(y_test[start:start + 15])[0]
     for start in range(0, y_test.shape[0], 15)]
).reshape(-1, 1)

In [9]:
# Sanity check: the windowed training labels and features have the same number of rows.
len(y_train_kat) == len(X_train_kat)

True

In [10]:
# Sanity check: the windowed test labels and features have the same number of rows.
len(y_test_kat) == len(X_test_kat)

True

In [11]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [12]:
# Three hidden layers of 100 units, trained with plain SGD and a fixed seed.
# max_iter is 1000 so that a fresh run reproduces the recorded fit output:
# the estimator repr shown after fitting reports max_iter=1000 and the loss
# log runs past iteration 284, which a cap of 200 iterations cannot produce.
clf = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100),
    solver='sgd',
    alpha=0.0001,
    max_iter=1000,
    tol=1e-9,
    random_state=21,
    verbose=1,
)

In [13]:
# Train on the windowed features; the label column vector is passed as a 1-D array.
clf.fit(X_train_kat, y_train_kat.ravel())

Iteration 1, loss = 1.30273423
Iteration 2, loss = 1.03505989
Iteration 3, loss = 0.96722282
Iteration 4, loss = 0.90772786
Iteration 5, loss = 0.85501034
Iteration 6, loss = 0.80590836
Iteration 7, loss = 0.76036640
Iteration 8, loss = 0.71891263
Iteration 9, loss = 0.68132020
Iteration 10, loss = 0.64683641
Iteration 11, loss = 0.61537391
Iteration 12, loss = 0.58617416
Iteration 13, loss = 0.55959849
Iteration 14, loss = 0.53547315
Iteration 15, loss = 0.51380886
Iteration 16, loss = 0.49325368
Iteration 17, loss = 0.47397960
Iteration 18, loss = 0.45607853
Iteration 19, loss = 0.43920198
Iteration 20, loss = 0.42321740
Iteration 21, loss = 0.40814168
Iteration 22, loss = 0.39380710
Iteration 23, loss = 0.38012214
Iteration 24, loss = 0.36696654
Iteration 25, loss = 0.35467338
Iteration 26, loss = 0.34312745
Iteration 27, loss = 0.33173138
Iteration 28, loss = 0.32120958
Iteration 29, loss = 0.31145335
Iteration 30, loss = 0.30179286
Iteration 31, loss = 0.29253439
Iteration 32, los

Iteration 255, loss = 0.03711301
Iteration 256, loss = 0.03693919
Iteration 257, loss = 0.03675196
Iteration 258, loss = 0.03660014
Iteration 259, loss = 0.03644858
Iteration 260, loss = 0.03627801
Iteration 261, loss = 0.03615099
Iteration 262, loss = 0.03592963
Iteration 263, loss = 0.03579690
Iteration 264, loss = 0.03570999
Iteration 265, loss = 0.03586783
Iteration 266, loss = 0.03543364
Iteration 267, loss = 0.03542136
Iteration 268, loss = 0.03518881
Iteration 269, loss = 0.03506187
Iteration 270, loss = 0.03498656
Iteration 271, loss = 0.03476905
Iteration 272, loss = 0.03455870
Iteration 273, loss = 0.03463323
Iteration 274, loss = 0.03443644
Iteration 275, loss = 0.03418078
Iteration 276, loss = 0.03434500
Iteration 277, loss = 0.03405731
Iteration 278, loss = 0.03371604
Iteration 279, loss = 0.03363327
Iteration 280, loss = 0.03352132
Iteration 281, loss = 0.03348851
Iteration 282, loss = 0.03344232
Iteration 283, loss = 0.03327198
Iteration 284, loss = 0.03352101
Iteration 

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=21, shuffle=True,
       solver='sgd', tol=1e-09, validation_fraction=0.1, verbose=1,
       warm_start=False)

In [14]:
# Predict one label per windowed test row with the fitted classifier.
y_pred_kat = clf.predict(X_test_kat)

In [15]:
# Fraction of windows whose predicted label equals the reference label.
accuracy_score(y_test_kat.ravel(), y_pred_kat)


0.986904761904762

### Test Dataset

In [16]:
import os
import zipfile
import argparse
import numpy as np
import cPickle as cp
from io import BytesIO
from pandas import Series
import random

In [17]:
# Read the comma-separated file testdata.csv into a NumPy array.
data = np.genfromtxt('testdata.csv', delimiter=',')

In [18]:
# Number of columns (sensor channels) in one sample.
NB_SENSOR_CHANNELS = 10

# Per-column upper bounds handed to normalize(), one entry per channel.
# NOTE(review): which physical sensor each group of columns represents is not
# documented in this notebook - confirm against the data source.
NORM_MAX_THRESHOLDS = [
    200, 200, 200,
    250000, 250000, 250000,
    100, 100, 100, 100,
]

# Per-column lower bounds handed to normalize(), in the same column order.
NORM_MIN_THRESHOLDS = [
    -200, -200, -200,
    -250000, -250000, -250000,
    -100, -100, -100, -100,
]

In [19]:
def normalize(data, max_list, min_list):
    """Min-max scale each column of ``data`` using fixed per-column bounds.

    Column ``i`` becomes ``(data[:, i] - min_list[i]) / (max_list[i] - min_list[i])``.
    Afterwards every value above 1 is set to 0.99 and every value below 0 is
    set to 0.

    Parameters
    ----------
    data : array-like, 2-D
        Values to scale. A floating-point ndarray is modified in place and
        returned; any other input is first converted to a new float64 array,
        leaving the argument untouched.
    max_list, min_list : sequence of numbers
        Upper and lower bound for each column; each needs at least
        ``data.shape[1]`` entries.

    Returns
    -------
    numpy.ndarray
        The scaled array.
    """
    max_list, min_list = np.array(max_list), np.array(min_list)
    diffs = max_list - min_list

    # Writing the scaled values back into an integer array truncates them
    # (and Python 2 would integer-divide), collapsing every fraction to 0,
    # so non-float input is promoted to float64 before scaling.
    data = np.asarray(data)
    if not np.issubdtype(data.dtype, np.floating):
        data = data.astype(np.float64)

    for i in np.arange(data.shape[1]):
        data[:, i] = (data[:, i] - min_list[i]) / diffs[i]

    # NOTE(review): values above the upper bound become 0.99, not 1.0, while
    # an exact 1.0 is kept. Left unchanged on purpose - presumably it has to
    # match the preprocessing applied to the training data; confirm.
    data[data > 1] = 0.99
    data[data < 0] = 0.00
    return data

In [20]:
def process_dataset_file(dataset):
    """Fill gaps in a 2-D array of readings and normalize every column.

    Parameters
    ----------
    dataset : numpy.ndarray, 2-D
        Raw readings with one row per sample and one column per channel;
        missing values are expected to be NaN.

    Returns
    -------
    numpy.ndarray
        A new array of the same shape, scaled by ``normalize`` with
        ``NORM_MAX_THRESHOLDS`` / ``NORM_MIN_THRESHOLDS``.
    """
    # Work on the argument itself; reading the notebook-level `data` variable
    # here would silently ignore whatever the caller passed in.
    data_x = dataset

    # Perform linear interpolation column by column to fill missing values
    data_x = np.array([Series(column).interpolate() for column in data_x.T]).T

    # Remaining missing data are converted to zero
    data_x[np.isnan(data_x)] = 0

    # All sensor channels are normalized
    data_x = normalize(data_x, NORM_MAX_THRESHOLDS, NORM_MIN_THRESHOLDS)

    return data_x

In [21]:
def generate_data(data):
    """Run ``process_dataset_file`` on ``data`` and return the result as a new array.

    Parameters
    ----------
    data : numpy.ndarray, 2-D
        Raw readings, forwarded unchanged to ``process_dataset_file``.

    Returns
    -------
    numpy.ndarray
        The processed readings with ``NB_SENSOR_CHANNELS`` columns.

    Raises
    ------
    ValueError
        If the processed array does not have ``NB_SENSOR_CHANNELS`` columns.
    """
    processed = process_dataset_file(data)

    # Stacking onto an empty (0, NB_SENSOR_CHANNELS) array copies the result
    # and enforces the expected column count. The width comes from the shared
    # constant, not a repeated literal 10.
    return np.vstack((np.empty((0, NB_SENSOR_CHANNELS)), processed))

In [22]:
# Interpolate, zero-fill and normalize the loaded array (see generate_data).
test_data = generate_data(data)

In [23]:
# Flatten every 15 consecutive rows of test_data into one feature vector
# (15 rows x n_columns -> one row of 15 * n_columns values, in row order).
# A single reshape replaces growing the result with np.vstack inside a loop,
# which re-copied the whole accumulated array on every iteration, and it
# always yields a 2-D array even when there is only one window.
# Raises ValueError when the row count is not a multiple of 15.
# The result may share memory with test_data, so do not modify either in place.
test_data_kat = test_data.reshape(-1, 15 * test_data.shape[1])

In [24]:
# Predicted label for each 15-sample window of the new data, as a flat array.
np.ravel(clf.predict(test_data_kat))

array([4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2, 4, 4, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2], dtype=uint8)