In [1]:
# import packages

import numpy as np 
import pandas as pd 
from scipy import signal
import matplotlib.pyplot as plt 
import math
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

---
# Data Loading and Preprocessing

In [98]:
# load the datasets and plot a window of each activity's raw signals
def visualisation(data_dir='daliac', n_datasets=19, start=500, stop=1500):
    """Plot a window of every activity from every dataset file.

    Reads ``<data_dir>/dataset_<k>.txt`` for k = 1..n_datasets (comma-separated,
    no header row). For each activity label 1..13 in column 24, rows
    ``start:stop`` of that activity's samples are drawn (all columns) in a
    separate figure.

    Parameters
    ----------
    data_dir : str
        Directory that holds the ``dataset_<k>.txt`` files.
    n_datasets : int
        Number of dataset files to read, starting at ``dataset_1.txt``.
    start, stop : int
        Row window (within one activity) that is plotted.

    Returns
    -------
    None
        Figures are shown as a side effect.
    """
    for dataset_id in range(1, n_datasets + 1):
        df = pd.read_csv(data_dir + '/dataset_' + str(dataset_id) + '.txt', sep=',', header=None)
        # The activity loop has to live inside the dataset loop: otherwise every
        # file is read but only the last one loaded is ever plotted.
        for activity in range(1, 14):
            df_exercise = df[df[24] == activity].values
            window = df_exercise[start:stop, :]
            if window.shape[0] == 0:
                # Activity missing or shorter than `start` rows: nothing to draw.
                continue
            plt.plot(window)
            plt.title('dataset ' + str(dataset_id) + ', activity ' + str(activity))
            plt.xlabel('sample index within window')
            plt.show()

In [99]:
# remove noise
def removeNoise(data=None):
    """Low-pass filter the first 24 columns of a sample array and plot a window.

    A 4th-order digital Butterworth low-pass filter (normalised cutoff 0.04) is
    applied along the rows of columns 0..23 with ``scipy.signal.lfilter``. The
    array is modified in place; columns from index 24 on are left untouched.
    Rows 500:1500 of the result (all columns) are then plotted.

    Parameters
    ----------
    data : numpy.ndarray, optional
        2-D array with at least 24 columns. When omitted, the notebook-level
        ``df_exercise`` array is used, which keeps the old zero-argument call
        working.

    Returns
    -------
    numpy.ndarray
        The same array object that was filtered.

    Raises
    ------
    NameError
        If ``data`` is omitted and no notebook-level ``df_exercise`` exists.
    """
    if data is None:
        try:
            data = df_exercise  # legacy path: relies on notebook state
        except NameError:
            raise NameError(
                "removeNoise() needs a `data` array or a notebook-level `df_exercise`"
            ) from None

    b, a = signal.butter(4, 0.04, 'lowpass', analog=False)
    # axis=0 filters each column independently along time, as a per-column loop would.
    data[:, :24] = signal.lfilter(b, a, data[:, :24], axis=0)

    plt.plot(data[500:1500, :])
    plt.show()
    return data

---
# Feature Engineering

In [170]:
# Print min, max and mean of each of the first 24 columns over rows 500:1500
# of `df_exercise` (one output line per column).
seg = df_exercise[500:1500, :]
for j in range(24):
    column = seg[:, j]
    dMin, dMax, dMean = column.min(), column.max(), column.mean()
    print(dMin, dMax, dMean)

-1.925778118680744 0.3197391374776279 -0.4858332689454794
-0.19987240984775426 1.9906532360686415 0.7962199544500775
-1.7709444962430094 0.29359669841436764 -0.4887516775618998
-220.76413517379243 369.30496048463993 89.28096995942234
-320.8695151202439 404.78867979228204 48.61833579157605
-238.9855060432426 474.358805213539 8.86279077302094
-0.3904758179492516 0.09286363627579444 -0.09570780900204111
-2.4421721210093215 0.9088444635217665 -0.8286173153841635
-0.7010040288315011 1.6312131255161466 0.49317535746973074
-60.948018377397695 75.83585609436663 -0.9377025033183
-35.631631377228324 21.41704942383088 -0.19239564138149517
-14.839995339889624 16.703325638218615 -0.3093890146905477
-2.7278867223972565 0.917996279334936 -0.9468731319901208
-0.21825736581850055 0.1313342153844715 -0.05952275147529261
-0.3014231858569701 0.772150661075941 0.2535961080727579
-29.291308686773633 26.363366623058475 0.5846513854611051
-28.735150662822438 27.04739073131202 1.0982116846163494
-20.3468048789

In [257]:
def _window_features(data, window=1000, n_channels=3):
    """Return one feature row per consecutive ``window``-row slice of ``data``.

    Each row holds min, max and mean of columns 0..n_channels-1 of the slice,
    followed by the last-column value of the slice's first row (the label).
    The final slice may be shorter than ``window``; empty input yields no rows.
    """
    rows = []
    for begin in range(0, len(data), window):
        chunk = data[begin:begin + window, :]
        row = []
        for channel in range(n_channels):
            values = chunk[:, channel]
            row.extend((np.min(values), np.max(values), np.mean(values)))
        row.append(chunk[0, -1])
        rows.append(row)
    return rows


def featureEngineering(data_dir='daliac', n_datasets=19, window=1000, train_ratio=0.8):
    """Build training/testing feature tables from the raw dataset files.

    For each file ``<data_dir>/dataset_<k>.txt`` (k = 1..n_datasets) and each
    activity label 1..13 in column 24:

    1. columns 0..23 are low-pass filtered (4th-order Butterworth, normalised
       cutoff 0.04, ``scipy.signal.lfilter``);
    2. the filtered samples are split in order: the first ``train_ratio`` share
       goes to training, the rest to testing;
    3. each part is cut into consecutive non-overlapping segments of ``window``
       rows (the last one may be shorter) and every segment becomes one row of
       9 features (min, max, mean of columns 0, 1, 2) plus the label.

    Rows from all datasets and activities are accumulated and written, without
    header or index, to ``training_data.csv`` and ``testing_data.csv`` in the
    current working directory.

    Parameters
    ----------
    data_dir : str
        Directory that holds the ``dataset_<k>.txt`` files.
    n_datasets : int
        Number of dataset files to process, starting at ``dataset_1.txt``.
    window : int
        Number of rows per segment.
    train_ratio : float
        Share of each activity's rows assigned to the training part.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The training and testing feature tables that were written to disk.
    """
    # The filter does not depend on the data, so design it once.
    b, a = signal.butter(4, 0.04, 'low', analog=False)

    # Accumulators live outside every loop so rows from all datasets and
    # activities are kept, not only those of the last iteration.
    training_rows = []
    testing_rows = []

    for dataset_id in range(1, n_datasets + 1):
        df = pd.read_csv(data_dir + '/dataset_' + str(dataset_id) + '.txt', sep=',', header=None)
        print('deal with dataset ' + str(dataset_id))
        for c in range(1, 14):
            # copy=True guarantees a writable float array (``.values`` may be a
            # read-only view when pandas copy-on-write is active).
            activity_data = df[df[24] == c].to_numpy(dtype=float, copy=True)
            if len(activity_data) == 0:
                # Activity not present in this dataset: nothing to extract.
                continue
            activity_data[:, :24] = signal.lfilter(b, a, activity_data[:, :24], axis=0)

            training_len = math.floor(len(activity_data) * train_ratio)
            training_rows.extend(_window_features(activity_data[:training_len, :], window))
            testing_rows.extend(_window_features(activity_data[training_len:, :], window))

    # reshape(-1, 10) keeps the 10-column layout even when no rows were produced.
    df_training = pd.DataFrame(np.array(training_rows, dtype=float).reshape(-1, 10))
    df_testing = pd.DataFrame(np.array(testing_rows, dtype=float).reshape(-1, 10))
    df_training.to_csv('training_data.csv', index=False, header=False)
    df_testing.to_csv('testing_data.csv', index=False, header=False)
    return df_training, df_testing

In [258]:
# Total element count (rows x columns) of the training feature table.
# NOTE(review): `df_training` is not assigned at notebook scope in any visible cell
# (featureEngineering only creates it as a local), so this relies on earlier kernel state; confirm.
df_training.size

90

In [259]:
# Total element count (rows x columns) of the testing feature table.
# NOTE(review): `df_testing` is not assigned at notebook scope in any visible cell
# (featureEngineering only creates it as a local), so this relies on earlier kernel state; confirm.
df_testing.size

30

In [260]:
if __name__ == '__main__':
    # Only feature extraction is run here; the two exploratory calls
    # (plotting and noise-removal preview) are left commented out.
#     visualisation()
#     removeNoise()
    featureEngineering()

deal with dataset 1
deal with dataset 2
deal with dataset 3
deal with dataset 4
deal with dataset 5
deal with dataset 6
deal with dataset 7
deal with dataset 8
deal with dataset 9
deal with dataset 10
deal with dataset 11
deal with dataset 12
deal with dataset 13
deal with dataset 14
deal with dataset 15
deal with dataset 16
deal with dataset 17
deal with dataset 18
deal with dataset 19
