In [1]:
# import packages

import numpy as np 
import pandas as pd 
from scipy import signal
import matplotlib.pyplot as plt 
import math
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

---
# Data Loading and Preprocessing

In [2]:
# load datasets
def visualisation():
# load all daliac 19 datasets 
    for i in range(19):
        df = pd.read_csv('daliac/dataset_' + str(i+1) + '.txt', sep=',', header=None)
# get all activity data from all datasets
    for i in range(1, 14):
        df_exercise = df[df[24] == i].values
        plt.plot(df_exercise[500:1500, :])
        plt.show()

In [3]:
# remove noise
def removeNoise():
# Load dataset 15
    df_15 = pd.read_csv('daliac/dataset_15.txt', sep=',', header=None)
# select rope jumping values
    df_rope = df_15[df_15[24] == 13].values
# remove noise from rope jumping
    b, a = signal.butter(4, 0.04, 'lowpass', analog=False)
# retrive data from accelerometer on ankle    
    for k in range(19,22):
        df_rope[:,k] = signal.lfilter(b, a, df_rope[:, k])
    plt.plot(df_rope[500:1500, 20:23])
    plt.show()

---
# Feature Engineering

In [4]:
def featureEngineering():
    training = np.empty(shape=(0, 10))
    testing = np.empty(shape=(0, 10))
    # load all daliac 19 datasets 
    for i in range(19):
        df = pd.read_csv('daliac/dataset_' + str(i + 1) + '.txt', sep=',', header=None)
        print('deal with dataset ' + str(i + 1))
        # get all activity data from all datasets
        for c in range(1, 14):
            activity_data = df[df[24] == c].values
            # use Butterworth low pass filter to filter noise
            b, a = signal.butter(4, 0.04, 'low', analog=False)
            # apply noise filter to all activities
            for j in range(24):
                activity_data[:, j] = signal.lfilter(b, a, activity_data[:, j])            
                
                # get length of each data
                datat_len = len(activity_data)
                # divide data in 80% for training and 20% for testing
                training_len = math.floor(datat_len * 0.8)
                # set train and test data
                training_data = activity_data[:training_len, :]
                testing_data = activity_data[training_len:, :]
                training_sample_number = training_len // 1000 + 1
                testing_sample_number = (datat_len - training_len) // 1000 + 1
                
                # create sample training data
                for s in range(training_sample_number):
                    if s < training_sample_number - 1:
                        sample_data = training_data[1000*s:1000*(s + 1), :]
                        
                    else:
                        sample_data = training_data[1000*s:, :]
                    feature_sample = []
                    
                    # retrieve x, y and z coordinate data from accelerometer on ankle sensor
                    for i in range(19,22):
                        feature_sample.append(np.min(sample_data[:, i])) # min value of x, y and z
                        feature_sample.append(np.max(sample_data[:, i])) # max value of x, y and z
                        feature_sample.append(np.mean(sample_data[:, i])) # mean value of x, y and z
                    feature_sample.append(sample_data[0, -1])
                    feature_sample = np.array([feature_sample])
                    training = np.concatenate((training, feature_sample), axis=0)
                        
                # create sample testing data
                for s in range(testing_sample_number):
                    if s < training_sample_number - 1:
                        sample_data = testing_data[1000*s:1000*(s + 1), :]
                    else:
                        sample_data = testing_data[1000*s:, :]
        
                    feature_sample = []
                    # retrieve x, y and z coordinate data from accelerometer on ankle sensor
                    for i in range(19,22):
                        feature_sample.append(np.min(sample_data[:, i]))
                        feature_sample.append(np.max(sample_data[:, i]))
                        feature_sample.append(np.mean(sample_data[:, i]))
                    feature_sample.append(sample_data[0, -1])
                    feature_sample = np.array([feature_sample])
                    testing = np.concatenate((testing, feature_sample), axis=0)
                    
    df_training = pd.DataFrame(training)
    df_testing = pd.DataFrame(testing)
    df_training.to_csv('training_data.csv', index=None, header=None)
    df_testing.to_csv('testing_data.csv', index=None, header=None)
    print(feature_sample)
    

In [5]:
def modelEvaluation():
    # Read training and testing data
    df_training = pd.read_csv('training_data.csv', header=None)
    df_testing = pd.read_csv('testing_data.csv', header=None)
    
    # Get the number of each activity (the last column in the dataset)
    y_train = df_training[df_training.shape[1] - 1].values
    # Labels should start from 0 in sklearn
    y_train = y_train - 1
    # Change the type to integer
    y_train = y_train.astype(int)
    # Drop the last column in the dataset
    df_training = df_training.drop([df_training.shape[1] - 1], axis=1)
    # Get values for model training
    X_train = df_training.values
    print ("training done ")
    
    # Do the same as training for testing
    y_test = df_testing[df_testing.shape[1] - 1].values
    y_test = y_test - 1
    y_test = y_test.astype(int)
    df_testing = df_testing.drop([df_testing.shape[1] - 1], axis=1)
    X_test = df_testing.values
    print ("testing done")

    # Feature normalization for improving the performance of machine learning models. In this code, 
    # StandardScaler is used to scale original feature to be centered around zero. 
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    print("End feature normalisation")

    # Build KNN classifier, in this code
    knn = KNeighborsClassifier(n_neighbors=4)
    knn.fit(X_train, y_train)

    # Evaluation. when we train a machine learning model on training set, we should evaluate its performance on testing set.
    # We could evaluate the model by different metrics. Firstly, we could calculate the classification accuracy.
    # In this code, when n_neighbors is set to 3, the accuracy achieves 0.753.
    # when n_neighbors is set to 4, the accuracy achieve 0.797
    y_pred = knn.predict(X_test)
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    # We could use confusion matrix to view the classification for each activity.
    print(confusion_matrix(y_test, y_pred))

    tuned_parameters = [{'n_neighbors':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]
    acc_scorer = make_scorer(accuracy_score)
    # Using GridSearch to search for optimal KNN classifier based on tuned parameters and accuracy score
    grid_obj  = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=10, scoring=acc_scorer)
    # Fit the model for training
    grid_obj  = grid_obj .fit(X_train, y_train)
    # Get best classifier
    clf = grid_obj.best_estimator_
    print ("some results")
    print('best clf:', clf)
    # Fit the classifier model
    clf.fit(X_train, y_train)
    # Create predited class
    y_pred = clf.predict(X_test)
    print(y_pred)
    print(y_pred.shape, y_test.shape)
    # Print out accuracy score and confusion matrix.
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    # Another machine learning model: svm. In this code, we use gridsearch to find the optimial classifier
    # It will take a long time to find the optimal classifier.
    # the accuracy for SVM classifier with default parameters is 0.753, 
    # which is worse than KNN. The reason may be parameters of svm classifier are not optimal.  
    # Another reason may be we only use 9 features and they are not enough to build a good svm classifier. 
    
    # Do the same as training for testing 
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-1,1e-2, 1e-3, 1e-4],
                    'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 100]},
                {'kernel': ['linear'], 'C': [1e-3, 1e-2, 1e-1, 1, 10, 100]}]
    acc_scorer = make_scorer(accuracy_score)
    grid_obj  = GridSearchCV(SVC(), tuned_parameters, cv=10, scoring=acc_scorer)
    grid_obj  = grid_obj .fit(X_train, y_train)
    clf = grid_obj.best_estimator_
    print('best clf:', clf)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

In [None]:
#     print("visualise data")
#     visualisation()
#     print("noise removed")
#     removeNoise()
print("Engineering feature")
featureEngineering()
print("Final Model")
modelEvaluation()

Engineering feature
deal with dataset 1
deal with dataset 2
deal with dataset 3
deal with dataset 4
deal with dataset 5
deal with dataset 6
deal with dataset 7
deal with dataset 8
deal with dataset 9
deal with dataset 10
deal with dataset 11
deal with dataset 12
deal with dataset 13
deal with dataset 14
deal with dataset 15
deal with dataset 16
deal with dataset 17
deal with dataset 18
deal with dataset 19
[[-7.64914274e-01 -2.10587942e-01 -5.42663803e-01  2.70547580e-02
   8.16927305e-02  5.67641902e-02  7.14463855e+00  2.88400090e+01
   1.85248410e+01  1.30000000e+01]]
Final Model
training done 
testing done
End feature normalisation
Accuracy:  0.7970649392194498
[[1303    0   24    0   19    0   22    0    0    0    0    0    0]
 [   0 1368    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0 1128  200   19   21    0    0    0    0    0    0    0]
 [  24    0  171 2109   23   49    0    0    0    0    0    0    0]
 [   6    0   26   31  479  479  138   60   57    0   