In [1]:
from Chapter2.CreateDataset import CreateDataset
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
from util.VisualizeDataset import VisualizeDataset
from util import util
import time
# Record a wall-clock reference point. Note that `start` is assigned again
# further down, before the elapsed-time printout that reads it.
start = time.time()
# Use a wide default figure size (width 15, height 5) for all plots.
plt.rcParams["figure.figsize"] = [15,5]

from Chapter7.PrepareDatasetForLearning import PrepareDatasetForLearning
from Chapter7.LearningAlgorithms import ClassificationAlgorithms
from Chapter7.Evaluation import ClassificationEvaluation
from Chapter7.FeatureSelection import FeatureSelectionClassification

In [2]:
# Directory the sensor CSV files are read from. File names are appended with
# plain string concatenation below, so the trailing slash is required.
data_path = "data/merged_data_Peter/"

In [3]:
# Read the raw accelerometer file and collect the names of the columns that
# will be used as features.
accraw = pd.read_csv(data_path + "Accelerometer.csv")
timecol = "Time (s)"

# Keep every column except the time axis, the absolute-acceleration column
# and the two "Unnamed: N" columns.
accfeatures = [
    name
    for name in accraw.columns
    if name not in (
        timecol,
        "Absolute acceleration (m/s^2)",
        "Unnamed: 0",
        "Unnamed: 1",
    )
]
accfeatures

['Acceleration x (m/s^2)', 'Acceleration y (m/s^2)', 'Acceleration z (m/s^2)']

In [4]:
# Read the raw gyroscope file and collect the names of the columns that will
# be used as features.
gyraw = pd.read_csv(data_path + "Gyroscope.csv")

# Keep every column except the time axis, the absolute-rotation column and
# the two "Unnamed: N" columns.
gyfeatures = [
    name
    for name in gyraw.columns
    if name not in (
        timecol,
        "Absolute (rad/s)",
        "Unnamed: 0",
        "Unnamed: 1",
    )
]
gyfeatures

['Gyroscope x (rad/s)', 'Gyroscope y (rad/s)', 'Gyroscope z (rad/s)']

In [None]:
# Assemble a single table from the two sensor files and the label events.
# The second CreateDataset argument (500) is presumably the aggregation
# granularity in milliseconds -- TODO confirm against Chapter2.CreateDataset.
Dataset = CreateDataset(data_path, 500)

# Numerical sensors: "avg" aggregation, with a prefix that distinguishes the
# accelerometer columns from the gyroscope columns.
Dataset.add_numerical_dataset(
    "Accelerometer.csv", timecol, accfeatures, "avg", "acc_"
)
Dataset.add_numerical_dataset(
    "Gyroscope.csv", timecol, gyfeatures, "avg", "gyr_"
)

# Label intervals, added as an event dataset in 'binary' mode.
Dataset.add_event_dataset(
    'labels.csv', 'label_start', 'label_end', 'label', 'binary'
)

dataset = Dataset.data_table
Dataviz = VisualizeDataset()

# One boxplot per sensor, covering its three axes.
Dataviz.plot_dataset_boxplot(
    dataset,
    [
        'acc_Acceleration x (m/s^2)',
        'acc_Acceleration y (m/s^2)',
        'acc_Acceleration z (m/s^2)',
    ],
)
Dataviz.plot_dataset_boxplot(
    dataset,
    [
        'gyr_Gyroscope x (rad/s)',
        'gyr_Gyroscope y (rad/s)',
        'gyr_Gyroscope z (rad/s)',
    ],
)

# Overview plot: both sensors as lines, the label columns as points.
Dataviz.plot_dataset(
    dataset,
    ["acc_", "gyr_", "label"],
    ["like", "like", "like"],
    ["line", "line", "points"],
)

util.print_statistics(dataset)

Reading data from Accelerometer.csv


In [None]:
# Show the assembled table as the cell output for a quick visual check.
dataset

In [None]:
# Maximum number of features requested from forward selection; also used to
# size the x axis of the forward-selection plot.
# NOTE(review): the selector below is only given the six columns in
# `basic_features` -- confirm it behaves sensibly when asked for more
# features than exist.
N_FORWARD_SELECTION = 50

In [None]:
# Helpers used by the remaining cells: plotting and dataset preparation.
DataViz = VisualizeDataset()
prepare = PrepareDatasetForLearning()

# Split the table into training and test parts for classification. The target
# is taken from the 'label' columns ('like' matching) and 0.7 is the split
# fraction argument; filtering is enabled and the temporal option is off.
train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
    dataset,
    ['label'],
    'like',
    0.7,
    filter=True,
    temporal=False,
)

# Report how many rows ended up in each part.
print('Training set length is: ', train_X.shape[0])
print('Test set length is: ', test_X.shape[0])

In [None]:
# The six aggregated sensor columns used as model inputs: three accelerometer
# axes followed by three gyroscope axes.
basic_features = [
    'acc_Acceleration x (m/s^2)',
    'acc_Acceleration y (m/s^2)',
    'acc_Acceleration z (m/s^2)',
    'gyr_Gyroscope x (rad/s)',
    'gyr_Gyroscope y (rad/s)',
    'gyr_Gyroscope z (rad/s)',
]

In [None]:
# Run forward feature selection restricted to the basic sensor columns.
# Three results come back: the selected features, the features in selection
# order, and the score recorded at each step.
fs = FeatureSelectionClassification()
features, ordered_features, ordered_scores = fs.forward_selection(
    N_FORWARD_SELECTION,
    train_X[basic_features],
    test_X[basic_features],
    train_y,
    test_y,
    gridsearch=False,
)

In [None]:
# Plot the score obtained after each forward-selection step.
# The x axis is sized from the scores that were actually returned rather than
# from the requested maximum, so x and y always have the same length even if
# the selector produces fewer (or more) entries than N_FORWARD_SELECTION.
DataViz.plot_xy(
    x=[range(1, len(ordered_scores) + 1)],
    y=[ordered_scores],
    xlabel='number of features',
    ylabel='accuracy',
)

In [None]:
# Shared learner and evaluation helpers for all experiments below.
# (`eval` shadows the Python builtin; the name is kept because later cells
# refer to it.)
learner = ClassificationAlgorithms()
eval = ClassificationEvaluation()

# Restart the timer; the elapsed-time printout further down measures from here.
start = time.time()

# Regularization strengths to sweep, and the lists that collect the mean
# accuracy per strength.
reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10]
performance_training, performance_test = [], []

# Runtime is limited, so each setting is trained 3 times; raise this value
# for more robust averages.
N_REPEATS_NN = 3

In [None]:
# For every regularization strength, train the neural network N_REPEATS_NN
# times on the full feature set and store the mean train and test accuracy.
for reg_param in reg_parameters:
    performance_tr, performance_te = 0, 0
    for i in range(N_REPEATS_NN):
        (class_train_y,
         class_test_y,
         class_train_prob_y,
         class_test_prob_y) = learner.feedforward_neural_network(
            train_X,
            train_y,
            test_X,
            hidden_layer_sizes=(250, ),
            alpha=reg_param,
            max_iter=500,
            gridsearch=False,
        )

        # Accumulate; the sums are turned into means after the inner loop.
        performance_tr = performance_tr + eval.accuracy(train_y, class_train_y)
        performance_te = performance_te + eval.accuracy(test_y, class_test_y)

    performance_training.append(performance_tr / N_REPEATS_NN)
    performance_test.append(performance_te / N_REPEATS_NN)

In [None]:
# Training vs. test accuracy as a function of the regularization strength,
# drawn with a logarithmic x axis.
DataViz.plot_xy(
    x=[reg_parameters, reg_parameters],
    y=[performance_training, performance_test],
    method='semilogx',
    xlabel='regularization parameter value',
    ylabel='accuracy',
    ylim=[0.95, 1.01],
    names=['training', 'test'],
    line_styles=['r-', 'b:'],
)

In [None]:
# How often the non-deterministic classifiers are retrained before their
# scores are averaged.
N_KCV_REPEATS = 5

# Collects the score rows returned by util.print_table_row_performances.
scores_over_all_algs = []

# Report the time elapsed since `start` was last assigned.
print('Preprocessing took', time.time()-start, 'seconds.')

In [None]:
# Train and score every classifier on the basic sensor features.
# Results are left in the overall_performance_* / performance_* variables;
# the results-table row is built from them (and appended to
# scores_over_all_algs) in the following cell.
selected_train_X = train_X[basic_features]
selected_test_X = test_X[basic_features]

# First we run our non deterministic classifiers a number of times to average their score.

performance_tr_nn = 0
performance_tr_rf = 0
performance_tr_svm = 0
performance_te_nn = 0
performance_te_rf = 0
performance_te_svm = 0

for repeat in range(0, N_KCV_REPEATS):
    # Progress counters are shown 1-based, like the "run 1 / 1" messages below.
    print("Training NeuralNetwork run {} / {} ... ".format(repeat + 1, N_KCV_REPEATS))
    class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
        selected_train_X, train_y, selected_test_X, gridsearch=True
    )
    performance_tr_nn += eval.accuracy(train_y, class_train_y)
    performance_te_nn += eval.accuracy(test_y, class_test_y)

    print("Training RandomForest run {} / {} ... ".format(repeat + 1, N_KCV_REPEATS))
    class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
        selected_train_X, train_y, selected_test_X, gridsearch=True
    )
    performance_tr_rf += eval.accuracy(train_y, class_train_y)
    performance_te_rf += eval.accuracy(test_y, class_test_y)

    print("Training SVM run {} / {}".format(repeat + 1, N_KCV_REPEATS))
    class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.support_vector_machine_with_kernel(
        selected_train_X, train_y, selected_test_X, gridsearch=True
    )
    performance_tr_svm += eval.accuracy(train_y, class_train_y)
    performance_te_svm += eval.accuracy(test_y, class_test_y)

# Turn the accumulated sums into mean accuracies over all repeats.
overall_performance_tr_nn = performance_tr_nn/N_KCV_REPEATS
overall_performance_te_nn = performance_te_nn/N_KCV_REPEATS
overall_performance_tr_rf = performance_tr_rf/N_KCV_REPEATS
overall_performance_te_rf = performance_te_rf/N_KCV_REPEATS
overall_performance_tr_svm = performance_tr_svm/N_KCV_REPEATS
overall_performance_te_svm = performance_te_svm/N_KCV_REPEATS

# And we run our deterministic classifiers (a single run each):
print("Determenistic Classifiers:")

print("Training Nearest Neighbor run 1 / 1")
class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(
    selected_train_X, train_y, selected_test_X, gridsearch=True
)
performance_tr_knn = eval.accuracy(train_y, class_train_y)
performance_te_knn = eval.accuracy(test_y, class_test_y)

print("Training Descision Tree run 1 / 1")
class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
    selected_train_X, train_y, selected_test_X, gridsearch=True
)
performance_tr_dt = eval.accuracy(train_y, class_train_y)
performance_te_dt = eval.accuracy(test_y, class_test_y)

# Naive Bayes runs last, so class_test_y / class_train_prob_y hold its
# predictions once this cell has finished.
print("Training Naive Bayes run 1/1")
class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(
    selected_train_X, train_y, selected_test_X
)
performance_tr_nb = eval.accuracy(train_y, class_train_y)
performance_te_nb = eval.accuracy(test_y, class_test_y)

# The results row is intentionally not built here: the earlier call in this
# cell omitted the row-name argument that util.print_table_row_performances is
# given in the next cell, and it would have appended a second, duplicate row
# to scores_over_all_algs.


In [None]:
# Build the results row labelled "Normal": one (train, test) accuracy pair per
# algorithm, in the same order as the algorithm names used for the
# performance plot (NN, RF, SVM, KNN, DT, NB). Each algorithm appears exactly
# once so that positions line up with those six names.
scores_with_sd = util.print_table_row_performances(
    "Normal",
    len(selected_train_X.index),
    len(selected_test_X.index),
    [
        (overall_performance_tr_nn, overall_performance_te_nn),
        (overall_performance_tr_rf, overall_performance_te_rf),
        (overall_performance_tr_svm, overall_performance_te_svm),
        (performance_tr_knn, performance_te_knn),
        (performance_tr_dt, performance_te_dt),
        (performance_tr_nb, performance_te_nb),
    ],
)
scores_over_all_algs.append(scores_with_sd)

In [None]:
# Plot the collected scores for the six algorithms.
# NOTE(review): the second argument is a bare string; if the plotting helper
# expects a list of row names (one per entry in scores_over_all_algs), this
# should probably be ["Normal"] -- confirm against util.VisualizeDataset.
DataViz.plot_performances_classification(['NN', 'RF','SVM', 'KNN', 'DT', 'NB'], "Normal", scores_over_all_algs)

In [None]:
# Confusion matrix on the test set for whichever classifier most recently
# assigned class_test_y; the class labels are taken from the columns of its
# training probability table.
test_cm = eval.confusion_matrix(
    test_y,
    class_test_y,
    class_train_prob_y.columns,
)

DataViz.plot_confusion_matrix(
    test_cm,
    class_train_prob_y.columns,
    normalize=False,
)