# Training

## Imports

In [40]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import LeaveOneGroupOut, KFold, train_test_split
from sklearn import metrics
from sklearn.feature_selection import RFE
# Works only with python 3.7.9. and sklearn on v 0.22.0 !!!!!
from sklearn_porter import Porter

from tqdm import tqdm

import data_helper

## Data

In [41]:
data = data_helper.prepare_data()
df = data_helper.aggregate(data)

scaler = StandardScaler()
# scaler = MinMaxScaler()

X = df.drop(columns=["label", "subject"])
X_scaled = scaler.fit_transform(X)
y = df["label"]
cc = y.unique()

## ML-Methods (classifier only)

As there is a classification problem no regression is used

In [42]:
# for repeatability we use a fixed random state
rs = 15

knnc5 = KNeighborsClassifier(5, n_jobs=-1)
knnc10 = KNeighborsClassifier(10, n_jobs=-1)
svml = SVC(kernel="linear", random_state=rs)
svmr = SVC(kernel="rbf", random_state=rs)
dtc = DecisionTreeClassifier(max_depth=10, random_state=rs)
rf = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=rs)
gnb = GaussianNB()

classifier = [
    ("5 Nearest Neighbor", knnc5),
    ("10 Nearest Neighbor", knnc10),
    ("Linear SVM", svml),
    ("RBF SVM", svmr),
    ("Decision Tree", dtc),
    ("Random Forest", rf),
    ("Gaussian Naive Bayes", gnb)
]

## Methods

In [43]:
def evaluateSplits(clf_list = classifier, X=X_scaled, y=y, splits=None):
    scores = np.zeros((len(splits), len(clf_list)))
    rep = [None] * len(classifier)
    for split_idx, (train_index, test_index) in tqdm(enumerate(splits), total=len(splits)):
        y_train, y_test = y[train_index], y[test_index]
        X_train, X_test = X[train_index], X[test_index]

        for clf_idx, (name, clf) in enumerate(clf_list):
            clf.fit(X_train, y_train)

            pred = clf.predict(X_test)

            acc_score = metrics.accuracy_score(pred, y_test)

            scores[split_idx, clf_idx] = acc_score
    return np.mean(scores, axis=0), rep

def printResults(res, clf_list = classifier):
    max_acc = 0
    idx = -1
    for i in range(len(clf_list)):
        acc = res[0][i]
        if acc > max_acc:
            max_acc = acc
            idx = i
        print(clf_list[i][0])
        print('Mean accuracy:', acc)
        print('Confusion Matrix:')
        print(res[1][i])
        print("\n")
    
    print("\nBest Result:")
    print(print(clf_list[idx][0]))
    print('Mean accuracy:', max_acc)

### Leave one Group Out

In [44]:
logo_splits = list(LeaveOneGroupOut().split(X_scaled,y, groups=df["subject"]))

logo_res = evaluateSplits(splits=logo_splits)

100%|██████████| 56/56 [00:37<00:00,  1.51it/s]


In [45]:
printResults(logo_res)

5 Nearest Neighbor
Mean accuracy: 0.6284839794841881
Confusion Matrix:
None


10 Nearest Neighbor
Mean accuracy: 0.6534577512151951
Confusion Matrix:
None


Linear SVM
Mean accuracy: 0.629685235567785
Confusion Matrix:
None


RBF SVM
Mean accuracy: 0.6512773613021613
Confusion Matrix:
None


Decision Tree
Mean accuracy: 0.5923127475465838
Confusion Matrix:
None


Random Forest
Mean accuracy: 0.695439883941277
Confusion Matrix:
None


Gaussian Naive Bayes
Mean accuracy: 0.5792206860299193
Confusion Matrix:
None



Best Result:
Random Forest
None
Mean accuracy: 0.695439883941277


### k-fold cross-validation

In [46]:
kfold_splits_10 = list(KFold(n_splits=10, shuffle=True, random_state=rs).split(X_scaled, y))
kfold_splits_20 = list(KFold(n_splits=20, shuffle=True, random_state=rs).split(X_scaled, y))

k10_res = evaluateSplits(splits=kfold_splits_10)
k20_res = evaluateSplits(splits=kfold_splits_20)

100%|██████████| 10/10 [00:06<00:00,  1.63it/s]
100%|██████████| 20/20 [00:12<00:00,  1.55it/s]


In [47]:
printResults(k10_res)

5 Nearest Neighbor
Mean accuracy: 0.8606885903967887
Confusion Matrix:
None


10 Nearest Neighbor
Mean accuracy: 0.8579311409603212
Confusion Matrix:
None


Linear SVM
Mean accuracy: 0.7819839431835728
Confusion Matrix:
None


RBF SVM
Mean accuracy: 0.8256677474139262
Confusion Matrix:
None


Decision Tree
Mean accuracy: 0.8307812258761773
Confusion Matrix:
None


Random Forest
Mean accuracy: 0.8441608769492049
Confusion Matrix:
None


Gaussian Naive Bayes
Mean accuracy: 0.6851767793731665
Confusion Matrix:
None



Best Result:
5 Nearest Neighbor
None
Mean accuracy: 0.8606885903967887


In [48]:
printResults(k20_res)

5 Nearest Neighbor
Mean accuracy: 0.8618879183070867
Confusion Matrix:
None


10 Nearest Neighbor
Mean accuracy: 0.8610912893700787
Confusion Matrix:
None


Linear SVM
Mean accuracy: 0.7823665108267719
Confusion Matrix:
None


RBF SVM
Mean accuracy: 0.8315637303149608
Confusion Matrix:
None


Decision Tree
Mean accuracy: 0.8296136811023624
Confusion Matrix:
None


Random Forest
Mean accuracy: 0.8484990157480314
Confusion Matrix:
None


Gaussian Naive Bayes
Mean accuracy: 0.6859621062992128
Confusion Matrix:
None



Best Result:
5 Nearest Neighbor
None
Mean accuracy: 0.8618879183070867


## Eliminate Features

As the easiest way at the moment to eliminate features is via a RF classifier, it will be uesed here as an example.

In [49]:
rf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features= 1, random_state=rs)
rfe = RFE(rf, n_features_to_select=10)
rfe.fit(X_scaled,y)
columns = list(zip(rfe.support_, X.columns))
print(columns)

[(False, 'alpha_min'), (False, 'alpha_max'), (False, 'alpha_mean'), (True, 'alpha_median'), (False, 'alpha_std'), (True, 'alpha_var'), (False, 'beta_min'), (False, 'beta_max'), (False, 'beta_mean'), (False, 'beta_median'), (True, 'beta_std'), (False, 'beta_var'), (True, 'gamma_min'), (False, 'gamma_max'), (False, 'gamma_mean'), (False, 'gamma_median'), (False, 'gamma_std'), (False, 'gamma_var'), (False, 'x_min'), (False, 'x_max'), (False, 'x_mean'), (False, 'x_median'), (False, 'x_std'), (True, 'x_var'), (False, 'y_min'), (True, 'y_max'), (False, 'y_mean'), (True, 'y_median'), (True, 'y_std'), (False, 'y_var'), (True, 'z_min'), (False, 'z_max'), (True, 'z_mean'), (False, 'z_median'), (False, 'z_std'), (False, 'z_var')]


In [50]:
removed = [c for b, c in columns if ~b]
removed

['alpha_min',
 'alpha_max',
 'alpha_mean',
 'alpha_std',
 'beta_min',
 'beta_max',
 'beta_mean',
 'beta_median',
 'beta_var',
 'gamma_max',
 'gamma_mean',
 'gamma_median',
 'gamma_std',
 'gamma_var',
 'x_min',
 'x_max',
 'x_mean',
 'x_median',
 'x_std',
 'y_min',
 'y_mean',
 'y_var',
 'z_max',
 'z_median',
 'z_std',
 'z_var']

In [51]:
X_red = X.drop(columns=removed)
X_red_scaled = scaler.fit_transform(X_red)
X_red.columns

Index(['alpha_median', 'alpha_var', 'beta_std', 'gamma_min', 'x_var', 'y_max',
       'y_median', 'y_std', 'z_min', 'z_mean'],
      dtype='object')

In [52]:
kfold_splits_10_red = list(KFold(n_splits=10, shuffle=True, random_state=rs).split(X_red_scaled, y))
kfold_splits_20_red = list(KFold(n_splits=20, shuffle=True, random_state=rs).split(X_red_scaled, y))

k10_res_red = evaluateSplits(splits=kfold_splits_10_red, X=X_red_scaled)
k20_res_red = evaluateSplits(splits=kfold_splits_20_red, X=X_red_scaled)

100%|██████████| 10/10 [00:04<00:00,  2.42it/s]
100%|██████████| 20/20 [00:08<00:00,  2.37it/s]


In [53]:
printResults(k10_res_red)

5 Nearest Neighbor
Mean accuracy: 0.8547892542843911
Confusion Matrix:
None


10 Nearest Neighbor
Mean accuracy: 0.8602902578354177
Confusion Matrix:
None


Linear SVM
Mean accuracy: 0.7666311563995677
Confusion Matrix:
None


RBF SVM
Mean accuracy: 0.844951366373321
Confusion Matrix:
None


Decision Tree
Mean accuracy: 0.8299938243013741
Confusion Matrix:
None


Random Forest
Mean accuracy: 0.8512444032731203
Confusion Matrix:
None


Gaussian Naive Bayes
Mean accuracy: 0.6879326848849775
Confusion Matrix:
None



Best Result:
10 Nearest Neighbor
None
Mean accuracy: 0.8602902578354177


In [54]:
printResults(k20_res_red)

5 Nearest Neighbor
Mean accuracy: 0.8543953001968505
Confusion Matrix:
None


10 Nearest Neighbor
Mean accuracy: 0.858716781496063
Confusion Matrix:
None


Linear SVM
Mean accuracy: 0.7670183316929137
Confusion Matrix:
None


RBF SVM
Mean accuracy: 0.8433809055118111
Confusion Matrix:
None


Decision Tree
Mean accuracy: 0.8394500492125985
Confusion Matrix:
None


Random Forest
Mean accuracy: 0.8441652312992127
Confusion Matrix:
None


Gaussian Naive Bayes
Mean accuracy: 0.6898960383858269
Confusion Matrix:
None



Best Result:
10 Nearest Neighbor
None
Mean accuracy: 0.858716781496063


## Bagging

In [55]:
bagging_list = [
    ("Bagging Gausian Naive Bayes5", BaggingClassifier(base_estimator=gnb, n_estimators=5, random_state=rs, n_jobs=-1)),
    ("Bagging Gausian Naive Bayes10", BaggingClassifier(base_estimator=gnb, n_estimators=10, random_state=rs, n_jobs=-1)),
    ("Bagging Gausian Naive Bayes15", BaggingClassifier(base_estimator=gnb, n_estimators=15, random_state=rs, n_jobs=-1)),
    ("Bagging Gausian Naive Bayes20", BaggingClassifier(base_estimator=gnb, n_estimators=20, random_state=rs, n_jobs=-1))
]

bag_res = evaluateSplits(clf_list=bagging_list, splits=kfold_splits_20)
bag_red_res = evaluateSplits(clf_list=bagging_list, splits=kfold_splits_20_red, X=X_red_scaled)

100%|██████████| 20/20 [00:35<00:00,  1.78s/it]
100%|██████████| 20/20 [00:35<00:00,  1.75s/it]


In [56]:
printResults(bag_res, clf_list=bagging_list)

Bagging Gausian Naive Bayes5
Mean accuracy: 0.6891117125984255
Confusion Matrix:
None


Bagging Gausian Naive Bayes10
Mean accuracy: 0.685962106299213
Confusion Matrix:
None


Bagging Gausian Naive Bayes15
Mean accuracy: 0.6867495078740161
Confusion Matrix:
None


Bagging Gausian Naive Bayes20
Mean accuracy: 0.6875369094488193
Confusion Matrix:
None



Best Result:
Bagging Gausian Naive Bayes5
None
Mean accuracy: 0.6891117125984255


In [57]:
printResults(bag_red_res, clf_list=bagging_list)

Bagging Gausian Naive Bayes5
Mean accuracy: 0.6950079970472444
Confusion Matrix:
None


Bagging Gausian Naive Bayes10
Mean accuracy: 0.6930394931102365
Confusion Matrix:
None


Bagging Gausian Naive Bayes15
Mean accuracy: 0.6950079970472444
Confusion Matrix:
None


Bagging Gausian Naive Bayes20
Mean accuracy: 0.6914708415354334
Confusion Matrix:
None



Best Result:
Bagging Gausian Naive Bayes5
None
Mean accuracy: 0.6950079970472444


In [58]:
bagging_list2 = [
    ("Bagging KNN5 5", BaggingClassifier(base_estimator=knnc5, n_estimators=5, random_state=rs, n_jobs=-1)),
    ("Bagging KNN5 10", BaggingClassifier(base_estimator=knnc5, n_estimators=10, random_state=rs, n_jobs=-1)),
    ("Bagging KNN5 15", BaggingClassifier(base_estimator=knnc5, n_estimators=15, random_state=rs, n_jobs=-1)),
    ("Bagging KNN5 20", BaggingClassifier(base_estimator=knnc5, n_estimators=20, random_state=rs, n_jobs=-1))
]

bag_res2 = evaluateSplits(clf_list=bagging_list2, splits=kfold_splits_20)
bag_red_res2 = evaluateSplits(clf_list=bagging_list2, splits=kfold_splits_20_red, X=X_red_scaled)

100%|██████████| 20/20 [01:07<00:00,  3.39s/it]
100%|██████████| 20/20 [00:59<00:00,  2.97s/it]


In [59]:
printResults(bag_res2, clf_list=bagging_list2)

Bagging KNN5 5
Mean accuracy: 0.8662093996062993
Confusion Matrix:
None


Bagging KNN5 10
Mean accuracy: 0.8634534940944883
Confusion Matrix:
None


Bagging KNN5 15
Mean accuracy: 0.8650252214566929
Confusion Matrix:
None


Bagging KNN5 20
Mean accuracy: 0.8662063238188977
Confusion Matrix:
None



Best Result:
Bagging KNN5 5
None
Mean accuracy: 0.8662093996062993


In [60]:
printResults(bag_red_res2, clf_list=bagging_list2)

Bagging KNN5 5
Mean accuracy: 0.861472687007874
Confusion Matrix:
None


Bagging KNN5 10
Mean accuracy: 0.8602885088582678
Confusion Matrix:
None


Bagging KNN5 15
Mean accuracy: 0.861472687007874
Confusion Matrix:
None


Bagging KNN5 20
Mean accuracy: 0.8575387549212599
Confusion Matrix:
None



Best Result:
Bagging KNN5 5
None
Mean accuracy: 0.861472687007874


## not Scaled

In [61]:
logo_splits_ns = list(LeaveOneGroupOut().split(X,y, groups=df["subject"]))

logo_res_ns = evaluateSplits(splits=logo_splits_ns)

kfold_splits_10_ns = list(KFold(n_splits=10, shuffle=True, random_state=rs).split(X, y))
kfold_splits_20_ns = list(KFold(n_splits=20, shuffle=True, random_state=rs).split(X, y))

k10_res_ns = evaluateSplits(splits=kfold_splits_10_ns)
k20_res_ns = evaluateSplits(splits=kfold_splits_20_ns)

100%|██████████| 56/56 [00:37<00:00,  1.50it/s]
100%|██████████| 10/10 [00:06<00:00,  1.62it/s]
100%|██████████| 20/20 [00:12<00:00,  1.54it/s]


In [62]:
printResults(logo_res_ns)

5 Nearest Neighbor
Mean accuracy: 0.6284839794841881
Confusion Matrix:
None


10 Nearest Neighbor
Mean accuracy: 0.6534577512151951
Confusion Matrix:
None


Linear SVM
Mean accuracy: 0.629685235567785
Confusion Matrix:
None


RBF SVM
Mean accuracy: 0.6512773613021613
Confusion Matrix:
None


Decision Tree
Mean accuracy: 0.5923127475465838
Confusion Matrix:
None


Random Forest
Mean accuracy: 0.695439883941277
Confusion Matrix:
None


Gaussian Naive Bayes
Mean accuracy: 0.5792206860299193
Confusion Matrix:
None



Best Result:
Random Forest
None
Mean accuracy: 0.695439883941277


In [63]:
printResults(k10_res_ns)

5 Nearest Neighbor
Mean accuracy: 0.8606885903967887
Confusion Matrix:
None


10 Nearest Neighbor
Mean accuracy: 0.8579311409603212
Confusion Matrix:
None


Linear SVM
Mean accuracy: 0.7819839431835728
Confusion Matrix:
None


RBF SVM
Mean accuracy: 0.8256677474139262
Confusion Matrix:
None


Decision Tree
Mean accuracy: 0.8307812258761773
Confusion Matrix:
None


Random Forest
Mean accuracy: 0.8441608769492049
Confusion Matrix:
None


Gaussian Naive Bayes
Mean accuracy: 0.6851767793731665
Confusion Matrix:
None



Best Result:
5 Nearest Neighbor
None
Mean accuracy: 0.8606885903967887


In [64]:
printResults(k20_res_ns)

5 Nearest Neighbor
Mean accuracy: 0.8618879183070867
Confusion Matrix:
None


10 Nearest Neighbor
Mean accuracy: 0.8610912893700787
Confusion Matrix:
None


Linear SVM
Mean accuracy: 0.7823665108267719
Confusion Matrix:
None


RBF SVM
Mean accuracy: 0.8315637303149608
Confusion Matrix:
None


Decision Tree
Mean accuracy: 0.8296136811023624
Confusion Matrix:
None


Random Forest
Mean accuracy: 0.8484990157480314
Confusion Matrix:
None


Gaussian Naive Bayes
Mean accuracy: 0.6859621062992128
Confusion Matrix:
None



Best Result:
5 Nearest Neighbor
None
Mean accuracy: 0.8618879183070867


## Export best result

As there is no big difference between scaled and not, the js version uses non scaled data for easier use.

In [65]:
best_clf = GaussianNB()
best_clf.fit(X, y)

print(best_clf.classes_)

porter = Porter(best_clf, language='js')
export = porter.export(embed_data=True)
f = open("naive_bayes_clf.js", "w")
f.write(export)
f.close()

['sitting' 'standing' 'walking']


In [66]:
best_clf2 = KNeighborsClassifier(5, n_jobs=-1)
best_clf2.fit(X, y)

print(best_clf2.classes_)

porter = Porter(best_clf2, language='js')
export = porter.export(embed_data=True)
f = open("knn5_clf.js", "w")
f.write(export)
f.close()

['sitting' 'standing' 'walking']


