# Create models from *Ergo* data
This notebook contains the code for models used to predict the *Ergo* data. See the report [here](https://git.cs.sun.ac.za/Computer-Science/rw771/2022/26723077-TG7-doc) or the source code behind the data [here](https://git.cs.sun.ac.za/Computer-Science/rw771/2022/26723077-TG7-src).

### Items to explore

- What is the optimal number of PCs for model performance?
- Which is the optimal dimensionality reduction method: tSNE, Autoencoder, PCA?
- Compare different model types: Random Forest, SVM, NN, Naïve Bayes, Quadratic Discriminant Analysis
- Restructure this to have a better order
    - imports, get data, PCA on data, train model, visualise model's conf matrix, visualise model's mislabeled observations
- Note: The model might struggle because gesture classification should be independent of when the gesture was performed. How should the training data be modified to account for this? And how can we verify that models aren't fixating on *when* a gesture happens, as opposed to *which* gesture happens?

## Imports and constants

In [None]:
%load_ext autoreload
%autoreload 2

from common_utils import *

import pickle
import re
from time import time
from matplotlib import cm

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Distributions
from sklearn.utils.fixes import loguniform

# Read in the data to an `np.array`

In [None]:
# Load the dataset through `read_to_numpy` (not defined in this notebook; presumably
# provided by the `common_utils` star-import).
# NOTE(review): X / y / paths look like observations, integer gesture labels and the
# source file of each observation — confirm against `common_utils`.
X, y, paths = read_to_numpy()

# Train-test split and scale the data

In [None]:
# Split observations, labels and paths into train/test partitions and keep the scaler
# that is returned alongside them (it is reused later via `scaler.transform`).
# NOTE(review): presumably the scaler is fitted on the training partition only —
# confirm in `train_test_split_scale`.
X_train, X_test, y_train, y_test, paths_train, paths_test, scaler = train_test_split_scale(X, y, paths)

# Train models without dimensionality reduction


Training KNeighborsClassifier(n_neighbors=21)
- Time taken: 153.927s
- Best performing model
`KNeighborsClassifier(algorithm='ball_tree', n_neighbors=21)`
- Score: train: 0.9679, test: 0.9774


Training MLPClassifier(max_iter=1000)
- Time taken: 81.279s
- Best performing model
```MLPClassifier(activation='tanh', alpha=0.0013330009770265291, hidden_layer_sizes=400, max_iter=1000)```
- Score: train: 0.9907, test: 0.9947

Training MLPClassifier(max_iter=1000)
- Time taken: 27.189s
- Best performing model
`MLPClassifier(alpha=0.0016877545702567223, hidden_layer_sizes=100, max_iter=1000)`
- Score: train: 0.9903, test: 0.9950


In [None]:
%%time
models = []

models.append(
    (MLPClassifier(max_iter=1000), {
        'hidden_layer_sizes': [(100), (200), (400), (100, 50), (200, 100), (400, 200), 
                               (100, 50, 25), (200, 100, 50), (400, 200, 100)],
        'activation' : ['logistic', 'tanh', 'relu'],
        'solver' : ['lbfgs', 'adam'],
        'alpha': loguniform(1e-6, 1e-2),
    })
)

clfs = []
for model, param_grid in models:
    print(f'\nTraining {model}')
    start = time()
    clf = RandomizedSearchCV(
        model, param_grid, n_iter=10
    )
    clf = clf.fit(X_train, y_train)
    print(f'- Time taken: {time() - start:.3f}s\n- Best performing model\n`{clf.best_estimator_}`\n- Score: train: {clf.best_score_:.4f}, test: {clf.score(X_test, y_test):.4f}')
    clfs.append(clf.best_estimator_)
    

### Confusion matrix of the model

Get a `pd.DataFrame` with counts of the most often mislabeled gestures

In [None]:
# Build a table of (true, predicted, count) rows for every pair of gestures that the
# classifier confused at least once on the test set.
gesture_labels = list(gesture_to_idx.keys())

# Zero the diagonal on the NumPy matrix *before* wrapping it in a DataFrame: writing
# through `DataFrame.values` is not guaranteed to modify the frame (and is rejected
# when pandas copy-on-write is enabled).
confusion_counts = confusion_matrix(y_test, clf.predict(X_test))
np.fill_diagonal(confusion_counts, 0)

conf_mat = pd.DataFrame(confusion_counts, index=gesture_labels, columns=gesture_labels)
# conf_mat
mislabeled = conf_mat.stack()
mislabeled = mislabeled[mislabeled > 0].reset_index()
mislabeled.columns = ['true', 'predicted', 'count']
# Most frequent confusions first; ties ordered by the gesture labels.
mislabeled = mislabeled.sort_values(
    ['count', 'true', 'predicted'],
    ascending=[False, True, True]
)
mislabeled

In [None]:
# Detailed test-set report and confusion matrix for the first trained classifier.
clf = clfs[0]
y_pred = clf.predict(X_test)
# "<class 'pkg.mod.Name'>" -> "Name": keep the last dotted segment, drop the "'>".
clf_name = str(type(clf)).rsplit('.', 1)[-1][:-2]

print(f"Test set results for {clf_name}")
report = classification_report(y_test, y_pred, target_names=gesture_to_idx.keys())
print(report)

fig, ax = plt.subplots(figsize=(12,12))
ConfusionMatrixDisplay.from_estimator(
    clf, X_test, y_test,
    display_labels=gesture_to_idx.keys(),
    xticks_rotation="vertical",
    ax=ax,
)
ax.grid(False)
ax.set_title(f'Confusion Matrix of \n{clf}')
fig.tight_layout()

# The file name embeds the full estimator repr.
fig.savefig(f'imgs/conf_mat_{clf}.pdf')

### Visualise important features from full-dimensionality classifiers

In [None]:
# Alias the first best estimator; the cells below read its `coefs_` and save it.
# NOTE(review): assumes clfs[0] is the MLP found by the full-dimensionality search above.
mlp = clfs[0]

In [None]:
# One subplot per gesture, laid out on a grid that is `num_cols` wide.
num_cols = 4
fig, axes = plt.subplots(n_classes//num_cols+1, num_cols)

# Global min / max of the first-layer weights.
# NOTE(review): these are computed but never passed on to `plot_raw_gesture`, so the
# subplots are not actually forced onto a shared scale — confirm whether that helper
# accepts vmin/vmax.
vmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()

# Chain the weight matrices of all layers into a single input-by-output matrix.
# This ignores biases and activation functions, so it is only a rough linear proxy
# for importance. It does not depend on the gesture, so it is computed once up front
# rather than once per subplot.
multiplied = mlp.coefs_[0]
for layer_weights in mlp.coefs_[1:]:
    multiplied = multiplied @ layer_weights

for gesture_idx, ax in enumerate(axes.ravel()):
    # The grid may have more cells than there are gestures: blank out the extras.
    if gesture_idx >= n_classes:
        ax.set_xticks(())
        ax.set_yticks(())
        ax.grid(False)
        continue

    # Column `gesture_idx` holds the collapsed weights feeding that output unit.
    # NOTE(review): assumes output unit i corresponds to gesture index i.
    importances = multiplied[:, gesture_idx].reshape(n_timesteps, n_sensors)

    gesture_label = idx_to_gesture[gesture_idx]
    gesture_description = gesture_info[gesture_label]['description']
    plot_raw_gesture(
        importances,
        f'{gesture_label}\n{gesture_description}',
        ax=ax,
        show_cbar=False,
        show_xticks=False,
        show_yticks=False,
        delim_lw=1
    )

plt.suptitle('Importances per gesture for the trained MLP')
plt.tight_layout()
plt.savefig('imgs/importances.pdf')


## Save the trained model

In [None]:
# Persist the selected MLP via the `save_model` helper (defined outside this notebook).
# NOTE(review): the destination path is chosen inside `save_model`; presumably under
# `saved_models/`, where a later cell loads from — confirm.
save_model(mlp)


# Compare self-classified observations with manually classified observations

In [None]:
# Display the repr of the classifier used for the comparison below.
mlp

In [None]:
# Pick the first file of the first gesture found in the self-classified directory,
# scale it with the fitted scaler and plot it.
dir_files_sc = get_dir_files('../gesture_data/self-classified')
gesture = list(dir_files_sc)[0]
file = dir_files_sc[gesture][0]
path = f'../gesture_data/self-classified/{gesture}/{file}'

df = read_to_df(path)
obs = df.to_numpy()
# The scaler is given a 2-D array, so wrap the flattened observation in a single row.
obs_wrapped = np.zeros((1, n_sensors * n_timesteps))
obs_wrapped[0, :] = obs.ravel()
txd = scaler.transform(obs_wrapped)
plot_raw_gesture(txd[0], f'{gesture}\n{path}')

- TODO: create a common method of scaling with a saved scaler before plotting the observation
- TODO: create a method of visualising how data flows through the MLP so you can troubleshoot misclassified real-time predictions

In [None]:
# Display the scaled single-row observation for inspection.
scaler.transform(obs_wrapped)

In [None]:
# Rank the classes by predicted probability for the single self-classified observation.
obs = df.to_numpy()
obs_wrapped = np.zeros((1, n_sensors * n_timesteps))
obs_wrapped[0, :] = obs.ravel()
prediction = clf.predict_proba(scaler.transform(obs_wrapped))
# (column index, probability) pairs, most probable first; ties keep column order.
predictions = sorted(enumerate(prediction[0]), key=lambda ip: -ip[1])
predictions

# Reduce dimensionality via PCA

In [None]:
%%time
n_components = 10

print(f'Fitting PCA with {n_components} components on {X_train.shape[0]} observations')
pca = PCA(
    n_components=n_components,
    svd_solver="randomized", 
    whiten=True
).fit(X_train)

print(f"PCA explained {100*sum(pca.explained_variance_ratio_):.2f}% of the variance with {n_components} PCs")

# PCA-transform the input test and train data
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

## Train multiple models on the data
Each model is defined in its own cell, and appended to the list `models`. 
After all the definitions, every model in the list is trained and evaluated.

In [None]:
# Start from an empty list of (estimator, search space) pairs; the cells below append to it.
# Re-run this cell before re-running them, otherwise entries are appended twice.
models = []

---
Support Vector Machine
- Time taken: 7.731s
- Best performing model
`SVC(C=35891.14381335473, class_weight='balanced', gamma=0.057667711437645666)`
- Score: train: 0.9896, test: 0.9901

In [None]:
# RBF-kernel SVM: sample C and gamma log-uniformly.
rbf_svc = SVC(kernel="rbf", class_weight="balanced")
rbf_svc_space = {
    "C": loguniform(1e3, 1e5),
    "gamma": loguniform(1e-4, 1e-1),
}
models.append((rbf_svc, rbf_svc_space))

---
Linear Support Vector Machine

- Time taken: 367.828s
- Best performing model: SVC(C=1284.851851420933, class_weight='balanced', kernel='linear')
- Score: train: 0.9780, test: 0.9784

In [None]:
# Linear-kernel SVM: only the regularisation strength C is searched.
linear_svc = SVC(kernel="linear", class_weight="balanced")
linear_svc_space = {"C": loguniform(1e3, 1e5)}
models.append((linear_svc, linear_svc_space))

---
K-Nearest Neighbours
- Time taken: 0.820s
- Best performing model
`KNeighborsClassifier(algorithm='ball_tree', n_neighbors=21)`
- Score: train: 0.9849, test: 0.9851

In [None]:
# k-NN with k fixed to the number of classes; only the neighbour-search algorithm varies.
# NOTE(review): all three algorithms perform an exact neighbour search, so this mainly
# compares run time rather than accuracy — consider searching `n_neighbors` instead.
knn = KNeighborsClassifier(n_neighbors=n_classes)
knn_space = {"algorithm": ['ball_tree', 'kd_tree', 'brute']}
models.append((knn, knn_space))

---
Ada Boost
- Time taken: 268.507s
- Best performing model: AdaBoostClassifier(learning_rate=0.20109497487437183, n_estimators=792)
- Score: train: 0.7856, test: 0.7867


In [None]:
# AdaBoost: search the ensemble size and the learning rate.
ada = AdaBoostClassifier()
ada_space = {
    'n_estimators': range(10, 1000),
    'learning_rate': np.linspace(1e-4, 2, num=200),
}
models.append((ada, ada_space))

---
Decision Tree
- Time taken: 1.136s
- Best performing model
`DecisionTreeClassifier(class_weight='balanced', max_depth=186,
                       min_samples_split=0.26530612244897955)`
- Score: train: 0.4396, test: 0.4712

In [None]:
# Decision tree: search the depth limit and the minimum fraction of samples per split.
models.append(
    (DecisionTreeClassifier(class_weight="balanced"), {
        'max_depth': range(1, 200),
        # A float `min_samples_split` must lie in (0, 1]. `np.linspace(0, 1)` starts at
        # exactly 0.0, which scikit-learn rejects (the candidate fails to fit and wastes
        # a search iteration), so the first grid point is dropped. The remaining grid
        # values are unchanged.
        'min_samples_split': np.linspace(0, 1)[1:],
    })
)

---
Random Forest
- Time taken: 31.299s
- Best performing model

`RandomForestClassifier(max_depth=187, max_features=0.836734693877551,
                       min_samples_split=0.16326530612244897, n_estimators=176)`
- Score: train: 0.6110, test: 0.6095

In [None]:
# Random forest: search ensemble size, depth limit, feature fraction and split fraction.
models.append(
    (RandomForestClassifier(), {
        'n_estimators': range(10, 500),
        'max_depth': range(1, 200),
        # Float `max_features` and `min_samples_split` must both lie in (0, 1].
        # `np.linspace(0, 1)` starts at exactly 0.0, which scikit-learn rejects, so the
        # first grid point is dropped. The remaining grid values are unchanged.
        'max_features': np.linspace(0, 1)[1:],
        'min_samples_split': np.linspace(0, 1)[1:],
    })
)

---

Multi-layer Perceptron
- Time taken: 484.457s
- Best performing model

```MLPClassifier(activation='tanh', alpha=2.8761865563186644e-05,
              hidden_layer_sizes=(400, 200), max_iter=1000)```
- Score: train: 0.9914, test: 0.9919

In [None]:
# Multi-layer perceptron: search architecture, activation, solver and L2 penalty.
mlp_estimator = MLPClassifier(max_iter=1000)
mlp_space = {
    # One, two and three hidden layers of decreasing width
    'hidden_layer_sizes': [
        (100), (200), (400),
        (100, 50), (200, 100), (400, 200),
        (100, 50, 25), (200, 100, 50), (400, 200, 100),
    ],
    'activation' : ['logistic', 'tanh', 'relu'],
    'solver' : ['lbfgs', 'adam'],
    'alpha': loguniform(1e-6, 1e-2),
}
models.append((mlp_estimator, mlp_space))

---
Gaussian Naïve Bayes
- Time taken: 0.031s
- Best performing model: GaussianNB()
- Score: train: 0.9255, test: 0.9264

In [None]:
# Gaussian naive Bayes with default settings: the empty dict means no hyper-parameters are searched.
models.append((GaussianNB(), {}))

---
Quadratic Discriminant Analysis
- Time taken: 0.040s
- Best performing model: QuadraticDiscriminantAnalysis()
- Score: train: 0.9854, test: 0.9837

In [None]:
# Quadratic discriminant analysis with default settings: the empty dict means no hyper-parameters are searched.
models.append((QuadraticDiscriminantAnalysis(), {}))

---
## Fit and evaluate all models in `models`
Now actually evaluate the models

In [None]:
# Run a randomised hyper-parameter search for every (estimator, search space) pair in
# `models` on the PCA-reduced data, and keep the best estimator of each search.
clfs = []
for model, param_grid in models:
    print(f'\nTraining {model}')
    start = time()
    clf = RandomizedSearchCV(
        model, param_grid, n_iter=10
    )
    clf = clf.fit(X_train_pca, y_train)
    elapsed = time() - start
    # `best_score_` is the mean cross-validated score of the best candidate, not a
    # score on the training set, so it is reported as "cv".
    print(f'- Time taken: {elapsed:.3f}s\n- Best performing model\n`{clf.best_estimator_}`\n- Score: cv: {clf.best_score_:.4f}, test: {clf.score(X_test_pca, y_test):.4f}')
    clfs.append(clf.best_estimator_)

Alternatively just load a saved model

In [None]:
# Restore a previously saved classifier in place of the freshly trained ones.
# NOTE(review): the cell below predicts on `X_test_pca`; confirm this saved model was
# trained on PCA-transformed input and not on the full-dimensionality data.
model_path = (
    "saved_models/"
    "MLPClassifier(activation='tanh',alpha=5.532519953153552e-05,"
    "hidden_layer_sizes=(400,200),max_iter=1000,solver='lbfgs').pickle"
)
clfs = [load_model(model_path)]

# Restore the scaler that was saved with it
scaler_path = "saved_models/StandardScaler().pickle"
scaler = load_model(scaler_path)

# Restore the index-to-gesture mapping.
# pickle executes arbitrary code on load: only open files that this project wrote itself.
with open('saved_models/idx_to_gesture.pickle', 'rb') as f:
    idx_to_gesture = pickle.load(f)

### Get detailed analyses of the trained models

In [None]:
# For every classifier: print a per-gesture report and save its confusion matrix.
for clf in clfs:
    y_pred = clf.predict(X_test_pca)
    # "<class 'pkg.mod.Name'>" -> "Name": keep the last dotted segment, drop the "'>".
    clf_name = str(type(clf)).rsplit('.', 1)[-1][:-2]

    print(f"Test set results for {clf_name}")
    report = classification_report(y_test, y_pred, target_names=gesture_to_idx.keys())
    print(report)

    fig, ax = plt.subplots(figsize=(12,12))
    ConfusionMatrixDisplay.from_estimator(
        clf, X_test_pca, y_test,
        display_labels=gesture_to_idx.keys(),
        xticks_rotation="vertical",
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix of \n{clf}')
    fig.tight_layout()

    # The file name embeds the full estimator repr.
    fig.savefig(f'imgs/conf_mat_{clf}.pdf')

## Plot the incorrect observations
TODO: this should actually show the model activations for the actual and predicted gestures, and not just some random (maybe unrepresentative) example observation

In [None]:
clf = clfs[0]
# NOTE(review): this predicts on the full-dimensionality `X_test`; confirm clfs[0] was
# trained on full-dimensionality input and not on the PCA-transformed data.
y_pred = clf.predict(X_test)

# Keep only the misclassified test observations (with their labels and paths)
incorrect = y_pred != y_test
X_test_incorrect = X_test[incorrect]
y_pred_incorrect = y_pred[incorrect]
y_test_incorrect = y_test[incorrect]
paths_test_incorrect = paths_test[incorrect]


@interact(idx=(0, y_pred_incorrect.shape[0]-1, 1))
def plot_incorrect(idx=0):
    """Plot one misclassified test observation between two reference examples.

    Three stacked axes are drawn:
    - top: the first training observation of the *actual* gesture,
    - middle: the misclassified test observation itself,
    - bottom: the first training observation of the *predicted* gesture.

    Args:
        idx: position within the misclassified observations (0-based).
    """
    predicted = idx_to_gesture[y_pred_incorrect[idx]]
    pred_desc = gesture_info[predicted]['description']

    actual = idx_to_gesture[y_test_incorrect[idx]]
    actu_desc = gesture_info[actual]['description']

    fig, axs = plt.subplots(3)

    # First plot an example of the actual gesture: look it up by the TRUE label
    # (previously the predicted label was used here, swapping top and bottom panels).
    actual_idx = next(i for i, yi in enumerate(y_train) if yi == y_test_incorrect[idx])
    gesture_label = idx_to_gesture[y_train[actual_idx]]
    gesture_description = gesture_info[gesture_label]["description"]
    plot_raw_gesture(
        X_train[actual_idx],
        f'Example of {gesture_label} ({gesture_description})',
        ax=axs[0],
        show_xticks=False,
    )

    # Second plot the misclassified gesture
    plot_raw_gesture(
        X_test_incorrect[idx],
        f'Predicted: {predicted} ({pred_desc})\nActual: {actual} ({actu_desc})\n{paths_test_incorrect[idx]}',
        ax=axs[1],
        show_xticks=False,
    )

    # Last plot an example of the predicted gesture: look it up by the PREDICTED label
    predicted_idx = next(i for i, yi in enumerate(y_train) if yi == y_pred_incorrect[idx])
    gesture_label = idx_to_gesture[y_train[predicted_idx]]
    gesture_description = gesture_info[gesture_label]["description"]
    plot_raw_gesture(
        X_train[predicted_idx],
        f'Example of {gesture_label} ({gesture_description})',
        ax=axs[2],
        show_xticks=False,
    )

    # Finally, tell matplotlib to recompute the layout
    plt.tight_layout()
    print(paths_test_incorrect[idx])

## Plot 2-component PCA to assess separation
TODO: indicate for a given model which observations were correctly predicted

In [None]:
# PCA can be either 2D or 3D
PLOT_2D = True

# Transform the data via PCA. Either 2 or 3 components are used.
# Note that this re-binds `pca` and is fitted on the full, unscaled `X`.
pca = PCA(n_components=(2 if PLOT_2D else 3))
X_r = pca.fit(X).transform(X)

# Each gesture class gets a different colour on the scatter plot, and
# similar colours get different markers to better differentiate them.
# `matplotlib.cm.get_cmap` is deprecated and removed in recent Matplotlib releases;
# `plt.get_cmap` takes the same (name, lut) arguments and is still supported.
colours = plt.get_cmap('turbo', n_classes)
markers = ['.', 'x', '*', 'd']


if PLOT_2D:
    # Use 2D subplots
    fig, ax = plt.subplots(figsize=(12,8))
else:
    # Use 3D subplots
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')

# In 2D, also write each observation's index next to its point.
# This helps when removing outliers, but increases the amount of clutter
if PLOT_2D:
    for i, yi in enumerate(y):
        ax.annotate(
            i,
            (X_r[i, 0], X_r[i, 1]),
            color=colours(yi/n_classes),
            size=5,
            alpha=0.5,
        )

# Iterate over each label/gesture
for label_idx in idx_to_gesture.keys():
    # Boolean mask selecting the observations of this gesture
    is_label = y == label_idx

    # Args either has 2 items (if 2D) or 3 (if 3D)
    args = [
        X_r[is_label, 0],
        X_r[is_label, 1],
    ]
    if not PLOT_2D:
        args.append(X_r[is_label, 2])

    # Get a shortened version of the gesture index for the legend
    gesture_idx = idx_to_gesture[label_idx].replace('gesture', '')
    # Get the short gesture description for the legend
    gesture_desc = gesture_info[idx_to_gesture[label_idx]]["desc"]

    # Actually plot the points, either in 2 or 3 dimensions
    ax.scatter(
        *args,
        color=colours(label_idx/n_classes),
        alpha=0.3,
        s=10,
        marker=markers[label_idx % len(markers)],
        label=f'{gesture_idx} ({gesture_desc})'
    )

# ----------------------------------------------------------------
#
#   modified from https://stackoverflow.com/a/4701285/14555505
#
# Shrink current axis's height by 20% on the bottom so the legend will fit
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.2,
                 box.width, box.height * 0.80])
# Put a legend below current axis in the newly made space
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=4)
# ----------------------------------------------------------------

# Give the plot a title and save it
plt.title(f"PCA with {'two' if PLOT_2D else 'three'} components over {n_classes} gestures")
filename = f'imgs/{2 if PLOT_2D else 3}_pca_{n_classes}_classes_{n_obs}_obs.pdf'
plt.savefig(filename)
print(f'Saved as {filename}')

Define a widget that, given an observation's index, will display the raw sensor measurements. This is
useful for identifying and removing outliers or bad observations.

In [None]:
@interact(idx='0')
def plot_from_index(idx='0'):
    """Plot the raw sensor measurements of the observation at index `idx`.

    The index arrives as text from the widget, which re-invokes this function on every
    edit. Empty, non-numeric or out-of-range input is therefore reported and ignored
    rather than raising.

    Args:
        idx: text holding the integer index of the observation in `X` / `y` / `paths`.
    """
    if len(idx) == 0:
        return
    try:
        idx = int(idx)
    except ValueError:
        print(f'Not a valid observation index: {idx!r}')
        return
    if not -len(y) <= idx < len(y):
        print(f'Index {idx} is out of range for {len(y)} observations')
        return

    gesture_idx = idx_to_gesture[y[idx]]
    plot_raw_gesture(
        X[idx],
        f'{gesture_idx}: {gesture_info[gesture_idx]["description"]}\n{paths[idx]}'
    )
    print(f'{gesture_info[gesture_idx]["description"]}')
    # Ready-to-paste shell command for deleting a bad observation's file
    print('rm ' + '/'.join(paths[idx].split('/')[3:]))