# Create models from *Ergo* data
This notebook contains the code for models used to predict the *Ergo* data. See the report [here](https://git.cs.sun.ac.za/Computer-Science/rw771/2022/26723077-TG7-doc) or the source code behind the data [here](https://git.cs.sun.ac.za/Computer-Science/rw771/2022/26723077-TG7-src).

### Items to explore

- Compare different model types: Random Forest, SVM, NN, Naïve Bayes, Quadratic Discriminant Analysis
- Note: The model might struggle because gesture classification should be independent of when the gesture was performed. How should the training data be modified to account for this? And how can we verify that models aren't fixating on *when* a gesture happens, as opposed to *which* gesture happens?
- Note: It's possible that gestures recorded sequentially are unnaturally similar to each other. Maybe rather try recording 60 observations for gesture X, then 60 for gesture Y, and then 60 for gesture X again to get more variety.

- TODO: create a common method of scaling with a saved scaler before plotting the observation
- TODO: create a method of visualising how data flows through the MLP so you can troubleshoot misclassified real-time predictions
- TODO: indicate for a given model which observations were correctly predicted
- TODO: this should actually show the model activations for the actual and predicted gestures, and not just some random (maybe unrepresentative) example observation


### The overall process
0. Imports and constants, reading in data, scaling data
1. Find outliers via PCA
2. Remove outliers
3. Train many models on the full dimensionality dataset, and save them all
4. Evaluate ONE saved model
    - Confusion matrix
    - Plots of all incorrectly classified observations
    - Visualise important features
5. Self-classify some observations in real time
6. Audit how well the model predicts those real-time observations

## 0.1 Imports and constants

In [None]:
%load_ext autoreload
%autoreload 2

from common_utils import *

from time import time
from matplotlib import cm

# Preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA

# Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Distributions
from sklearn.utils.fixes import loguniform

## 0.2 Read in the data to an `np.array`

In [None]:
# Load the observations, their label indices and the file each one came from.
# NOTE(review): min_obs=0 presumably disables filtering out gestures with few
# observations -- confirm against read_to_numpy.
X, y, paths = read_to_numpy(min_obs=0)

# Dataset size: total observations and number of distinct labels present.
n_obs = len(y)
n_classes = len(np.unique(y))

# Per-gesture metadata, keyed by gesture name (e.g. its description).
gesture_info = get_gesture_info()

# Lookup tables between label indices and gesture names, read from the
# pickles stored in saved_models/.
with open('saved_models/idx_to_gesture.pickle', 'rb') as fh:
    idx_to_gesture = pickle.load(fh)
with open('saved_models/gesture_to_idx.pickle', 'rb') as fh:
    gesture_to_idx = pickle.load(fh)


## 0.3 Train-test split and scale the data

In [None]:
# Partition the features, labels and source paths into train and test sets.
split = train_test_split_scale(X, y, paths)
X_train, X_test, y_train, y_test, paths_train, paths_test = split

# Load the scaler pickle so individual observations can be scaled later on.
# NOTE(review): presumably this is the scaler fitted (and saved) by
# train_test_split_scale above -- confirm in common_utils.
scaler = load_model("saved_models/StandardScaler().pickle")

# Report the shapes of the four arrays as a quick sanity check.
print(f'{X_train.shape=}\n{y_train.shape=}\n\n{X_test.shape=}\n{y_test.shape=}')

# 1. Find any outliers via PCA or t-SNE

In [None]:
# Project the full dataset down to two or three dimensions and scatter-plot
# it with one colour/marker combination per gesture, so that outlying
# observations can be spotted by eye.

# The projection can be plotted in either 2D or 3D
PLOT_2D = True
# Annotate every point with its observation index (2D only). This helps when
# removing outliers, but increases the amount of clutter
INCL_LABELS = False
# Pass X through the saved scaler before reducing its dimensionality
SCALE_DATA = False
# Which dimensionality reduction technique to use
DIM_REDUCT = ['tsne', 'pca'][1]
from sklearn.manifold import TSNE

# Either 2 or 3 components are used, depending on the kind of plot
n_components = 2 if PLOT_2D else 3
if DIM_REDUCT == 'pca':
    dim_red = PCA(n_components=n_components)
elif DIM_REDUCT == 'tsne':
    dim_red = TSNE(n_components=n_components, random_state=42)
else:
    # Fail loudly instead of hitting a NameError on `dim_red` further down
    raise ValueError(f'Unknown dimensionality reduction method: {DIM_REDUCT!r}')

X_r = dim_red.fit_transform(scaler.transform(X) if SCALE_DATA else X)

# Each gesture gets a different colour on the scatter plot, and
# similar colours get different markers to better differentiate them.
# plt.get_cmap is used because cm.get_cmap has been removed from recent
# matplotlib releases.
colours = plt.get_cmap('turbo', n_classes)
markers = ['.', 'x', '*', 'd', 's']

if PLOT_2D:
    # Use 2D subplots
    fig, ax = plt.subplots(figsize=(12,8))
else:
    # Use 3D subplots
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')

# Optionally also plot the observation indices along with the points.
if PLOT_2D and INCL_LABELS:
    for i, yi in enumerate(y):
        ax.annotate(
            str(i),
            (X_r[i, 0], X_r[i, 1]),
            color=colours(yi/n_classes),
            size=5,
            alpha=0.5,
        )

# Iterate over each label/gesture
for label_idx in idx_to_gesture:
    # Select this gesture's observations once and reuse the mask per axis
    mask = y == label_idx
    # Coords has either 2 items (if 2D) or 3 (if 3D)
    coords = [X_r[mask, axis] for axis in range(n_components)]

    # Get a shortened version of the gesture index for the legend
    gesture_idx = idx_to_gesture[label_idx].replace('gesture0', 'g')
    # Get the short gesture description for the legend
    # NOTE(review): other cells read the "description" key instead of
    # "desc" -- confirm gesture_info really provides both.
    gesture_desc = gesture_info[idx_to_gesture[label_idx]]["desc"]

    # Actually plot the points, either in 2 or 3 dimensions
    ax.scatter(
        *coords,
        color=colours(label_idx/n_classes),
        alpha=0.3 if INCL_LABELS else 1.0,
        s=10 if gesture_desc != 'unknown' else 30,
        marker=markers[label_idx % len(markers)],
        label=f'{gesture_idx} ({gesture_desc})'
    )

# ----------------------------------------------------------------
#
#   modified from https://stackoverflow.com/a/4701285/14555505
#
# Shrink the current axis's height so the legend will fit below it
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.2,
                 box.width, box.height * 0.80])
# Put a legend below current axis in the newly made space
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=4)
# ----------------------------------------------------------------

# Give the plot a title and save it. Both name the technique that was
# actually used rather than always claiming PCA.
method_name = {'pca': 'PCA', 'tsne': 't-SNE'}[DIM_REDUCT]
plt.title(f"{method_name} with {'two' if PLOT_2D else 'three'} components over {n_classes} gestures")
filename = f'imgs/{n_components}_{DIM_REDUCT}_{n_classes}_classes_{n_obs}_obs.pdf'
plt.savefig(filename)
print(f'Saved as {filename}')

# 2. Remove outliers
This widget allows you to plot an observation along with its original path and other metadata.
After finding an outlier's observation index on the PCA plot, you can graph the actual observation
here. If it is actually an outlier, this widget will also give you the path of the original observation, 
from which you can remove the original csv file defining the observation and thereby remove it from the
dataset.

All paths are relative to the `ergo/machine_learning/` directory.

In [None]:
@interact(idx='0')
def plot_from_index(idx='0'):
    """Plot a single observation and print the command that would delete it.

    The observation is scaled with the saved scaler before plotting, and the
    plot title holds the gesture name, its description and the file name of
    the observation. Empty, non-numeric or out-of-range input is ignored
    with a short message, because the text widget calls this function on
    every keystroke.

    Args:
        idx: Index of the observation in ``X``/``y``/``paths``, given as
            text (an int is accepted too).
    """
    text = str(idx).strip()
    if not text:
        return
    try:
        obs_idx = int(text)
    except ValueError:
        print(f'{text!r} is not a valid observation index')
        return
    # Negative indices keep their usual Python meaning
    if not -len(y) <= obs_idx < len(y):
        print(f'Index {obs_idx} is out of range for {len(y)} observations')
        return

    gesture_idx = idx_to_gesture[y[obs_idx]]
    description = gesture_info[gesture_idx]["description"]
    plot_raw_gesture(
        from_flat(scale_single(X[obs_idx], scaler)),
        f'{gesture_idx}: {description}\n{paths[obs_idx].split("/")[-1]}',
    )
    print(f'{description}')
    # Shell command that removes the observation's source file
    print('rm ' + paths[obs_idx])

# 3.0 Train the MLP and save it


In [None]:
%%time
# Each entry pairs an unfitted estimator with the parameter distributions
# that RandomizedSearchCV samples candidates from.
models = []

models.append(
    (MLPClassifier(max_iter=1000), {
        # Plain ints: each candidate is a single hidden layer of that width.
        # Deeper candidates such as (100, 50) or (200, 100, 50) can be added.
        'hidden_layer_sizes': [50, 100, 200],
        'activation' : ['logistic', 'tanh', 'relu'],
        'alpha': loguniform(1e-6, 1e-2),
    })
)

# Best estimator found for each entry of `models`
clfs = []
for model, param_grid in models:
    print(f'\nTraining {model}')
    start = time()
    # n_iter=1 samples a single candidate, so this is a quick smoke-test
    # fit rather than a real search; raise n_iter to explore more settings.
    clf = RandomizedSearchCV(
        model, param_grid, n_iter=1
    )
    clf.fit(X_train, y_train)
    elapsed = time() - start
    test_score = clf.score(X_test, y_test)
    # best_score_ is the mean cross-validated score of the best candidate,
    # not the score on the training data, so it is labelled accordingly.
    print(
        f'- Time taken: {elapsed:.3f}s\n'
        f'- Best performing model\n`{clf.best_estimator_}`\n'
        f'- Score: cross-validation: {clf.best_score_:.4f}, test: {test_score:.4f}'
    )
    clfs.append(clf.best_estimator_)
    save_model(clf.best_estimator_)
    

# 4.0 Evaluate a single saved model

- List of mislabelled gestures
- Confusion matrix
- Visualise important features
- Plots of all incorrectly classified observations


In [None]:
# Collect every saved classifier. os.listdir returns entries in arbitrary
# order, so the paths are sorted to make "the first model" deterministic.
model_paths = sorted(
    'saved_models/' + p for p in os.listdir('saved_models') if 'Classifier' in p
)
print('\n'.join(model_paths))
if not model_paths:
    raise FileNotFoundError(
        "No saved classifier found in 'saved_models'; train a model first"
    )

# Only the first saved model is evaluated by the cells that follow
clfs = [load_model(model_paths[0])]
clf = clfs[0]
clf

## 4.1 Get a list of mislabeled gestures
Get a `pd.DataFrame` with counts of the most often mislabeled gestures

In [None]:
# Build a table of (true gesture, predicted gesture, count) for every kind
# of misclassification on the test set, most frequent first.

# Fix the label order explicitly so the matrix always has one row/column per
# known gesture, even if a gesture is missing from the test split, and so
# the gesture names are guaranteed to line up with the matrix rows.
labels = sorted(idx_to_gesture)
gesture_names = [idx_to_gesture[label] for label in labels]

counts = confusion_matrix(y_test, clf.predict(X_test), labels=labels)
# Zero the diagonal (correct predictions) on the array itself, rather than
# writing through DataFrame.values, which is not guaranteed to be writable.
np.fill_diagonal(counts, 0)
conf_mat = pd.DataFrame(counts, index=gesture_names, columns=gesture_names)

# Flatten to one row per (true, predicted) pair and keep only actual errors
mislabeled = conf_mat.stack()
mislabeled = mislabeled[mislabeled > 0].reset_index()
mislabeled.columns = ['true', 'predicted', 'count']
mislabeled = mislabeled.sort_values(
    ['count', 'true', 'predicted'],
    ascending=[False, True, True]
)
mislabeled

## 4.2 Confusion matrix of the model: plot and save

In [None]:
# Plot the confusion matrix of the loaded model on the test set and save it.
clf = clfs[0]
y_pred = clf.predict(X_test)
clf_name = type(clf).__name__

# Fix the label order explicitly so the tick labels are guaranteed to match
# the matrix rows, even if a gesture is missing from the test split.
labels = sorted(idx_to_gesture)

fig, ax = plt.subplots(figsize=(12,12))
# Reuse the predictions computed above instead of predicting a second time
ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred,
    labels=labels,
    display_labels=[idx_to_gesture[label] for label in labels],
    xticks_rotation="vertical",
    ax=ax,
)
ax.grid(False)
ax.set_title(f'Confusion Matrix of \n{clf}')
plt.tight_layout()

# The estimator's repr may wrap over several lines; collapse the whitespace
# so that the file name never contains a newline.
safe_name = ' '.join(str(clf).split())
plt.savefig(f'imgs/conf_mat_{safe_name}.pdf')

## 4.3 Visualise important features

In [None]:
# Draw one "importance map" per gesture: the product of all the MLP's weight
# matrices, which links every input feature to every output unit. Biases and
# activation functions are ignored, so this is only a rough linear picture
# of what the network responds to.
num_cols = 5
fig, axes = plt.subplots(int(np.ceil(n_classes/num_cols)), num_cols)

# Global min / max of the first layer's weights.
# NOTE(review): these are not passed to plot_raw_gesture, so the subplots
# are not actually forced onto a shared colour scale.
vmin, vmax = clf.coefs_[0].min(), clf.coefs_[0].max()

# The chained product does not depend on the gesture, so compute it once
multiplied = clf.coefs_[0]
for layer_weights in clf.coefs_[1:]:
    multiplied = multiplied @ layer_weights

# importances_mat[g] holds the importance map of output unit g
importances_mat = np.zeros((n_classes, n_timesteps, n_sensors))
for gesture_idx, ax in enumerate(axes.ravel()):
    if gesture_idx >= n_classes:
        # Left-over axes in the last row of the grid are blanked out
        ax.set_xticks(())
        ax.set_yticks(())
        ax.grid(False)
        continue

    importances = multiplied[:, gesture_idx].reshape(n_timesteps, n_sensors)
    importances_mat[gesture_idx] = importances

    gesture_label = idx_to_gesture[gesture_idx]
    gesture_description = gesture_info[gesture_label]['description']
    plot_raw_gesture(
        importances,
        f'{gesture_label}\n{gesture_description}',
        ax=ax,
        show_cbar=False,
        show_xticks=False,
        show_yticks=False,
        delim_lw=1,
    )

plt.suptitle('Importances per gesture for the trained MLP')
plt.tight_layout()
# The estimator's repr may wrap over several lines; collapse the whitespace
# so that the file name never contains a newline.
safe_name = ' '.join(str(clf).split())
plt.savefig(f'imgs/importances_{safe_name}.pdf')

## 4.4 Plot all incorrectly labelled observations

In [None]:
# Predict the whole test set, then keep only the observations the model got
# wrong, together with their predicted labels, true labels and source paths.
y_pred = clf.predict(X_test)
incorrect = y_pred != y_test

X_test_incorrect = X_test[incorrect]
y_pred_incorrect = y_pred[incorrect]
y_test_incorrect = y_test[incorrect]
paths_test_incorrect = paths_test[incorrect]


@interact(idx=(0, max(1, y_pred_incorrect.shape[0]-1), 1))
def plot_incorrect(idx=0):
    """Plot one misclassified test observation between two importance maps.

    Three panels are drawn side by side: the importance map of the actual
    gesture, the misclassified observation itself, and the importance map of
    the gesture the model predicted. The figure is saved under ``imgs/``.
    ``importances_mat`` must already have been computed by the importances
    cell.

    Args:
        idx: Position within the misclassified test observations.
    """
    n_incorrect = y_pred_incorrect.shape[0]
    if n_incorrect == 0:
        print('All gestures were correctly predicted')
        return
    # The slider's upper bound is at least 1, so it can offer an index one
    # past the end when only a single observation was misclassified.
    if idx >= n_incorrect:
        print(f'Only {n_incorrect} misclassified observation(s); index {idx} is out of range')
        return

    # The labels come straight from the true/predicted label indices; no
    # search through the training labels is needed to look them up.
    actual_label_idx = y_test_incorrect[idx]
    act_gesture_label = idx_to_gesture[actual_label_idx]
    act_gesture_description = gesture_info[act_gesture_label]["description"]

    predicted_label_idx = y_pred_incorrect[idx]
    pred_gesture_label = idx_to_gesture[predicted_label_idx]
    pred_gesture_description = gesture_info[pred_gesture_label]["description"]

    # File name of the misclassified observation (paths use '/' separators)
    full_path = paths_test_incorrect[idx]
    name = full_path.split('/')[-1]

    # Create 3 horizontal axs:
    # - left is the importance map of the actual gesture,
    # - middle is the misclassified observation,
    # - right is the importance map of the predicted gesture
    fig, axs = plt.subplots(1, 3)

    # First plot the importance map of the actual gesture
    plot_raw_gesture(
        importances_mat[int(actual_label_idx)],
        f'Actual {act_gesture_label}\n{act_gesture_description}',
        ax=axs[0],
        show_xticks=False,
        show_cbar=False,
    )

    # Second plot the misclassified observation
    plot_raw_gesture(
        from_flat(X_test_incorrect[idx]),
        f'Mislabelled\n{name}',
        ax=axs[1],
        show_yticks=False,
        show_xticks=False,
        show_cbar=False,
    )

    # Last plot the importance map of the predicted gesture
    plot_raw_gesture(
        importances_mat[int(predicted_label_idx)],
        f'Predicted {pred_gesture_label}\n{pred_gesture_description}',
        ax=axs[2],
        show_yticks=False,
        show_xticks=False,
        show_cbar=False,
    )

    # Finally, tell matplotlib to recompute the layout
    plt.tight_layout()
    print(full_path)
    filename = f'imgs/misclassified_predicted_{pred_gesture_label}_actually_{act_gesture_label}_file_{name}.pdf'
    plt.savefig(filename)


In [None]:
# Plot a single observation file, optionally scaled with the saved scaler,
# and save the figure under imgs/.
SCALED = False
path = '../gesture_data/train/gesture0000/2022-07-11T15:28:53.591713.csv'
obs = read_to_ndarray(path)

# Pick what to draw: the scaled observation or the raw values as read
if SCALED:
    to_plot = scale_single(obs, scaler)
else:
    to_plot = obs

plot_raw_gesture(to_plot, title=f'{path}')

# The figure is named after the last component of the observation's path
name = path.split(os.sep)[-1]
filename = f'imgs/visualise_file_{name}.pdf'
plt.savefig(filename)


## 5.0 Evaluate trained model based on its real-time predictions

In [None]:
# Read one observation from disk, print the model's predictions for it, and
# plot the scaled observation.
path = '../gesture_data/train/gesture0009/2022-06-29T22:02:57.509842+02:00.txt'
obs = read_to_ndarray(path)
print(f"Reading from {path}:\n{scale_single(obs, scaler)[:5, :5]}")
print(obs.shape)

# Predict on the observation that was just read. This previously passed
# `df`, which this cell never defines, so it either raised a NameError or
# silently used stale data left behind by another cell.
predictions = predict_nicely(obs, clf, scaler, idx_to_gesture)
for gesture_idx, proba in predictions:
    # NOTE(review): stopping at the first tiny probability assumes
    # predict_nicely returns predictions sorted from most to least likely.
    if proba < 0.0001:
        break
    print(f'{gesture_idx}: {proba*100:.2f}%')

plot_raw_gesture(scale_single(obs, scaler), f'{path}', show_values=True)

## Plot unknown self-classified observations

In [None]:
# Paths of all self-classified observations that were labelled 'unknown',
# sorted by file name.
uk_paths = [
    '../gesture_data/self-classified/unknown/' + f
    for f in sorted(os.listdir('../gesture_data/self-classified/unknown/'))
    if '.txt' in f
]

# The slider starts at 0 so the first file is reachable too, and its upper
# bound never drops below 0 when there are no files.
@interact(idx=(0, max(0, len(uk_paths)-1), 1))
def plot_from_index2(idx=0):
    """Print the model's predictions for one 'unknown' observation and plot it.

    Args:
        idx: Position of the observation within ``uk_paths``.
    """
    if not 0 <= idx < len(uk_paths):
        print(f'No unknown observation at index {idx} ({len(uk_paths)} available)')
        return
    path = uk_paths[idx]
    df = read_to_df(path)
    obs = df.to_numpy()
    print(f"Reading from {path}")
    predictions = predict_nicely(obs, clf, scaler, idx_to_gesture)
    for gesture_idx, proba in predictions:
        print(f'{gesture_idx}: {proba*100:.2f}%')

    plot_raw_gesture(scale_single(obs, scaler), f'{idx=}\n{path=}')

In [None]:
# Predict and plot one self-classified observation.

# By default, take the first file of the first gesture directory...
dir_files_sc = get_dir_files('../gesture_data/self-classified')
gesture = next(iter(dir_files_sc))
file = dir_files_sc[gesture][0]
path = f'../gesture_data/self-classified/{gesture}/{file}'

# ...but a specific observation is hard-coded here. Remove this override to
# fall back to the default chosen above.
path = '../gesture_data/self-classified/gesture0007/2022-07-02T17:35:57.260752.txt'
# Take the gesture from the path that is actually read, so that the plot
# title cannot pair one gesture's name with another gesture's file.
gesture = path.split('/')[-2]

df = read_to_df(path)
print(f"Reading from {path}:\n{df.to_numpy()[:5, :5]}")

predictions = predict_nicely(df, clf, scaler, idx_to_gesture)
for gesture_idx, proba in predictions:
    # NOTE(review): stopping at the first tiny probability assumes
    # predict_nicely returns predictions sorted from most to least likely.
    if proba < 0.0001:
        break
    print(f'{gesture_idx}: {proba*100:.2f}%')

plot_raw_gesture(df, f'{gesture}\n{path}')