# Testing a BRF model trained on the CoEDC dataset using Bach's features, MOTUS categories and impure windows, with hyperparameters optimised by 5-fold CV and LOSO validation of the best model.


The notebook parameters are set here, right at the top of the notebook, so that they can be changed easily.

In [None]:
# size of windows in seconds; passed to utils.make_windows as winsec and also
# embedded in the confusion-matrix file names
window_size = 5

# pure windows or majority? Passed to utils.make_windows as drop_impure.
# The notebook later sets this to True to repeat the evaluation on pure windows.
PURE_WINDOWS = False

# accelerometer to be analysed; valid values are 'acg', 'axivity' and 'sens'
# (used as acc_prefix when loading data; 'axivity' currently raises "not tested yet")
accelerometer = 'sens'

# any participants to exclude. Note that if processed_data_dir exists, then this will be ignored
# NOTE(review): neither this list nor processed_data_dir is referenced elsewhere in the
# visible notebook (test-time exclusions use TEST_PARTICIPANTS_TO_EXCLUDE) - confirm it is still needed
PARTICIPANTS_TO_EXCLUDE = []

# annotation values removed from the mapped data before the windows are made
values_to_drop_before = ['Unknown']
# window labels removed after the windows have been made
values_to_drop_after = ['Other']

# location of the test data, relative to the user's home directory
TEST_DATA_DIR = 'src/dc_data/test'

# prefix of the saved model file, which is read as f'{file_prefix}_final_model.pkl'
file_prefix = 'BRF_Bach_MOTUS'

Import all the libraries

In [None]:
%reload_ext autoreload
%autoreload 2

import os
import sys
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from joblib import load
from tqdm.auto import tqdm
from sklearn.metrics import confusion_matrix, classification_report

# Resolve the directories the notebook works with: the user's home directory,
# the current working directory and its parent
home_directory = os.path.expanduser("~")
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))

# Make the parent directory importable. This must happen before the local imports below
# (utils, plot, cf_matrix), which presumably live there - verify against the repository layout
sys.path.append(parent_dir)
test_data_dir = os.path.join(home_directory, TEST_DATA_DIR)
# directory in which motus_class_map.json is looked up when mapping annotations
map_dir = parent_dir

import utils
import plot
import cf_matrix

# Display up to 999 rows and never truncate column contents when showing DataFrames
pd.options.display.max_rows = 999
pd.options.display.max_colwidth = None

# For reproducibility
np.random.seed(42)

## Load test data

In [None]:
def load_all_and_make_windows(datafiles):
    """Load each data file, cut it into windows and concatenate the results.

    The notebook-level settings ``accelerometer``, ``map_dir``,
    ``values_to_drop_before``, ``values_to_drop_after``, ``window_size``,
    ``PURE_WINDOWS`` and ``test_acc_missing`` are read when the function is
    called, so changing any of them affects the next call.

    Parameters
    ----------
    datafiles : iterable of str
        Paths of the data files to load. A file whose base name appears in
        ``test_acc_missing`` is skipped.

    Returns
    -------
    X, Y, T, pid : numpy.ndarray
        The concatenated outputs of ``utils.make_windows`` for all processed
        files (X: windows, Y: window labels, T: its third output, presumably
        the window times - confirm against utils) and, per window, the
        participant ID taken from the file name.

    Raises
    ------
    ValueError
        If ``datafiles`` is empty or every file in it was skipped. Without
        this check ``np.concatenate`` fails with a much less helpful
        "need at least one array to concatenate".
    """
    datafiles = list(datafiles)
    if not datafiles:
        raise ValueError("No data files to load: the list of data files is empty.")

    def worker(datafile):
        # Load one file, map its annotations and turn it into labelled windows
        print("\nProcessing", datafile)
        data = utils.load_data(datafile, acc_prefix=accelerometer)
        data = utils.map_to_new_classes(data, 'annotation', os.path.join(map_dir, 'motus_class_map.json'), verbose=True)
        # Remove unwanted annotations before windowing ...
        data = data[~data['annotation'].isin(values_to_drop_before)]
        X, Y, T = utils.make_windows(data, winsec=window_size, sample_rate=30, dropna=False, verbose=True, drop_impure=PURE_WINDOWS)
        # ... and unwanted window labels after windowing
        mask = ~np.isin(Y, values_to_drop_after)
        X, Y, T = X[mask], Y[mask], T[mask]
        print(f'After dropping {values_to_drop_after}, there are {len(X)} windows left')
        pid = os.path.basename(datafile).split(".")[0]  # participant ID
        pid = np.asarray([pid] * len(X))
        return X, Y, T, pid

    # Build the skip set once instead of scanning the list for every file
    skipped = set(test_acc_missing)

    results = []
    for datafile in tqdm(datafiles):
        if os.path.basename(datafile) in skipped:
            print("\nSkipping", datafile)
            continue
        results.append(worker(datafile))

    if not results:
        raise ValueError("No data files left to load: every data file was skipped.")

    # zip(*results) regroups the per-file (X, Y, T, pid) tuples into one
    # sequence per output, each of which is then concatenated
    X, Y, T, pid = (np.concatenate(parts) for parts in zip(*results))

    return X, Y, T, pid

In [None]:
# Numbers of additional test participants to leave out of the evaluation
TEST_PARTICIPANTS_TO_EXCLUDE = []

# Numbers of the participants whose files are skipped for the chosen accelerometer
# (presumably because their accelerometer data is missing - confirm)
if accelerometer == 'acg':
    TEST_ACC_MISSING = [26, 27, 37]
elif accelerometer == 'axivity':
    raise NotImplementedError("Axivity data not tested yet")
elif accelerometer == 'sens':
    TEST_ACC_MISSING = []
else:
    # Without this branch an invalid value only surfaced below as a NameError on TEST_ACC_MISSING
    raise ValueError(f"Unknown accelerometer {accelerometer!r}; valid values are 'acg', 'axivity' and 'sens'")

# File names to skip, in the same 'Pnn.csv.gz' form as the data files
test_acc_missing = [f'P{i:02d}.csv.gz' for i in TEST_ACC_MISSING]
test_acc_missing.extend([f'P{i:02d}.csv.gz' for i in TEST_PARTICIPANTS_TO_EXCLUDE])

# check if test data directory exists (and really is a directory)
if not os.path.isdir(test_data_dir):
    # raise an error
    raise FileNotFoundError("Test data directory does not exist. Please create it and download the test data.")

# Glob pattern matching the per-participant files P00.csv.gz ... P99.csv.gz
test_data_files = os.path.join(test_data_dir, 'P[0-9][0-9].csv.gz')

# Fail early with a clear message if the directory holds no matching file
test_files = sorted(glob(test_data_files))
if not test_files:
    raise FileNotFoundError(f"No test data files matching {test_data_files} were found.")

X_test, Y_test, T_test, pid_test = load_all_and_make_windows(test_files)

## Print label distribution and calculate features

In [None]:
# Show how many windows carry each label
print('\nLabel distribution (# windows)')
label_counts = pd.Series(Y_test).value_counts()
print(label_counts)

import bach_features

# Compute one row of features per window and collect the rows in a DataFrame
feature_rows = [
    bach_features.bach_features(window, sample_rate=30)
    for window in tqdm(X_test)
]
X_test_feats = pd.DataFrame(feature_rows)
print(f"X_test_feats shape: {X_test_feats.shape}")

# The classifier is fed a plain numpy array rather than a DataFrame
X_test_feats = np.asarray(X_test_feats)

## Read in saved model

In [None]:
# The saved file holds a (model, labels) pair. joblib.load unpickles arbitrary objects,
# so only load model files that come from a trusted source.
model_path = os.path.join(current_dir, f'{file_prefix}_final_model.pkl')
final_model, labels = load(model_path)
n_labels = len(labels)

## Run model on test data

In [None]:
# Directory (relative to the current working directory) for the confusion-matrix CSV files
test_results_dir = 'results'

# exist_ok=True creates the directory if needed and is a no-op if it is already there,
# which avoids the race between checking for the directory and creating it
os.makedirs(test_results_dir, exist_ok=True)

# Predict a label for every test window
Y_test_pred = final_model.predict(X_test_feats)

print('\nPer participant classification report')
print(utils.per_participant_metrics(Y_test, Y_test_pred, pid_test))

print('\nClassifier performance on our test data')
print(classification_report(Y_test, Y_test_pred))

# Confusion matrix with rows and columns in the order of the labels saved with the model
cm_test = confusion_matrix(Y_test, Y_test_pred, labels=labels)
# The file name records whether pure or impure windows were used, and the window size
cm_name = f'BRF_{"pure" if PURE_WINDOWS else "impure"}_{window_size}s_cm_test.csv'
pd.DataFrame(cm_test, index=labels, columns=labels).to_csv(os.path.join(test_results_dir, cm_name))
cf_matrix.make_confusion_matrix(cm_test, sum_stats=True, categories=labels, figsize=(n_labels+1,n_labels))

## Rerun model but on pure test data

In [None]:
# Switch the notebook-wide flag to pure windows; load_all_and_make_windows reads it
# when called, so the test data is windowed again with drop_impure=True
PURE_WINDOWS = True
pure_test_files = sorted(glob(test_data_files))
X_test, Y_test, T_test, pid_test = load_all_and_make_windows(pure_test_files)

# Recompute the features, one row per window
pure_feature_rows = [
    bach_features.bach_features(window, sample_rate=30)
    for window in tqdm(X_test)
]
X_test_feats = pd.DataFrame(pure_feature_rows)
print(f"X_test_feats shape: {X_test_feats.shape}")

# The classifier is fed a plain numpy array rather than a DataFrame
X_test_feats = np.asarray(X_test_feats)

Y_test_pred = final_model.predict(X_test_feats)

print('\nPer participant classification report')
per_participant_report = utils.per_participant_metrics(Y_test, Y_test_pred, pid_test)
print(per_participant_report)

print('\nClassifier performance on our test data')
overall_report = classification_report(Y_test, Y_test_pred)
print(overall_report)

# Save the confusion matrix under a name that records the window type and size, then plot it
cm_test = confusion_matrix(Y_test, Y_test_pred, labels=labels)
window_kind = "pure" if PURE_WINDOWS else "impure"
cm_name = f'BRF_{window_kind}_{window_size}s_cm_test.csv'
cm_path = os.path.join(test_results_dir, cm_name)
pd.DataFrame(cm_test, index=labels, columns=labels).to_csv(cm_path)
cf_matrix.make_confusion_matrix(
    cm_test,
    sum_stats=True,
    categories=labels,
    figsize=(n_labels + 1, n_labels),
)