In [None]:

from Classification_experiments.classification_experiments import *
from AQSM_SW1PerS.utils.paths import get_data_path

import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt


In [None]:
# CSV of periodicity scores; read by the Experiment 1 split cell and passed to
# leave_one_out_bayes in Experiments 2 and 3.
data_file = get_data_path("Periodicity_Scores", "exp1_PS/pose_exp1.csv")

# True  -> two-class setup: class names ['None', 'SMM'], labels collapsed to 0 vs. non-zero.
# False -> four-class setup: class names ['None', 'Rock', 'Flap', 'Flap-Rock'].
binary_method = True
# True  -> features are passed through compress_features (groups of 10) and the
#          matching hyperparameter set / "$PS_1$" plot label is used.
# False -> the uncompressed features and the "$PS_{10}$" label are used.
PS1_scores = False


# Experiment 1 - Stratified Set Classification

This experiment uses a stratified random split into train/test/val sets. It gives good insight into feature importance and into what the model struggles with. The following hyperparameters were obtained via Bayesian Optimization with all sensors being required. We do this because, when also exploring optimal sensor combinations, it was determined that all sensors are necessary for the best model.

In [None]:

# Random Forest hyperparameters, looked up by (binary_method, PS1_scores).
# Each entry holds the pair (best_params, accel_params) for that configuration.
_RF_PARAM_TABLE = {
    # Binary labels, PS1 features
    (True, True): (
        {'n_estimators': 361, 'max_depth': 10, 'min_samples_split': 0.0001, 'min_samples_leaf': 0.0001, 'max_features': 'sqrt', 'class_weight': 'balanced_subsample', 'criterion': 'gini'},
        {'n_estimators': 125, 'max_depth': 10, 'min_samples_split': 0.5, 'min_samples_leaf': 0.1073308009028807, 'max_features': 'sqrt', 'class_weight': 'balanced_subsample', 'criterion': 'entropy'},
    ),
    # Binary labels, uncompressed features
    (True, False): (
        {'n_estimators': 291, 'max_depth': 10, 'min_samples_split': 0.0001, 'min_samples_leaf': 0.0001, 'max_features': None, 'class_weight': 'balanced_subsample', 'criterion': 'entropy'},
        {'n_estimators': 116, 'max_depth': 96, 'min_samples_split': 0.0001, 'min_samples_leaf': 0.048160194934634915, 'max_features': None, 'class_weight': 'balanced_subsample', 'criterion': 'log_loss'},
    ),
    # Four-class labels, PS1 features
    (False, True): (
        {'n_estimators': 69, 'max_depth': 10, 'min_samples_split': 0.07027628488911168, 'min_samples_leaf': 0.20093207445139724, 'max_features': 'log2', 'class_weight': 'balanced', 'criterion': 'log_loss'},
        {'n_estimators': 45, 'max_depth': 10, 'min_samples_split': 0.0001, 'min_samples_leaf': 0.0001, 'max_features': 'log2', 'class_weight': 'balanced', 'criterion': 'log_loss'},
    ),
    # Four-class labels, uncompressed features
    (False, False): (
        {'n_estimators': 500, 'max_depth': 11, 'min_samples_split': 0.0001, 'min_samples_leaf': 0.008792271867988076, 'max_features': 'sqrt', 'class_weight': 'balanced_subsample', 'criterion': 'log_loss'},
        {'n_estimators': 99, 'max_depth': 200, 'min_samples_split': 0.0001, 'min_samples_leaf': 0.0001, 'max_features': 'log2', 'class_weight': 'balanced_subsample', 'criterion': 'gini'},
    ),
}

# Display names for the target classes of the selected labelling scheme.
class_names = ['None', 'SMM'] if binary_method else ['None', 'Rock', 'Flap', 'Flap-Rock']

# bool() keeps the original truthiness semantics of the two flags.
best_params, accel_params = _RF_PARAM_TABLE[(bool(binary_method), bool(PS1_scores))]
        

In [None]:

# Load the periodicity-score table and split it into train / validation / test sets.
df = pd.read_csv(data_file)

# NOTE(review): 120 is presumably the number of feature columns — confirm
# against test_train_val_split.
X_train, X_val, X_test, y_train, y_val, y_test = test_train_val_split(df, 120)

# Drop every sample labelled -1 from all three splits
# (intended to leave only the labels {0,1,2,3}).
train_mask = y_train != -1
test_mask = y_test != -1
val_mask = y_val != -1

X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
X_val, y_val = X_val[val_mask], y_val[val_mask]

# Features are handled in consecutive groups of 10; any remainder is ignored.
num_total_features = len(X_train[0])
num_feature_groups_10 = num_total_features // 10
group_sizes = [10] * num_feature_groups_10

if PS1_scores:
    # Compress every split the same way so train, validation and test all
    # share one feature layout (the validation split was previously skipped).
    X_train = compress_features(X_train, group_sizes)
    X_val = compress_features(X_val, group_sizes)
    X_test = compress_features(X_test, group_sizes)

# Only the training split is oversampled; validation and test stay untouched.
X_train_resampled, y_train_resampled = train_class_oversampling(X_train, y_train, binary_method=binary_method)

if binary_method:
    # Collapse the labels to 0 (label 0) vs. 1 (any other label) after oversampling.
    y_train_resampled = (y_train_resampled != 0).astype(int)
    y_test = (y_test != 0).astype(int)
    y_val = (y_val != 0).astype(int)
    

In [None]:

# Fit a Random Forest on the oversampled training split with a fixed seed.
# NOTE(review): this uses accel_params although best_params is also defined
# for the same configuration — confirm which set is intended here.
model = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    **accel_params,
).fit(X_train_resampled, y_train_resampled)

# Predictions on the held-out test split, reused by the evaluation cells below.
test_predictions = model.predict(X_test)


In [None]:

# Confusion matrix of the test-split predictions.
plot_confusion_matrix(
    y_test,
    test_predictions,
    class_names,
    binary_method=binary_method,
    compressed=PS1_scores,
)

# Text report labelled with the class names, values shown to two digits.
report = classification_report(
    y_test,
    test_predictions,
    target_names=class_names,
    digits=2,
)
print(report)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# One display name per sensor stream, in the order the importances are grouped below.
feature_names = ['Head Pos', 'LW Pos', 'RW Pos', 'LS Pos', 'RS Pos', 'Chest Pos',
                 'Head Accel', 'LW Accel', 'RW Accel', 'LS Accel', 'RS Accel', 'Chest Accel']

# Per-feature importances of the fitted model.
importances = model.feature_importances_

if not PS1_scores:
    # Uncompressed features: sum each consecutive run of 10 importances so
    # there is one aggregated value per entry in feature_names.
    n_features_per_sensor = 10
    sensor_importances = [
        np.sum(importances[i * n_features_per_sensor:(i + 1) * n_features_per_sensor])
        for i in range(len(feature_names))
    ]

    sorted_idx = np.argsort(sensor_importances)[::-1]
    sorted_importances = np.array(sensor_importances)[sorted_idx]
    sorted_names = np.array(feature_names)[sorted_idx]
else:
    # Compressed features: the importances are ranked directly against feature_names.
    sorted_idx = np.argsort(importances)[::-1]
    sorted_importances = np.array(importances)[sorted_idx]
    sorted_names = np.array(feature_names)[sorted_idx]

# Labels for the title: feature variant and labelling scheme actually in use.
method_name = r'$PS_1$' if PS1_scores else r'$PS_{10}$'
model_kind = "Binary" if binary_method else "Multiclass"

# --- Plotting ---
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(range(len(sorted_importances)), sorted_importances, align="center")
ax.set_xticks(range(len(sorted_names)))
ax.set_xticklabels(sorted_names, rotation=90, fontsize=12)
# The title follows binary_method instead of always saying "Binary".
ax.set_title(f"{method_name} {model_kind} Model – Feature Importances", fontsize=14)
ax.tick_params(axis='y', labelsize=12)
ax.set_ylabel("Importance", fontsize=13)
ax.set_xlabel("Feature", fontsize=13)

plt.tight_layout()
plt.show()

plt.close()


In [None]:

# Plot the AUC curve of the fitted model on the test split; `binary` follows
# the notebook-level binary_method flag.
plot_auc_curve(model, X_test, y_test, binary=binary_method)


In [None]:

# Plot the PR curve of the fitted model on the test split; `binary` and
# `compressed` follow the notebook-level binary_method / PS1_scores flags.
plot_PR_curve(model, X_test, y_test, binary=binary_method, compressed = PS1_scores)


# Experiment 2 - Leave-One-Session-Out

To enable closer comparison with established baselines in the field, the Leave-One-Child-Out (LOCO) method can be adapted to leave a single session out rather than the data of an entire child.

In [None]:

# NOTE(review): this call is argument-for-argument identical to the one under
# "Experiment 3 - Leave-One-Child-Out". For a leave-one-session-out run,
# LOSO_method presumably needs the session-level option instead of 'Child' —
# confirm the accepted values in leave_one_out_bayes.
# NOTE(review): PS1 is hard-coded to True and the notebook-level PS1_scores
# flag is not consulted here — confirm this is intentional.
leave_one_out_bayes(data_file, LOSO_method = 'Child', PS1 = True, binary_method = binary_method, plotCM = True,  plotPR = True)


# Experiment 3 - Leave-One-Child-Out

To test the performance of the TDA features on truly unseen data, Leave-One-Child-Out (LOCO) was performed where the model is tested on the data of one child while the remaining children’s data is used for training and validation with Bayesian optimization used to find the near-optimal hyperparameters for the Random Forest Classifier. This method better reflects real-world deployment scenarios where models are applied to completely unseen individuals.


In [None]:

# Leave-one-out evaluation with LOSO_method='Child', requesting the
# confusion-matrix and PR plots.
# NOTE(review): PS1 is hard-coded to True and the notebook-level PS1_scores
# flag is not consulted here — confirm this is intentional.
leave_one_out_bayes(data_file, LOSO_method = 'Child', PS1 = True, binary_method = binary_method, plotCM = True, plotPR = True)
