# Modules

In [1]:
import numpy as np
import os
import pandas as pd
import sys

# If pdathome.constants cannot be imported, add the path to the src folder to the system path
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

from pdathome.classification import cv_train_test_model, windows_to_timestamps, store_model
from pdathome.constants import classifiers, columns, participant_ids, paths
from pdathome.load import load_dataframes_directory

from paradigma.gait_analysis_config import GaitFeatureExtractionConfig

# Constants

In [2]:
# Name of the dataframe column that is handed to the helpers as the target
target_colname = columns.GAIT_MAJORITY_VOTING

# Classifier selection; passed as `model` to the helpers and used in the
# name of the importances output file
classifier = classifiers.RANDOM_FOREST

# Process data

In [4]:
# Set up the gait feature extraction configuration
config = GaitFeatureExtractionConfig()

# Predictors: every key of the configuration's channel mapping
l_predictors = [*config.d_channels_values.keys()]

# Predictors handed to the helpers for scaling: all predictors whose name
# does not contain 'dominant'
l_predictors_scale = list(filter(lambda name: 'dominant' not in name, l_predictors))

# Read the gait features of all PD and HC participants into one dataframe
df_all_subjects = load_dataframes_directory(
    l_ids=participant_ids.L_PD_IDS + participant_ids.L_HC_IDS,
    directory_path=paths.PATH_GAIT_FEATURES,
)

# Per-subject results, appended in the order the subjects are processed
l_thresholds = []
l_importances = []

# Iterate over all PD and HC subjects and process data
for subject in participant_ids.L_PD_IDS + participant_ids.L_HC_IDS:
    print(f"Processing subject {subject}")

    # Train and test model. The complete dataframe is passed together with
    # the subject id, so no per-subject slice is computed here (the former
    # `df_subject` filter was never used and only cost a full-frame scan).
    # NOTE(review): presumably cv_train_test_model holds out `subject`
    # internally - confirm in pdathome.classification.
    df_test, classification_threshold, importances = cv_train_test_model(
        subject=subject,
        df=df_all_subjects,
        model=classifier,
        l_predictors=l_predictors,
        l_predictors_scale=l_predictors_scale,
        target_column_name=target_colname,
        pred_proba_colname=columns.PRED_GAIT_PROBA,
        pred_colname=columns.PRED_GAIT,
        step='gait'
    )

    # Collect thresholds and importances for aggregation after the loop
    l_thresholds.append(classification_threshold)
    l_importances.append(importances)

    # Save predictions of this subject's test dataframe
    windows_to_timestamps(
        subject=subject, df=df_test,
        path_output=paths.PATH_GAIT_PREDICTIONS,
        pred_proba_colname=columns.PRED_GAIT_PROBA,
        step='gait'
    )
    
# Write the mean of the per-subject classification thresholds to a text file
threshold_path = os.path.join(paths.PATH_THRESHOLDS, 'gait_threshold.txt')
mean_threshold = np.mean(l_thresholds)
with open(threshold_path, 'w') as f:
    f.write(str(mean_threshold))

# Save importances
# Average each feature's importance across subjects BEFORE opening the output
# file: opening with 'w' truncates it, so if the aggregation fails (e.g.
# pd.concat raises ValueError on an empty list) no empty file is left behind.
# NOTE(review): assumes every entry of l_importances is a feature -> importance
# mapping (anything pd.Series accepts) - confirm against cv_train_test_model.
all_importances = pd.concat([pd.Series(imp) for imp in l_importances], axis=1).mean(axis=1)

with open(os.path.join(paths.PATH_CLASSIFIERS, f'{classifier}_importances.txt'), 'w') as f:
    # One "<feature>: <mean importance>" line per feature
    for feature, importance in all_importances.items():
        f.write(f'{feature}: {importance}\n')

# Hand the complete dataset and the model settings to store_model, together
# with the output locations for scalers and classifiers
store_model(
    step='gait',
    model=classifier,
    df=df_all_subjects,
    target_column_name=target_colname,
    l_predictors=l_predictors,
    l_predictors_scale=l_predictors_scale,
    path_classifiers=paths.PATH_CLASSIFIERS,
    path_scalers=paths.PATH_SCALERS,
)