# Preprocessing gait detection
This notebook filters the accelerometer data, segments it into windows, and extracts temporal and spectral features for each window.

Execution time $\approx$ 50s per participant (amounting to 35 minutes in total).

### Modules

In [1]:
from pdathome.constants import global_constants as gc
# from pdathome.preprocessing import preprocess_gait_detection

### Process data

In [78]:
import os
import pandas as pd
import numpy as np

from collections import Counter

from paradigma.preprocessing_config import IMUPreprocessingConfig
from paradigma.gait.gait_analysis_config import GaitFeatureExtractionConfig
from paradigma.gait.feature_extraction import extract_temporal_domain_features, extract_spectral_domain_features
from paradigma.imu_preprocessing import butterworth_filter
from paradigma.segmenting import tabulate_windows

def compute_mode(data):
    """Return the most frequent value in 1D ``data`` together with its count.

    Ties resolve to the smallest value: ``np.unique`` returns its values
    sorted and ``argmax`` picks the first maximum.
    """
    uniques, frequencies = np.unique(data, return_counts=True)
    winner = frequencies.argmax()
    return uniques[winner], frequencies[winner]

def is_majority(data, target="Walking"):
    """Check whether ``target`` makes up more than half of 1D ``data``.

    Args:
        data: 1D sequence or array of labels.
        target: Label whose share is tested.

    Returns:
        bool: ``True`` if ``target`` occurs strictly more than
        ``len(data) / 2`` times, otherwise ``False``. Empty input yields
        ``False``. The result is always a plain Python ``bool``.
    """
    labels = np.asarray(data)
    if labels.size == 0:
        return False

    # A direct equality count avoids the sort done by np.unique, which
    # raises on object arrays holding non-comparable entries (e.g. None
    # next to strings).
    target_count = np.count_nonzero(labels == target)
    return bool(target_count > (len(data) / 2))

# For every participant and wrist side: filter the accelerometer signal,
# cut it into windows, derive per-window labels plus temporal and spectral
# features, and store the result as one pickle per (subject, side).
for subject in gc.participant_ids.L_PD_IDS:
    print(f'Processing {subject}...')

    # Evaluated once per subject. Always True while the loop iterates over
    # L_PD_IDS; kept as a flag so the non-PD path stays usable if the list
    # of subjects is ever extended.
    is_pd = subject in gc.participant_ids.L_PD_IDS

    for side in [gc.descriptives.MOST_AFFECTED_SIDE, gc.descriptives.LEAST_AFFECTED_SIDE]:
        df = pd.read_pickle(os.path.join(gc.paths.PATH_PREPARED_DATA, f'{subject}_{side}.pkl'))

        # ------------------------------------------------------------------
        # 1. Filtering
        # ------------------------------------------------------------------
        imu_config = IMUPreprocessingConfig()
        imu_config.acceleration_units = 'g'

        # Relevant columns for accelerometer data
        accel_cols = ['accelerometer_x', 'accelerometer_y', 'accelerometer_z']

        # Change to correct units [g]
        if imu_config.acceleration_units == 'm/s^2':
            df[accel_cols] = df[accel_cols] / 9.81

        # Snapshot of the unfiltered signal: both filters below must run on
        # the raw data, not on each other's output.
        accel_data = df[accel_cols].values

        # Passband -> columns receiving the filtered signal. The high-pass
        # result overwrites the accelerometer columns; the low-pass result
        # is stored in new '<axis>_grav' columns.
        filter_configs = {
            "hp": accel_cols,
            "lp": [f'{col}_grav' for col in accel_cols],
        }

        for passband, result_columns in filter_configs.items():
            df[result_columns] = butterworth_filter(
                data=accel_data,
                order=imu_config.filter_order,
                cutoff_frequency=imu_config.lower_cutoff_frequency,
                passband=passband,
                sampling_frequency=imu_config.sampling_frequency,
            )

        # ------------------------------------------------------------------
        # 2. Windowing
        # ------------------------------------------------------------------
        config = GaitFeatureExtractionConfig()

        l_windowed_cols = [
            gc.columns.TIME, gc.columns.FREE_LIVING_LABEL
            ] + config.l_accelerometer_cols + config.l_gravity_cols

        if is_pd:
            l_windowed_cols += [gc.columns.ARM_LABEL]

        if is_pd:
            # Window the 'pre' and 'post' segments separately so that no
            # window spans both segments.
            df_grouped = df.groupby(gc.columns.PRE_OR_POST, sort=False)
            segments = (
                df_grouped.get_group(label)
                for label in ['pre', 'post']
                if label in df_grouped.groups  # Ensure the label exists in the groups
            )
        else:
            segments = (df,)

        windowed_data = []
        for segment in segments:
            windows = tabulate_windows(
                config=config,
                df=segment,
                columns=l_windowed_cols
            )
            if len(windows) > 0:  # Skip if no windows are created
                windowed_data.append(windows)

        if len(windowed_data) == 0:
            raise ValueError("No windows were created from the given data.")
        windowed_data = np.concatenate(windowed_data, axis=0)

        # ------------------------------------------------------------------
        # 3. Window start times
        # ------------------------------------------------------------------
        # The time column is stored in ascending order. Reorder the windows
        # themselves rather than sorting the time values in isolation, so
        # every row keeps its own labels and features.
        time_index = l_windowed_cols.index(gc.columns.TIME)
        start_times = windowed_data[:, 0, time_index]
        window_order = np.argsort(start_times, kind='stable')
        if not np.array_equal(window_order, np.arange(len(window_order))):
            windowed_data = windowed_data[window_order]
            start_times = windowed_data[:, 0, time_index]

        df_features = pd.DataFrame()

        # Assigned as a list so pandas infers the column dtype from the
        # values instead of inheriting the dtype of the windowed array.
        df_features[gc.columns.TIME] = list(start_times)

        if is_pd:
            df_features = pd.merge(left=df_features, right=df[[gc.columns.TIME, gc.columns.PRE_OR_POST]], how='left', on=gc.columns.TIME)

        # ------------------------------------------------------------------
        # 4. Per-window labels
        # ------------------------------------------------------------------
        # Calculate the mode of the labels. A plain list comprehension is
        # used instead of np.apply_along_axis: the latter allocates its
        # output with the dtype of the first result, which can silently
        # truncate longer label strings in later windows.
        windowed_labels = windowed_data[:, :, l_windowed_cols.index(gc.columns.FREE_LIVING_LABEL)]
        modes = [compute_mode(window)[0] for window in windowed_labels]

        df_features[gc.columns.ACTIVITY_LABEL_MAJORITY_VOTING] = modes
        df_features[gc.columns.GAIT_MAJORITY_VOTING] = [is_majority(window) for window in windowed_labels]

        if is_pd:
            windowed_labels = windowed_data[:, :, l_windowed_cols.index(gc.columns.ARM_LABEL)]
            modes = [compute_mode(window)[0] for window in windowed_labels]

            df_features[gc.columns.ARM_LABEL_MAJORITY_VOTING] = modes
            df_features[gc.columns.NO_OTHER_ARM_ACTIVITY_MAJORITY_VOTING] = [is_majority(window, target="Gait without other behaviours or other positions") for window in windowed_labels]

        # ------------------------------------------------------------------
        # 5. Temporal and spectral features
        # ------------------------------------------------------------------
        # The accelerometer and gravity columns are contiguous in
        # l_windowed_cols by construction, so a min:max slice selects them.
        accel_indices = [l_windowed_cols.index(x) for x in config.l_accelerometer_cols]
        grav_indices = [l_windowed_cols.index(x) for x in config.l_gravity_cols]

        accel_windowed = np.asarray(windowed_data[:, :, np.min(accel_indices):np.max(accel_indices) + 1], dtype=float)
        grav_windowed = np.asarray(windowed_data[:, :, np.min(grav_indices):np.max(grav_indices) + 1], dtype=float)

        # Compute statistics of the temporal domain signals
        df_temporal_features = extract_temporal_domain_features(
            config=config,
            windowed_acc=accel_windowed,
            windowed_grav=grav_windowed,
            l_grav_stats=['mean', 'std']
        )

        df_features = pd.concat([df_features, df_temporal_features], axis=1)

        # Extract spectral features from the windowed accelerometer signal
        df_spectral_features = extract_spectral_domain_features(
            config=config,
            sensor=config.sensor,
            windowed_data=accel_windowed,
        )

        df_features = pd.concat([df_features, df_spectral_features], axis=1)

        # ------------------------------------------------------------------
        # 6. Store
        # ------------------------------------------------------------------
        file_path = os.path.join(gc.paths.PATH_GAIT_FEATURES, f'{subject}_{side}.pkl')
        df_features.to_pickle(file_path)


Processing hbv002...
Processing hbv012...
Processing hbv014...
Processing hbv015...
Processing hbv016...
Processing hbv017...
Processing hbv022...
Processing hbv024...
Processing hbv039...
Processing hbv043...
Processing hbv047...
Processing hbv054...
Processing hbv065...
Processing hbv077...
Processing hbv079...
Processing hbv090...
Processing hbv013...
Processing hbv018...
Processing hbv023...
Processing hbv038...
Processing hbv058...
Processing hbv063...


In [80]:
# Display only the feature columns whose name contains 'norm'.
norm_columns = [name for name in df_features.columns if 'norm' in name]
df_features[norm_columns]

Unnamed: 0,std_norm
0,0.029423
1,0.006257
2,0.005415
3,0.003488
4,0.002488
...,...
8140,0.022984
8141,0.045723
8142,0.064985
8143,0.081866
