# Extracting gait features
Here we use the preprocessed accelerometer data as input, create time windows and extract relevant features for the subsequent task of gait detection. 

## Modules

In [50]:
# Automatically reload modules
%load_ext autoreload
%autoreload 2

import numpy as np
import os
import pandas as pd
import tsdf

import dbpd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Constants

In [47]:
# tsdf files
subject = '0A0B82C94960D6DCABC1F597EC0BA657F4B0EDC320702BCEE3B6955CE924DE05'
sensor = 'IMU'
week_nr = '104'
segment_nr = '0001'

path_to_test_data = r'C:\Users\erik_\Documents\PhD\data\ppp\preprocessed\test_data'

input_path = os.path.join(path_to_test_data, '2.filtered', subject, sensor)
output_path = os.path.join(path_to_test_data, '3.windowed', subject, sensor)

base_filename = f'WatchData.{sensor}.Week{week_nr}.raw_segment{segment_nr}'
meta_filename = f'{base_filename}_meta.json'
values_filename = f'{base_filename}_samples.bin'
time_filename = f'{base_filename}_time.bin'
quality_filename = f'{base_filename}_quality.bin'

# windowing and feature engineering
window_type = 'hann'
verbose = 0

downsampled_frequency = 100 # Hz
window_length = 6 * downsampled_frequency # 6 seconds
window_step_size = 1 * downsampled_frequency # 1 second

# cepstral coefficients
low_frequency = 0 # Hz
high_frequency = int(downsampled_frequency / 2) # Hz
filter_length = high_frequency - 1 # here we assume a 1 Hz gait common frequency with equally-spaced harmonics
n_dct_filters = 16 # number of cepstral coefficients
filter_lenth = 16 # length of the filter

d_frequency_bandwidths = {
    'power_below_gait': [0.3, 0.7],
    'power_gait': [0.7, 3.5],
    'power_tremor': [3.5, 8],
    'power_above_tremor': [8, downsampled_frequency]
}

l_acceleration_cols = ['acceleration_x', 'acceleration_y', 'acceleration_z']
l_gravity_cols = ['grav_acceleration_x', 'grav_acceleration_y', 'grav_acceleration_z']
l_window_level_cols = ['id', 'window_nr', 'window_start', 'window_end']
l_data_point_level_cols = l_acceleration_cols + l_gravity_cols

# store data
l_channels = ([f'{x}_{y}' for x in l_gravity_cols for y in ['mean', 'std']] + 
              [f'{x}_{y}' for x in l_acceleration_cols for y in ['power_below_gait', 'power_gait', 'power_tremor', 'power_above_tremor']] + 
              ['std_norm_acc']
)

# Feature extraction

## Load data

In [35]:
# Load the metadata file of the segment; the result is indexed by binary file name below
meta_path = os.path.join(input_path, meta_filename)
metadata_dict = tsdf.load_metadata_from_path(meta_path)

# Pick the metadata entries of the time and the samples binaries
metadata_time, metadata_samples = (
    metadata_dict[binary_name] for binary_name in (time_filename, values_filename)
)

# Read both binaries and concatenate them column-wise into a single dataframe
df = tsdf.load_dataframe_from_binaries(
    [metadata_time, metadata_samples],
    tsdf.constants.ConcatenationType.columns,
)

df.head(2)

Unnamed: 0,time,rotation_x,rotation_y,rotation_z,acceleration_x,acceleration_y,acceleration_z,grav_acceleration_x,grav_acceleration_y,grav_acceleration_z
0,0.0,-115.670732,-32.012195,26.097561,-4.15947e-08,4.336545e-08,-2.067087e-08,-5.271109,5.495509,-2.619526
1,0.01,-110.636301,-34.62471,24.701537,-3.711922e-07,3.918652e-07,-1.835777e-07,-4.870643,5.69525,-2.307776


## Preprocess data

In [36]:
# Wrap the loaded dataframe in the dbpd pipeline object.
# NOTE(review): the positional arguments after the time column name (100, 100, 0) are
# presumably two sampling frequencies and a verbosity level - confirm against dbpd and
# consider passing `downsampled_frequency` / `verbose` instead of literals.
ppp = dbpd.PreprocessingPipeline(df, 'time', 100, 100, 0)

# Build the windowed frame from the data-point-level columns
ppp.df_windows = ppp.tabulate_windows(
    window_step_size,
    window_length,
    l_data_point_level_cols,
)

# (number of windows, number of columns)
print(ppp.df_windows.shape)

(725, 9)


In [37]:
# Per window: mean and standard deviation of each gravity acceleration axis,
# stored as '<gravity column>_<statistic>'
for grav_col in l_gravity_cols:
    for stat_name in ('mean', 'std'):
        ppp.df_windows[f'{grav_col}_{stat_name}'] = ppp.generate_statistics(grav_col, stat_name)

In [38]:
# Per window: standard deviation of the Euclidean norm over the three acceleration axes
std_norm_per_window = ppp.generate_std_norm(l_acceleration_cols)
ppp.df_windows['std_norm_acc'] = std_norm_per_window

In [39]:
# Spectral features per acceleration axis: FFT, power per frequency band, dominant frequency
for col in l_acceleration_cols:

    # fast fourier transforms
    ppp.df_windows[f'{col}_freqs'], ppp.df_windows[f'{col}_fft'] = ppp.signal_to_ffts(
        window_type=window_type, sensor_col=col
    )

    # compute power in distinct frequency bandwidths
    for bandwidth, (fmin, fmax) in d_frequency_bandwidths.items():
        ppp.df_windows[f'{col}_{bandwidth}'] = ppp.df_windows.apply(
            lambda x: ppp.compute_power_in_bandwidth(
                sensor_col=x[col],
                window_type=window_type,
                fmin=fmin,
                fmax=fmax
            ), axis=1
        )

    # extract dominant frequency
    # The search range is stated explicitly. Previously this call read `bandwidth` after the
    # loop above had finished, so it always used the last band ('power_above_tremor').
    # NOTE(review): low_frequency..high_frequency is assumed to be the intended range - confirm.
    ppp.df_windows[f'{col}_dominant_frequency'] = ppp.df_windows.apply(
        lambda x: ppp.get_dominant_frequency(
            signal_ffts=x[f'{col}_fft'],
            signal_freqs=x[f'{col}_freqs'],
            fmin=low_frequency,
            fmax=high_frequency
        ), axis=1
    )

In [40]:
# Per frequency band: add up the band power of the three acceleration axes
for bandwidth in d_frequency_bandwidths:
    ppp.df_windows[f'total_acc_{bandwidth}'] = ppp.df_windows.apply(
        lambda row: sum(row[f'{axis_col}_{bandwidth}'] for axis_col in l_acceleration_cols),
        axis=1,
    )

# Total power computed from the FFT columns of all acceleration axes
fft_cols = [f'{axis_col}_fft' for axis_col in l_acceleration_cols]
ppp.df_windows['total_accel_power'] = ppp.compute_power(fft_cols=fft_cols)

# Cepstral coefficients derived from the total power column
cc_cols = ppp.generate_cepstral_coefficients(
    window_length=window_length,
    total_power_col='total_accel_power',
    low_frequency=low_frequency,
    high_frequency=high_frequency,
    filter_length=filter_length,
    n_dct_filters=n_dct_filters,
)

# Append the coefficient columns to the windowed frame
ppp.df_windows = pd.concat([ppp.df_windows, cc_cols], axis=1)

In [42]:
# Tag the cepstral coefficient columns as accelerometer-derived ('cc_<n>' -> 'cc_<n>_acc').
# The range follows n_dct_filters so the rename stays complete if that constant changes.
ppp.df_windows = ppp.df_windows.rename(
    columns={f'cc_{cc_nr}': f'cc_{cc_nr}_acc' for cc_nr in range(1, n_dct_filters + 1)}
)

# Remove per-window raw signals and intermediate spectra; only the derived features remain.
# NOTE: this cell is not idempotent - re-running it raises a KeyError because the columns are gone.
l_intermediate_cols = (
    [f'{col}{suffix}' for suffix in ['', '_freqs', '_fft'] for col in l_acceleration_cols]
    + ['total_accel_power']
    + l_gravity_cols
)
ppp.df_windows = ppp.df_windows.drop(columns=l_intermediate_cols)

In [45]:
# Preview the first two windows to inspect the resulting feature columns
ppp.df_windows.head(2)

Unnamed: 0,window_nr,window_start,window_end,grav_acceleration_x_mean,grav_acceleration_x_std,grav_acceleration_y_mean,grav_acceleration_y_std,grav_acceleration_z_mean,grav_acceleration_z_std,std_norm_acc,...,cc_7_acc,cc_8_acc,cc_9_acc,cc_10_acc,cc_11_acc,cc_12_acc,cc_13_acc,cc_14_acc,cc_15_acc,cc_16_acc
0,1,0,599,-0.046566,1.314864,0.050722,2.365911,0.058997,2.398061,3.419243,...,20.145499,25.941516,16.447771,20.761303,14.10855,17.410565,12.438634,15.023873,11.14431,13.202205
1,2,100,699,-0.099502,1.07443,0.091392,2.10448,-0.033973,2.285541,1.922233,...,20.397449,26.176959,16.682316,20.980846,14.324956,17.612769,12.6364,15.208219,11.323629,13.368846


# Store data

In [58]:
# Drop the window indicators and expose the window start under the name 'time'.
# NOTE(review): the previewed window_start values (0, 100, ...) look like sample indices
# rather than seconds - confirm this matches what the time metadata describes.
ppp.df_windows = (
    ppp.df_windows
    .drop(columns=['window_nr', 'window_end'])
    .rename(columns={'window_start': 'time'})
)

In [76]:
# Change channels and units so the samples metadata describes the feature columns.
# Plain attribute assignment is used instead of calling __setattr__ directly.
# NOTE(review): l_channels does not list the '*_dominant_frequency', 'total_acc_*' and
# 'cc_*_acc' columns that df_windows also contains - confirm they are meant to be left out.
# NOTE(review): every channel is given the unit 'm/s^2', including the band-power
# channels - confirm the units.
metadata_samples.channels = l_channels
metadata_samples.units = ['m/s^2'] * len(l_channels)

# Change output path for both the samples and the time metadata
metadata_samples.file_dir_path = output_path
metadata_time.file_dir_path = output_path

In [78]:
# Create the output directory if needed; exist_ok avoids the separate existence check
os.makedirs(output_path, exist_ok=True)

# store binaries and metadata
tsdf.write_dataframe_to_binaries(output_path, ppp.df_windows, [metadata_time, metadata_samples])
tsdf.write_metadata([metadata_time, metadata_samples], meta_filename)