# Extracting gait features
Here we use the preprocessed accelerometer data as input, create time windows and extract relevant features for the subsequent task of gait detection. 

## Modules

In [9]:
# Automatically reload modules
%load_ext autoreload
%autoreload 2

import numpy as np
import os
import pandas as pd
import tsdf

from dbpd.extracting_features import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Constants

In [10]:
# tsdf files
sensor = 'acceleration'
path_to_test_data = '../../../tests/data'

# directories holding the input signals and the extracted features
input_path = os.path.join(path_to_test_data, '2.preprocessed_data')
output_path = os.path.join(path_to_test_data, '3.extracted_features')

# names of the tsdf metadata and binary files, derived from the sensor name
meta_filename = f'{sensor}_meta.json'
values_filename = f'{sensor}_samples.bin'
time_filename = f'{sensor}_time.bin'

# windowing and feature engineering
window_type = 'hann'
verbose = 0

sampling_frequency = 100 # Hz
window_length_s = 6
window_step_size_s = 1

# cepstral coefficients
low_frequency = 0 # Hz
high_frequency = int(sampling_frequency / 2) # Hz
filter_length = high_frequency - 1 # here we assume a 1 Hz gait common frequency with equally-spaced harmonics
n_dct_filters = 16 # number of cepstral coefficients

# [lower, upper] frequency bounds in Hz of the bands for which power is computed
# NOTE(review): the upper bound of 'power_above_tremor' is the sampling frequency,
# which is twice high_frequency (sampling_frequency / 2) - confirm this is intended
d_frequency_bandwidths = {
    'power_below_gait': [0.3, 0.7],
    'power_gait': [0.7, 3.5],
    'power_tremor': [3.5, 8],
    'power_above_tremor': [8, sampling_frequency]
}

# column names of the raw signals and of the window bookkeeping
l_acceleration_cols = ['acceleration_x', 'acceleration_y', 'acceleration_z']
l_gravity_cols = ['grav_acceleration_x', 'grav_acceleration_y', 'grav_acceleration_z']
l_window_level_cols = ['id', 'window_nr', 'window_start', 'window_end']
l_data_point_level_cols = l_acceleration_cols + l_gravity_cols

# store data
l_channels = (
    [f'{col}_{stat}' for col in l_gravity_cols for stat in ['mean', 'std']]
    + [f'{col}_{band}' for col in l_acceleration_cols for band in ['power_below_gait', 'power_gait', 'power_tremor', 'power_above_tremor']]
    + ['std_norm_acc']
)

# Functions

## Load data

In [11]:
# Read the tsdf metadata file and pick the entries of the time and sample binaries
metadata_path = os.path.join(input_path, meta_filename)
metadata_dict = tsdf.load_metadata_from_path(metadata_path)
metadata_samples = metadata_dict[values_filename]
metadata_time = metadata_dict[time_filename]

# Load both binaries into one dataframe, concatenated column-wise
concatenation_type = tsdf.constants.ConcatenationType.columns
df = tsdf.load_dataframe_from_binaries([metadata_time, metadata_samples], concatenation_type)

# Preview the first two rows
df.head(2)

Unnamed: 0,time,acceleration_x,grav_acceleration_x,acceleration_y,grav_acceleration_y,acceleration_z,grav_acceleration_z
0,0.0,-4.15947e-08,-5.271109,4.336545e-08,5.495509,-2.067087e-08,-2.619526
1,0.01,-3.711922e-07,-4.870643,3.918652e-07,5.69525,-1.835777e-07,-2.307776


## Window the data and extract features

In [34]:
# Windowing settings, all taken from the constants defined earlier in the notebook
windowing_settings = {
    'data_point_level_cols': l_data_point_level_cols,
    'window_length_s': window_length_s,
    'window_step_size_s': window_step_size_s,
    'sampling_frequency': sampling_frequency,
}

# Build the windowed dataframe from the loaded signals
df_windowed = tabulate_windows(df=df, **windowing_settings)

print(df_windowed.shape)

(725, 9)


In [35]:
# Add a '<column>_mean' and '<column>_std' feature for each gravity axis,
# as computed by generate_statistics
for gravity_col in l_gravity_cols:
    for statistic in ('mean', 'std'):
        feature_name = f'{gravity_col}_{statistic}'
        df_windowed[feature_name] = generate_statistics(
            sensor_col=df_windowed[gravity_col],
            statistic=statistic
        )

In [36]:
# std_norm_acc: standard deviation of the Euclidean norm of the three acceleration axes
norm_input_cols = list(l_acceleration_cols)
df_windowed['std_norm_acc'] = generate_std_norm(df=df_windowed, cols=norm_input_cols)

In [37]:
# Band within which the dominant frequency is searched. It is named explicitly so
# that the result does not depend on the iteration order of d_frequency_bandwidths
# or on a loop variable that is still bound after the bandwidth loop has finished.
# NOTE(review): confirm that 'power_above_tremor' (rather than e.g. the gait band
# or the full spectrum) is the intended search band for the dominant frequency.
dominant_fmin, dominant_fmax = d_frequency_bandwidths['power_above_tremor']

for col in l_acceleration_cols:

    # fast fourier transforms
    df_windowed[f'{col}_freqs'], df_windowed[f'{col}_fft'] = signal_to_ffts(
        sensor_col=df_windowed[col],
        window_type=window_type,
        sampling_frequency=sampling_frequency
    )

    # compute power in distinct frequency bandwidths
    for bandwidth, (fmin, fmax) in d_frequency_bandwidths.items():
        df_windowed[f'{col}_{bandwidth}'] = df_windowed.apply(
            lambda row: compute_power_in_bandwidth(
                sensor_col=row[col],
                fmin=fmin,
                fmax=fmax,
                sampling_frequency=sampling_frequency,
                window_type=window_type,
            ),
            axis=1
        )

    # extract dominant frequency from the spectrum computed above
    df_windowed[f'{col}_dominant_frequency'] = df_windowed.apply(
        lambda row: get_dominant_frequency(
            signal_ffts=row[f'{col}_fft'],
            signal_freqs=row[f'{col}_freqs'],
            fmin=dominant_fmin,
            fmax=dominant_fmax
        ),
        axis=1
    )

In [38]:
# total power per frequency band, summed over the three acceleration axes
for bandwidth in d_frequency_bandwidths:
    # the built-in sum adds the columns element-wise (0 + x + y + z), so missing
    # values propagate exactly as they would in a row-by-row sum
    df_windowed[f'total_acc_{bandwidth}'] = sum(
        df_windowed[f'{col}_{bandwidth}'] for col in l_acceleration_cols
    )

# total power, computed by compute_power from the fft columns of the three axes
df_windowed['total_accel_power'] = compute_power(
    df=df_windowed,
    fft_cols=[f'{col}_fft' for col in l_acceleration_cols]
)

# cepstral coefficients derived from the total power
cc_cols = generate_cepstral_coefficients(
    total_power_col=df_windowed['total_accel_power'],
    window_length_s=window_length_s,
    sampling_frequency=sampling_frequency,
    low_frequency=low_frequency,
    high_frequency=high_frequency,
    filter_length=filter_length,
    n_dct_filters=n_dct_filters
)

# NOTE: df_windowed is reassigned here, so re-running this cell without re-running
# the cells above appends the coefficient columns a second time
df_windowed = pd.concat([df_windowed, cc_cols], axis=1)

In [39]:
# Suffix the cepstral coefficient columns with '_acc' and expose the window start as 'time'.
# The range follows n_dct_filters instead of a hard-coded 16.
# NOTE(review): assumes the coefficient columns are named cc_1 .. cc_<n_dct_filters> - confirm
cc_renames = {f'cc_{cc_nr}': f'cc_{cc_nr}_acc' for cc_nr in range(1, n_dct_filters + 1)}
df_windowed = df_windowed.rename(columns={**cc_renames, 'window_start': 'time'})

# Drop the windowed raw signals and the intermediate spectral columns.
# The '' suffix already covers the raw acceleration columns themselves.
l_intermediate_cols = [
    f'{col}{suffix}'
    for suffix in ['', '_freqs', '_fft', '_fft_power']
    for col in l_acceleration_cols
]
df_windowed = df_windowed.drop(
    columns=l_intermediate_cols + ['total_accel_power', 'window_nr', 'window_end'] + l_gravity_cols
)

# Store data

In [85]:
from dateutil import parser
import datetime

def _unit_for_channel(channel):
    """Return the unit string of a feature channel, based on its column name."""
    if channel.startswith('grav_acceleration_') or channel == 'std_norm_acc':
        return 'm/s^2'
    if channel.endswith('_dominant_frequency'):
        return 'Hz'
    return 'X'


# End timestamp: recording start plus the start of the last window plus one window length
last_window_start = df_windowed['time'].iloc[-1]
end_datetime = parser.parse(metadata_samples.start_iso8601) + datetime.timedelta(
    seconds=int(last_window_start + window_length_s)
)
end_iso8601 = end_datetime.strftime('%d-%b-%Y %H:%M:%S') + ' UTC'

# Attributes shared by both metadata objects; only the binary file name differs
for metadata, file_name in ((metadata_samples, 'values.bin'), (metadata_time, 'time.bin')):
    metadata.end_iso8601 = end_iso8601
    metadata.file_name = file_name
    metadata.file_dir_path = output_path

# Derive the unit of every channel from its name, so that units and channels always
# have the same length and order, whatever the column order of df_windowed is
l_feature_channels = [col for col in df_windowed.columns if col != 'time']
metadata_samples.channels = l_feature_channels
metadata_samples.units = [_unit_for_channel(channel) for channel in l_feature_channels]

metadata_time.channels = ['time']
metadata_time.units = ['s']
metadata_time.data_type = np.int64

In [87]:
# Create the output directory if it does not exist yet; exist_ok makes this a
# single call instead of a separate existence check followed by creation
os.makedirs(output_path, exist_ok=True)

# store binaries and metadata
tsdf.write_dataframe_to_binaries(output_path, df_windowed, [metadata_time, metadata_samples])
tsdf.write_metadata([metadata_time, metadata_samples], 'meta.json')