In [8]:
#!pip install tsfel

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns 
import tsfel # time series feature extraction library 

# don't work hard, work smart!
# feature extraction with time series feature extraction library (tsfel)

* tsfel creates a rolling window from which summary statistics and features may be derived
    * this notebook uses a 2s window (200 observations sampled at 100hz) with a 50% overlap
* Kumpulainen et al. (2021) extracted features manually. I'm going to use a ready-made library to do the heavy lifting for me. 
* features used by Kumpulainen et al. (2021) were as follows: 
    * total activity: sum of SD in all 3 axes
    * position offset: euclidean distance from robust mean (standing) - recalculate for each subject dog
    * count mean crossings: sum of all 3 axes 
    * mean value of each axis 
    * 7 interpolated inverse ecdf values per axis

In [9]:
# load the reduced dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='DietDogMoveData.csv')

Following '1.preprocessing.ipynb', the dataset has been reduced from 10M+ rows to ~6M. Initially I passed the entire dataset (6M rows) into the tsfel feature extraction function. This was not practical: it either would not process at all or would stall after several hours. To solve this problem, I broke the dataset down into more manageable chunks and performed the feature extraction on each chunk before concatenating the results back together. 

* objectives for this solution are as follows:
    * group dataset by dogID
    * further divide dog-wise subsection into inertial data (acc, gyro signals) and behavioral (target column and dogID) sections 
    * pass data (inertial and behavioural) through TSFEL dog-wise. 
        * custom .json files created to specify which features need to be extracted from each subsection 
    * merge inertial and behavioural sections horizontally following feature extraction 
    * concatenate dog-wise dataframes vertically to reconstruct the complete dataset 

In [10]:
# although a classification model does not require numerically encoded target variables, tsfel does
# (per the original note in this notebook), so the behavior classes are one-hot encoded here;
# the feature extraction step then produces the rolling mean of these encoded values.
# dtype is pinned explicitly: pandas >= 2.0 returns bool indicator columns by default,
# whereas earlier versions returned uint8 - pinning keeps the columns numeric on every version.
# NOTE: this overwrites `df` and consumes the 'Behavior' column, so the cell cannot be re-run
# without first re-running the cell that loads the CSV.
df = pd.get_dummies(df, columns=['Behavior'], prefix=['Behavior'], dtype='uint8') # one-hot encode behavior classes

In [11]:
# split the frame into one sub-frame per dog so that each dog can be processed on its own
df_grouped = df.groupby('DogID')
dog_groups = [dog_frame for _, dog_frame in df_grouped] # dog-wise iterable, one DataFrame per DogID

In [18]:
# the tsfel feature extraction function is driven by a .json config file that lists the features to extract:
#   * DogMoveFeatures.json     - inertial data: ecdf percentiles (x7), mean, standard deviation and zero crossing rate (ZCR)
#   * DogBehaviorFeatures.json - behavior labels: rolling mean only

# Kumpulainen et al. (2021) used 7 ecdf values per axis.
# np.linspace(0, 1, 9) returns 9 evenly spaced points: the endpoints 0 and 1 plus 7 interior values.
# NOTE(review): presumably the 7 interior values (0.125 ... 0.875) are the ones that were copied
# by hand into './DogMoveFeatures.json' - confirm against that file.
print(np.linspace(start=0, stop=1, num=9))

[0.    0.125 0.25  0.375 0.5   0.625 0.75  0.875 1.   ]


In [12]:
# --- config files for feature extraction ---
# tsfel reads the set of features to compute from a .json config file
config_imu = tsfel.get_features_by_domain(json_path='./DogMoveFeatures.json') # tsfel config file modified to select statistical features + ecdf functions
config_behavior = tsfel.get_features_by_domain(json_path='./DogBehaviorFeatures.json') # tsfel config file modified to select rolling mean only for behavior labels

# --- windowing parameters shared by both extractor calls ---
FS = 100           # sampling frequency passed to tsfel (Hz)
WINDOW_SIZE = 200  # observations per window (2 s at 100 Hz)
OVERLAP = 0.5      # fraction of overlap between consecutive windows

# names given to the extracted channels; they become the prefixes of the output feature columns
IMU_HEADERS = ['Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz']
LABEL_HEADERS = ['DogID', 'Behavior_Lying', 'Behavior_Moving', 'Behavior_Sitting', 'Behavior_Standing']

# Per-dog results are collected in a list and concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies everything accumulated so far on every iteration.
per_dog_frames = []
rows_so_far = 0  # running total of extracted windows, used for the progress print

for group in dog_groups: # iterate over subsections dog-wise
    # split dataframe by column position
    # NOTE(review): positions 2-7 are presumably the six IMU channels named in IMU_HEADERS, and
    # positions 0, 8, 9, 10, 11 the DogID + one-hot behavior columns named in LABEL_HEADERS -
    # confirm against the column order of the loaded CSV
    data_imu = group.iloc[:, 2:8] # slice to isolate inertial measurements
    data_label_ID = group.iloc[:, [0, 8, 9, 10, 11]] # slice to isolate behavior labels and dogID

    # pass each part to tsfel.time_series_features_extractor with identical window parameters
    # so that the two outputs line up window-for-window
    data_imu_extracted = tsfel.time_series_features_extractor(config_imu, data_imu, fs=FS, window_size=WINDOW_SIZE, overlap=OVERLAP, header_names=IMU_HEADERS) # extract features, window params
    data_label_ID_extracted = tsfel.time_series_features_extractor(config_behavior, data_label_ID, fs=FS, window_size=WINDOW_SIZE, overlap=OVERLAP, header_names=LABEL_HEADERS) # extract rolling mean

    # stick dataframes back together - result is a df with extracted features for IMU data
    # and rolling windowed mean for ID and behavior columns
    rejoined = pd.concat([data_imu_extracted, data_label_ID_extracted], axis=1) # join by columns
    per_dog_frames.append(rejoined)

    # progress report in the same "(rows, columns)" form as before: cumulative rows so far
    rows_so_far += len(rejoined)
    print((rows_so_far, rejoined.shape[1]))

# join by rows; the per-dog row indices are kept as-is (not renumbered), matching the old append behaviour.
# an empty list of dogs yields an empty DataFrame instead of raising from pd.concat
final_df = pd.concat(per_dog_frames) if per_dog_frames else pd.DataFrame()

*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(2188, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(3210, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(4120, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(5140, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(6096, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(7129, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(9005, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(10926, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(13003, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(15229, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(16274, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(18339, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(20264, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(21353, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(23607, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(24549, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(26727, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(28520, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(30888, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(32031, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(34540, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(36845, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(38900, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(41147, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(42999, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(43944, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(44894, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(45953, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(46810, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(47748, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(48858, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(50101, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(51173, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(52164, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(53105, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(54222, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(55260, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(56275, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(57465, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(58526, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(59640, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(60939, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(62140, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(63491, 65)
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(65497, 65)


In [17]:
# sanity check: number of distinct DogID_Mean values after the slicing and sticking operation
# (the recorded run shows 45, i.e. all 45 dogs made it through safely)
np.unique(final_df['DogID_Mean']).size

45

In [14]:
# write the extracted feature table to disk; the row index is not written
final_df.to_csv(path_or_buf='DogMoveDataExtracted.csv', index=False)