In [26]:
# feature extractor

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [27]:
# function to only get relevant correlations
def get_redundant_pairs(df):
    """Return the column-label pairs on and below the correlation-matrix diagonal.

    For every column position ``hi`` and every position ``lo <= hi`` the tuple
    ``(df.columns[hi], df.columns[lo])`` is included, so the result holds each
    self-pair plus one orientation of every unordered column pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels define the pairs.

    Returns
    -------
    set of tuple
        The ``(label, label)`` pairs described above; empty when ``df`` has
        no columns.
    """
    labels = df.columns
    return {
        (labels[hi], labels[lo])
        for hi in range(len(labels))
        for lo in range(hi + 1)
    }




def extract_features(file_name, id_offset=25):
    """Summarise one 19-channel recording as a flat dict of features.

    The CSV is read without a header row and its columns are labelled with the
    fixed channel names listed below. Every channel is mean-centred (no
    variance scaling is applied). The four EMG channels keep every row; the
    remaining channels keep every 10th row (the original note states 1000 Hz
    for EMG and 100 Hz for the other sensors).

    Parameters
    ----------
    file_name : str or path-like
        CSV file to read. It must contain exactly 19 columns, otherwise
        assigning the channel names raises ``ValueError``.
    id_offset : int, default 25
        Number of leading characters of ``str(file_name)`` dropped to form the
        ``"id"`` value.
        NOTE(review): the default of 25 presumably equals the length of the
        directory prefix of the original data paths - confirm before using a
        different folder layout.

    Returns
    -------
    dict
        Keys, in insertion order:

        * ``"id"`` and ``"n_rows"`` (row count of the raw file),
        * ``"<channel>_<feature>"`` for each feature in ``mean, var, mad,
          rms, iqr, PE, kurtosis, Max, Min`` over all 19 channels,
        * ``"cor_<a>_<b>"`` for every unordered pair of EMG channels,
          followed by every unordered pair of the other channels.
    """
    cols = ["emg1", "emg2", "emg3", "emg4", "airborne", "acc_u_x", "acc_u_y", "acc_u_z", "gonio_x",
            "acc_l_x", "acc_l_y", "acc_l_z", "gonio_y", "gyro_u_x", "gyro_u_y", "gyro_u_z",
            "gyro_l_x", "gyro_l_y", "gyro_l_z"]

    raw_data = pd.read_csv(file_name, header=None)
    raw_data.columns = cols

    # Mean-centre each channel; this is centring only, not standardisation.
    centered = raw_data.subtract(raw_data.mean())

    # EMG keeps every sample, the other sensors are thinned to every 10th row.
    emg = centered.iloc[:, :4]
    other = centered.iloc[::10, 4:].reset_index(drop=True)
    groups = (emg, other)

    # (name suffix, column-wise reduction). Keeping the name next to the
    # computation guarantees names and values cannot drift out of step.
    feature_table = [
        ("mean", lambda frame: frame.mean()),
        ("var", lambda frame: frame.var()),
        # DataFrame.mad() was removed in pandas 2.0; this is its definition
        # (mean absolute deviation around the mean).
        ("mad", lambda frame: (frame - frame.mean()).abs().mean()),
        ("rms", lambda frame: np.sqrt((frame ** 2).sum() / len(frame))),
        ("iqr", lambda frame: frame.quantile(0.75) - frame.quantile(0.25)),
        # "PE" is the 75th percentile.
        ("PE", lambda frame: frame.quantile(0.75)),
        ("kurtosis", lambda frame: frame.kurtosis()),
        ("Max", lambda frame: frame.max()),
        ("Min", lambda frame: frame.min()),
    ]

    features = {
        "id": str(file_name)[id_offset:],
        "n_rows": len(raw_data),
    }

    # Per-channel statistics: for each feature, EMG channels first and then
    # the remaining channels, which reproduces the order of `cols`.
    for suffix, reduce_columns in feature_table:
        for frame in groups:
            reduced = reduce_columns(frame)
            for channel, value in zip(reduced.index, reduced.tolist()):
                features[channel + "_" + suffix] = value

    # Pairwise correlations within each group. Unstacking the correlation
    # matrix gives one entry per ordered pair; dropping the diagonal and
    # lower-triangle pairs leaves each unordered pair exactly once.
    for frame in groups:
        pair_corr = frame.corr().unstack()
        pair_corr = pair_corr.drop(labels=list(get_redundant_pairs(frame)))
        for (first, second), value in zip(pair_corr.index, pair_corr.tolist()):
            features["cor_" + str(first) + "_" + str(second)] = value

    return features


In [29]:
# Parent folder that is searched (recursively) for the recordings.
dataFolder = 'testf_subjects/'

# Every file below dataFolder whose name starts with "Subject" and ends in
# ".csv" (extension matched case-insensitively).
result = list(Path(dataFolder).rglob("Subject*.[cC][sS][vV]"))
if not result:
    # Fail with a clear message instead of an IndexError further down.
    raise FileNotFoundError("No Subject*.csv files found under " + repr(dataFolder))

# One feature dict per file, assembled into a frame in a single step.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration; building from a list of records avoids both problems.
df = pd.DataFrame([extract_features(csv_path) for csv_path in result])

df.to_csv('test_centered_extracted_feature.csv', index=False)