#### In this notebook, I'll attempt to derive features with tsfresh from data from multiple participants
Each participant will have data for a set of surfaces. This can add partitioning to our gait data, which probably was missing in notebook "tsfresh_pg.ipynb"

In [1]:
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Data for multiple users
dfs = []
for i in range(1, 5):
    df = pd.read_csv(f"labeled_data/{i}.csv")
    df.insert(0, "user_id", i)
    dfs.append(df)
    
data = pd.concat(dfs, ignore_index=True)
data

Unnamed: 0,user_id,Surface,SensorLocation,PacketCounter,SampleTimeFine,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,...,OriInc_q3,Roll,Pitch,Yaw,Latitude,Longitude,Altitude,Vel_X,Vel_Y,Vel_Z
0,1,FE,Left shank,63872,,9.855517,1.419511,1.056100,0.000000,-0.000000,...,-0.000050,53.299628,-79.819856,-120.515449,,,0.0,,,
1,1,FE,Left shank,63873,,9.618346,1.531807,0.793353,0.385820,-0.297160,...,-0.000047,46.646334,-79.631895,-115.706065,,,0.0,,,
2,1,FE,Left shank,63874,,9.340269,1.537687,0.719518,0.436632,-0.315490,...,-0.000096,46.794904,-79.634026,-115.808157,,,0.0,,,
3,1,FE,Left shank,63875,,9.262140,1.534052,0.871657,0.379882,-0.161510,...,-0.000139,46.968656,-79.627834,-115.908051,,,0.0,,,
4,1,FE,Left shank,63876,,9.087550,2.048158,0.982283,0.825285,0.149745,...,-0.000186,47.134912,-79.615354,-116.007378,,,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2205949,4,SlpD,Left shank,47223,,9.996389,1.835791,-3.213458,1.192275,0.206519,...,0.000641,158.536713,-75.730863,165.515741,,,0.0,,,
2205950,4,SlpD,Left shank,47224,,10.106264,1.124496,-2.693584,0.337476,0.010390,...,0.000736,159.911377,-75.590083,164.943426,,,0.0,,,
2205951,4,SlpD,Left shank,47225,,9.868540,1.362548,-2.244479,0.162789,0.518400,...,0.000809,161.376916,-75.456302,164.347004,,,0.0,,,
2205952,4,SlpD,Left shank,47226,,9.752112,1.465329,-1.954599,0.004665,0.821674,...,0.000821,162.708489,-75.351054,163.826518,,,0.0,,,


In [3]:
for user_id in data.user_id.unique():
    print(f"Surfaces for user id {user_id}:", " ".join(data[data["user_id"] == user_id].Surface.unique()))

Surfaces for user id 1: FE SlpU BnkR StrD CALIB SlpD StrU CS BnkL GR
Surfaces for user id 2: FE SlpU BnkR StrD CALIB SlpD StrU CS BnkL GR
Surfaces for user id 3: FE SlpU BnkR StrD CALIB SlpD StrU CS BnkL GR
Surfaces for user id 4: FE SlpU BnkR StrD CALIB SlpD StrU CS BnkL GR


#### Data cleaning

In [4]:
# Drop NaN columns and columns that don't carry any value
data.dropna(axis=1, how="all", inplace=True)
data.drop(["PacketCounter", "Altitude"], axis=1, inplace=True)
data

Unnamed: 0,user_id,Surface,SensorLocation,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,FreeAcc_Z,Gyr_X,...,VelInc_X,VelInc_Y,VelInc_Z,OriInc_q0,OriInc_q1,OriInc_q2,OriInc_q3,Roll,Pitch,Yaw
0,1,FE,Left shank,9.855517,1.419511,1.056100,0.000000,-0.000000,0.200383,0.016545,...,0.098554,0.014189,0.010577,1.000000,0.000083,-0.000146,-0.000050,53.299628,-79.819856,-120.515449
1,1,FE,Left shank,9.618346,1.531807,0.793353,0.385820,-0.297160,-0.053012,0.061748,...,0.096183,0.015311,0.007952,1.000000,0.000309,-0.000142,-0.000047,46.646334,-79.631895,-115.706065
2,1,FE,Left shank,9.340269,1.537687,0.719518,0.436632,-0.315490,-0.334679,0.084028,...,0.093403,0.015365,0.007214,1.000000,0.000420,-0.000129,-0.000096,46.794904,-79.634026,-115.808157
3,1,FE,Left shank,9.262140,1.534052,0.871657,0.379882,-0.161510,-0.393031,0.130678,...,0.092623,0.015322,0.008735,1.000000,0.000653,-0.000089,-0.000139,46.968656,-79.627834,-115.908051
4,1,FE,Left shank,9.087550,2.048158,0.982283,0.825285,0.149745,-0.483171,0.119649,...,0.090879,0.020459,0.009839,1.000000,0.000598,-0.000040,-0.000186,47.134912,-79.615354,-116.007378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2205949,4,SlpD,Left shank,9.996389,1.835791,-3.213458,1.192275,0.206519,0.777854,1.228656,...,0.100005,0.018618,-0.031854,0.999980,0.006143,-0.001667,0.000641,158.536713,-75.730863,165.515741
2205950,4,SlpD,Left shank,10.106264,1.124496,-2.693584,0.337476,0.010390,0.701209,1.430410,...,0.101096,0.011511,-0.026695,0.999973,0.007152,-0.001572,0.000736,159.911377,-75.590083,164.943426
2205951,4,SlpD,Left shank,9.868540,1.362548,-2.244479,0.162789,0.518400,0.384698,1.522516,...,0.098705,0.013875,-0.022203,0.999970,0.007613,-0.001386,0.000809,161.376916,-75.456302,164.347004
2205952,4,SlpD,Left shank,9.752112,1.465329,-1.954599,0.004665,0.821674,0.207085,1.443857,...,0.097532,0.014873,-0.019321,0.999973,0.007219,-0.001211,0.000821,162.708489,-75.351054,163.826518


In [5]:
# Encode string columns

## Surface LE
s_le = preprocessing.LabelEncoder()
data["Surface"] = s_le.fit_transform(data["Surface"])

## Sensor Location LE
sl_le = preprocessing.LabelEncoder()
data["SensorLocation"] = sl_le.fit_transform(data["SensorLocation"])

In [6]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)
data[[c for c in data.columns[3:]]].describe()

Unnamed: 0,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,FreeAcc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,...,VelInc_X,VelInc_Y,VelInc_Z,OriInc_q0,OriInc_q1,OriInc_q2,OriInc_q3,Roll,Pitch,Yaw
count,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0,...,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0
mean,9.28328,-0.21772,-0.7655,-0.10403,0.12382,-0.05154,-0.00073,0.00526,0.01228,-0.7328,...,0.09355,-0.00106,-0.00809,0.99989,-0.0,3e-05,6e-05,-9.06888,-63.53837,6.74081
std,130.77302,176.22476,78.43782,118.79273,119.60539,98.74853,1.65809,2.25147,0.94419,0.40949,...,0.92762,0.76833,1.53842,0.00053,0.00832,0.01128,0.00473,117.35392,34.31752,98.77096
min,-137694.66228,-185464.14533,-127.33968,-124612.79397,-257.6199,-103428.49476,-195.98288,-75.87994,-7.5989,-2.70557,...,-1169.96755,-1131.67761,-1242.8269,0.47981,-0.81129,-0.31099,-0.10222,-179.99999,-89.98266,-179.99988
25%,7.32131,-1.63826,-2.52039,-1.35913,-2.01308,-2.19203,-0.68764,-1.2475,-0.28848,-0.91626,...,0.0732,-0.01642,-0.02535,0.99987,-0.00344,-0.00624,-0.00145,-128.44487,-80.472,-76.75096
50%,9.66997,-0.19378,-0.58171,-0.00191,0.01428,0.00524,-0.00251,-0.09559,0.00161,-0.81494,...,0.09671,-0.00191,-0.00565,0.99996,-1e-05,-0.00048,1e-05,-7.40752,-74.03315,-14.68869
75%,12.4242,1.42952,1.54936,1.36136,2.05994,2.16895,0.70238,0.4907,0.28006,-0.70337,...,0.12434,0.01438,0.01577,0.99999,0.00352,0.00246,0.0014,82.08667,-62.75843,100.56133
max,128.85882,89.12148,85776.14617,104.20517,125381.67297,1898.52836,200.02696,71.96313,7.53216,3.22095,...,18.39336,1.81425,1913.78768,1.0,0.81981,0.2979,0.06324,179.99999,89.89615,179.99969


In [11]:
# Butterworth filter
from scipy import signal

columns_to_filter = ["Acc_X", "Acc_Y", "Acc_Z", "FreeAcc_X", "FreeAcc_Y", "FreeAcc_Z", "Gyr_X", "Gyr_Y", "Gyr_Z", "VelInc_X", "VelInc_Y", "VelInc_Z"]
sos = signal.butter(2, 6, btype="lp", fs=100, output="sos")

for user_id in data.user_id.unique():
    for c in columns_to_filter:
        data.loc[data.user_id == user_id, c] = signal.sosfilt(sos, data.loc[data.user_id == user_id, c])

In [12]:
data[[c for c in data.columns[3:]]].describe()

Unnamed: 0,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,FreeAcc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,...,VelInc_X,VelInc_Y,VelInc_Z,OriInc_q0,OriInc_q1,OriInc_q2,OriInc_q3,Roll,Pitch,Yaw
count,37870.0,37870.0,37870.0,37870.0,37870.0,37870.0,37870.0,37870.0,37870.0,2203722.0,...,37870.0,37870.0,37870.0,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0,2203722.0
mean,9.64369,0.1349,-0.61312,0.02499,0.01746,-0.00247,0.01522,0.00727,-0.02015,-0.7328,...,0.0964,0.00132,-0.00613,0.99989,-0.0,3e-05,6e-05,-9.06888,-63.53837,6.74081
std,5.11379,2.13928,3.89229,2.60496,4.40175,4.02941,0.88969,1.87575,0.54305,0.40949,...,0.05109,0.02137,0.0393,0.00053,0.00832,0.01128,0.00473,117.35392,34.31752,98.77096
min,-19.67423,-20.30039,-28.4386,-14.55822,-24.73766,-26.04765,-4.92158,-5.75437,-4.58586,-2.70557,...,-0.19678,-0.203,-0.28606,0.47981,-0.81129,-0.31099,-0.10222,-179.99999,-89.98266,-179.99988
25%,8.03159,-1.04506,-2.21172,-0.89585,-1.32371,-1.67966,-0.36925,-0.64531,-0.1569,-0.91626,...,0.08023,-0.01049,-0.02209,0.99987,-0.00344,-0.00624,-0.00145,-128.44487,-80.472,-76.75096
50%,9.69751,0.12516,-0.67643,0.02183,0.03466,0.01983,0.00798,-0.03011,-0.00246,-0.81494,...,0.09698,0.00124,-0.00668,0.99996,-1e-05,-0.00048,1e-05,-7.40752,-74.03315,-14.68869
75%,11.93785,1.25633,1.28206,1.10724,1.23069,1.92699,0.39826,0.26033,0.14848,-0.70337,...,0.11951,0.01256,0.01293,0.99999,0.00352,0.00246,0.0014,82.08667,-62.75843,100.56133
max,28.94291,15.78688,20.24327,17.38499,26.7106,18.3327,4.79443,7.98765,5.78088,3.22095,...,0.29199,0.1556,0.20153,1.0,0.81981,0.2979,0.06324,179.99999,89.89615,179.99969


In [19]:
data

Unnamed: 0,user_id,Surface,SensorLocation,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,FreeAcc_Z,Gyr_X,...,VelInc_X,VelInc_Y,VelInc_Z,OriInc_q0,OriInc_q1,OriInc_q2,OriInc_q3,Roll,Pitch,Yaw
0,1,4,0,0.27457,0.03955,0.02942,0.00000,0.00000,0.00558,0.00046,...,0.00275,0.00040,0.00029,1.00000,0.00008,-0.00015,-0.00005,53.29963,-79.81986,-120.51545
1,1,4,0,1.22224,0.18012,0.12436,0.01075,-0.00828,0.01793,0.00332,...,0.01222,0.00180,0.00125,1.00000,0.00031,-0.00014,-0.00005,46.64633,-79.63190,-115.70606
2,1,4,0,2.71295,0.41029,0.25990,0.04952,-0.03756,0.01648,0.01087,...,0.02713,0.00410,0.00260,1.00000,0.00042,-0.00013,-0.00010,46.79490,-79.63403,-115.80816
3,1,4,0,4.33200,0.67076,0.39696,0.11242,-0.08092,-0.01728,0.02414,...,0.04332,0.00670,0.00398,1.00000,0.00065,-0.00009,-0.00014,46.96866,-79.62783,-115.90805
4,1,4,0,5.82897,0.93426,0.52915,0.19313,-0.11097,-0.07986,0.04219,...,0.05829,0.00934,0.00530,1.00000,0.00060,-0.00004,-0.00019,47.13491,-79.61535,-116.00738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2205949,4,6,0,,,,,,,,...,,,,0.99998,0.00614,-0.00167,0.00064,158.53671,-75.73086,165.51574
2205950,4,6,0,,,,,,,,...,,,,0.99997,0.00715,-0.00157,0.00074,159.91138,-75.59008,164.94343
2205951,4,6,0,,,,,,,,...,,,,0.99997,0.00761,-0.00139,0.00081,161.37692,-75.45630,164.34700
2205952,4,6,0,,,,,,,,...,,,,0.99997,0.00722,-0.00121,0.00082,162.70849,-75.35105,163.82652


In [26]:
# Add Time column
data.insert(0, "Time", -1)
for user_id in data.user_id.unique():
    data.loc[data.user_id == user_id, "Time"] = list(range(1, len(data.loc[data.user_id == user_id, "Acc_X"]) + 1))

In [32]:
data

Unnamed: 0,Time,user_id,Surface,SensorLocation,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,FreeAcc_Z,...,VelInc_X,VelInc_Y,VelInc_Z,OriInc_q0,OriInc_q1,OriInc_q2,OriInc_q3,Roll,Pitch,Yaw
0,1,1,4,0,0.27457,0.03955,0.02942,0.00000,0.00000,0.00558,...,0.00275,0.00040,0.00029,1.00000,0.00008,-0.00015,-0.00005,53.29963,-79.81986,-120.51545
1,2,1,4,0,1.22224,0.18012,0.12436,0.01075,-0.00828,0.01793,...,0.01222,0.00180,0.00125,1.00000,0.00031,-0.00014,-0.00005,46.64633,-79.63190,-115.70606
2,3,1,4,0,2.71295,0.41029,0.25990,0.04952,-0.03756,0.01648,...,0.02713,0.00410,0.00260,1.00000,0.00042,-0.00013,-0.00010,46.79490,-79.63403,-115.80816
3,4,1,4,0,4.33200,0.67076,0.39696,0.11242,-0.08092,-0.01728,...,0.04332,0.00670,0.00398,1.00000,0.00065,-0.00009,-0.00014,46.96866,-79.62783,-115.90805
4,5,1,4,0,5.82897,0.93426,0.52915,0.19313,-0.11097,-0.07986,...,0.05829,0.00934,0.00530,1.00000,0.00060,-0.00004,-0.00019,47.13491,-79.61535,-116.00738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2205949,550772,4,6,0,,,,,,,...,,,,0.99998,0.00614,-0.00167,0.00064,158.53671,-75.73086,165.51574
2205950,550773,4,6,0,,,,,,,...,,,,0.99997,0.00715,-0.00157,0.00074,159.91138,-75.59008,164.94343
2205951,550774,4,6,0,,,,,,,...,,,,0.99997,0.00761,-0.00139,0.00081,161.37692,-75.45630,164.34700
2205952,550775,4,6,0,,,,,,,...,,,,0.99997,0.00722,-0.00121,0.00082,162.70849,-75.35105,163.82652
