This notebooks takes a sample of the dataset (for the user with id: 1) and applies Butterworth filter to it

In [1]:
import pandas as pd

In [2]:
pd.set_option('display.max_columns', None)
data = pd.read_csv("../labeled_data/1.csv")
print("Shape", data.shape)
data.describe()

In [5]:
# Drop NaN columns
data.dropna(axis=1, how="all", inplace=True)
# Reinspect the data
data.describe()

**Data ranges are quite wide in some columns, e.g. Acc_X (min: -34, median: 9.33, max: 103.96), meaning that we should scale to improve model performance**

In [7]:
data.sample(5)

Unnamed: 0,Surface,SensorLocation,PacketCounter,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,FreeAcc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,Mag_Y,Mag_Z,VelInc_X,VelInc_Y,VelInc_Z,OriInc_q0,OriInc_q1,OriInc_q2,OriInc_q3,Roll,Pitch,Yaw,Altitude
445853,FE,Wrist,1865,-8.77704,1.515643,1.623755,-0.431544,-0.114786,-0.769973,1.203194,-0.157714,0.123769,0.89917,0.175049,-0.319824,-0.087793,0.015004,0.016259,0.999981,0.006016,-0.000789,0.000619,54.667866,75.449349,-63.751235,0.0
27115,CALIB,Left shank,32370,9.585855,2.040662,1.556372,0.034695,0.217064,0.108346,-0.015525,-0.023371,-0.008989,-0.835938,0.227783,-0.099854,0.095858,0.020404,0.015573,1.0,-7.8e-05,-0.000117,-4.5e-05,56.393134,-75.878093,-148.089206,0.0
122967,StrU,Left thigh,51296,2.686236,-0.239472,0.297322,1.129819,1.504742,-7.858094,-1.190471,2.775709,0.071911,-0.730225,-0.903076,-1.046631,0.026901,-0.002369,0.002614,0.999886,-0.005952,0.013878,0.00036,-2.292033,-39.025269,60.083095,0.0
368915,SlpD,Trunk,32090,4.057778,-0.991589,-1.90364,0.498861,0.859253,-5.331008,0.741945,0.146252,0.011996,-0.905762,-0.412842,0.581299,0.040564,-0.009843,-0.019103,0.999993,0.00371,0.000731,6e-05,-174.804069,-71.084162,-64.214549,0.0
278632,FE,Right thigh,7373,8.923746,-0.332317,0.21752,-0.732345,2.841671,-1.375962,-1.629628,-1.29401,0.128981,-0.804932,0.485352,0.150635,0.089223,-0.003245,0.002779,0.999946,-0.008148,-0.00647,0.000645,-175.527599,-71.780748,97.59159,0.0


In [8]:
# Some columns are just meaningless
print(data[["PacketCounter"]].join(data[["Altitude"]]))
data.drop(["PacketCounter", "Altitude"], axis=1, inplace=True)

        PacketCounter  Altitude
0               63872       0.0
1               63873       0.0
2               63874       0.0
3               63875       0.0
4               63876       0.0
...               ...       ...
558709          25830       0.0
558710          25831       0.0
558711          25832       0.0
558712          25833       0.0
558713          25834       0.0

[558714 rows x 2 columns]


#### Bring all columns to numeric data formats

In [9]:
data["Surface"], _ = data["Surface"].factorize()
data["SensorLocation"], _ = data["SensorLocation"].factorize()

In [10]:
data.dtypes

Surface             int64
SensorLocation      int64
Acc_X             float64
Acc_Y             float64
Acc_Z             float64
FreeAcc_X         float64
FreeAcc_Y         float64
FreeAcc_Z         float64
Gyr_X             float64
Gyr_Y             float64
Gyr_Z             float64
Mag_X             float64
Mag_Y             float64
Mag_Z             float64
VelInc_X          float64
VelInc_Y          float64
VelInc_Z          float64
OriInc_q0         float64
OriInc_q1         float64
OriInc_q2         float64
OriInc_q3         float64
Roll              float64
Pitch             float64
Yaw               float64
dtype: object

In [11]:
data

Unnamed: 0,Surface,SensorLocation,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,FreeAcc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,Mag_Y,Mag_Z,VelInc_X,VelInc_Y,VelInc_Z,OriInc_q0,OriInc_q1,OriInc_q2,OriInc_q3,Roll,Pitch,Yaw
0,0,0,9.855517,1.419511,1.056100,0.000000,-0.000000,0.200383,0.016545,-0.029152,-0.009963,-0.963623,0.235596,-0.266357,0.098554,0.014189,0.010577,1.000000,0.000083,-0.000146,-0.000050,53.299628,-79.819856,-120.515449
1,0,0,9.618346,1.531807,0.793353,0.385820,-0.297160,-0.053012,0.061748,-0.028399,-0.009419,-0.958984,0.235596,-0.266357,0.096183,0.015311,0.007952,1.000000,0.000309,-0.000142,-0.000047,46.646334,-79.631895,-115.706065
2,0,0,9.340269,1.537687,0.719518,0.436632,-0.315490,-0.334679,0.084028,-0.025894,-0.019274,-0.956787,0.237793,-0.266357,0.093403,0.015365,0.007214,1.000000,0.000420,-0.000129,-0.000096,46.794904,-79.634026,-115.808157
3,0,0,9.262140,1.534052,0.871657,0.379882,-0.161510,-0.393031,0.130678,-0.017727,-0.027842,-0.956787,0.244385,-0.268799,0.092623,0.015322,0.008735,1.000000,0.000653,-0.000089,-0.000139,46.968656,-79.627834,-115.908051
4,0,0,9.087550,2.048158,0.982283,0.825285,0.149745,-0.483171,0.119649,-0.008019,-0.037221,-0.954346,0.237793,-0.266357,0.090879,0.020459,0.009839,1.000000,0.000598,-0.000040,-0.000186,47.134912,-79.615354,-116.007378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558709,5,0,12.380745,5.285336,-1.026498,1.841389,0.373530,3.556620,-0.113998,-1.496558,0.001963,-0.900635,0.083740,-0.221680,0.123879,0.052849,-0.009368,0.999972,-0.000570,-0.007483,0.000010,101.118168,-74.405827,-166.211231
558710,5,0,11.609036,2.817452,-1.885530,0.166782,-1.240775,2.216199,-1.263676,-1.431789,0.011664,-0.907715,0.096680,-0.204834,0.116221,0.028065,-0.018201,0.999954,-0.006318,-0.007159,0.000058,103.259561,-74.237416,-169.185534
558711,5,0,10.621329,1.171285,-2.344570,-0.925919,-2.178039,0.868089,-1.487070,-1.169878,-0.289941,-0.909668,0.074951,-0.197510,0.106365,0.011387,-0.022909,0.999954,-0.007435,-0.005849,-0.001450,104.609544,-73.906827,-171.468586
558712,5,0,10.188926,0.569020,-2.817934,-1.261935,-2.763749,0.328654,-0.591665,-0.954093,-0.473706,-0.923828,0.087646,-0.184814,0.102035,0.005366,-0.027709,0.999981,-0.002958,-0.004770,-0.002369,105.833962,-73.499663,-173.095611


In [12]:
# There are still some NaN rows, which we'll get rid off
na_rows = data[data.Acc_X.isna()]
print("Total NaN rows: ", len(na_rows))
print("Samples of NaN rows:")
na_rows[:5]

Total NaN rows:  58
Samples of NaN rows:


Unnamed: 0,Surface,SensorLocation,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,FreeAcc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,Mag_Y,Mag_Z,VelInc_X,VelInc_Y,VelInc_Z,OriInc_q0,OriInc_q1,OriInc_q2,OriInc_q3,Roll,Pitch,Yaw
13557,5,5,,,,,,,,,,,,,,,,,,,,,,
13558,5,5,,,,,,,,,,,,,,,,,,,,,,
13559,5,5,,,,,,,,,,,,,,,,,,,,,,
13560,5,5,,,,,,,,,,,,,,,,,,,,,,
15849,5,3,,,,,,,,,,,,,,,,,,,,,,


In [13]:
data.dropna(axis=0, inplace=True)

In [14]:
# Count rows after dropping NaN rows
data.shape

(558656, 24)

#### Before scaling, let's apply Butterworth filter

In [15]:
from scipy import signal

sos = signal.butter(2, 6, btype="lp", fs=100, output="sos")
for c in data.columns[2:]:
    data[c] = signal.sosfilt(sos, data[c])

In [16]:
data

Unnamed: 0,Surface,SensorLocation,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,FreeAcc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,Mag_Y,Mag_Z,VelInc_X,VelInc_Y,VelInc_Z,OriInc_q0,OriInc_q1,OriInc_q2,OriInc_q3,Roll,Pitch,Yaw
0,0,0,0.274572,0.039547,0.029423,0.000000,0.000000,0.005583,0.000461,-0.000812,-0.000278,-0.026846,0.006564,-0.007421,0.002746,0.000395,0.000295,0.027860,0.000002,-0.000004,-0.000001,1.484915,-2.223763,-3.357532
1,0,0,1.222236,0.180121,0.124361,0.010749,-0.008279,0.017925,0.003322,-0.003614,-0.001227,-0.120021,0.029375,-0.033211,0.012222,0.001800,0.001246,0.124686,0.000017,-0.000018,-0.000006,6.460350,-9.947169,-14.892581
2,0,0,2.712953,0.410293,0.259897,0.049522,-0.037562,0.016477,0.010874,-0.007971,-0.002987,-0.268268,0.065806,-0.074329,0.027129,0.004101,0.002604,0.279059,0.000054,-0.000040,-0.000015,14.048316,-22.251084,-33.034120
3,0,0,4.331996,0.670756,0.396961,0.112421,-0.080921,-0.017284,0.024137,-0.012369,-0.005799,-0.432067,0.106477,-0.119930,0.043320,0.006704,0.003978,0.450005,0.000121,-0.000062,-0.000029,22.151792,-35.866961,-52.905943
4,0,0,5.828968,0.934256,0.529149,0.193133,-0.110968,-0.079858,0.042188,-0.015503,-0.009929,-0.586609,0.145348,-0.163148,0.058290,0.009336,0.005302,0.611627,0.000211,-0.000078,-0.000050,29.673240,-48.734880,-71.589983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558709,5,0,10.964821,-0.021870,-4.546547,-0.945116,-5.098233,0.546932,0.743814,-2.219710,-0.728531,-0.894850,0.119405,-0.260375,0.110250,-0.001151,-0.043889,0.999608,0.003721,-0.011096,-0.003643,84.097977,-76.051035,-153.345997
558710,5,0,11.195818,0.945932,-3.477958,-0.702196,-3.628974,1.128230,1.284100,-1.859240,-0.690389,-0.900004,0.110911,-0.244720,0.112378,0.008737,-0.033410,0.999704,0.006421,-0.009295,-0.003452,89.171513,-75.808127,-157.842392
558711,5,0,11.371426,1.745407,-2.662180,-0.446572,-2.481636,1.590433,1.343089,-1.594546,-0.598439,-0.903746,0.103374,-0.231352,0.113994,0.016921,-0.025449,0.999789,0.006716,-0.007972,-0.002992,93.697180,-75.486002,-161.726226
558712,5,0,11.406374,2.179738,-2.148389,-0.328878,-1.764614,1.803741,1.093489,-1.393152,-0.506810,-0.906942,0.096742,-0.219585,0.114246,0.021404,-0.020497,0.999858,0.005467,-0.006965,-0.002534,97.565948,-75.118826,-165.073126


In [17]:
# Reinspect data
data.describe()

Unnamed: 0,Surface,SensorLocation,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,FreeAcc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,Mag_Y,Mag_Z,VelInc_X,VelInc_Y,VelInc_Z,OriInc_q0,OriInc_q1,OriInc_q2,OriInc_q3,Roll,Pitch,Yaw
count,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0,558656.0
mean,4.348073,2.499876,7.265727,0.567752,-0.660281,0.025788,0.011237,0.00766,-0.004179,0.011058,0.00374,-0.557319,-0.042845,-0.017552,0.072628,0.005707,-0.006537,0.999883,-2.2e-05,5.5e-05,1.9e-05,-4.793356,-47.203209,5.861633
std,2.905121,1.707821,8.614531,2.550242,4.420545,3.27488,5.077369,4.497234,1.059509,2.153199,1.06988,0.635438,0.503457,0.403099,0.086213,0.025894,0.044602,0.002238,0.005305,0.010768,0.00535,112.390474,53.788027,93.967385
min,0.0,0.0,-34.242811,-16.430477,-31.334842,-29.697894,-30.389459,-26.869339,-8.140505,-7.484386,-4.53006,-2.292965,-1.877459,-3.943682,-0.348027,-0.159699,-0.320558,0.02786,-0.040689,-0.03741,-0.02553,-196.286511,-94.688277,-196.132527
25%,2.0,1.0,5.254875,-0.897328,-2.476811,-0.939218,-1.669901,-1.835062,-0.567253,-1.151508,-0.326936,-0.908822,-0.474406,-0.297045,0.052416,-0.009007,-0.024787,0.999859,-0.002839,-0.005763,-0.001636,-121.563393,-78.613913,-75.258293
50%,5.0,2.0,9.477075,0.344384,-0.583415,0.024018,0.002227,0.026317,-0.006245,-0.096573,-0.014353,-0.79544,-0.064668,-0.00761,0.09478,0.00343,-0.005542,0.999966,-3.1e-05,-0.000483,-7.2e-05,4.092174,-71.090969,-6.093169
75%,7.0,4.0,11.933736,1.965163,1.503081,0.994179,1.651147,2.333952,0.590471,0.316637,0.222279,-0.617712,0.436947,0.259388,0.119463,0.019719,0.015295,0.999994,0.002954,0.001585,0.001113,62.945688,-51.916508,97.013466
max,9.0,5.0,33.277815,21.788693,23.332535,28.16193,31.168744,121.969546,11.003526,10.484148,6.992601,3.085087,1.718803,1.97899,1.276434,0.224424,0.230868,1.045715,0.054988,0.052397,0.034952,196.788804,89.633476,196.793206


**Value ranges are a bit narrower now**

Dump filtered dataset into a CSV file for further analysis

In [None]:
data.to_csv("filtered_dataset_sample.csv", index=False)