In [1]:
import config
import glob
import pandas as pd

In [2]:
# Display helpers, both taken from the public IPython.display module.
from IPython.display import HTML, display

# Inject a CSS rule so the notebook container spans the full browser width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

### Notes:

- In the previous kata we learned that sensor readings vary significantly between participants, and that there may be additional features we can extract.
- Today we will focus on the data from a single participant, taking guidance from the following publications:
    - http://web2.utc.edu/~swf134/research/BSN_Event_Classification_Cameral_Ready_2010.pdf
    - http://cs.gmu.edu/~jessica/SAX_DAMI_preprint.pdf
- Ideas:
    - We could use TSFRESH package (Time Series Feature extraction based on scalable hypothesis tests)
    - Manually select features:
        - Windowed mean/median/std/max/min
        - Sensor rate-of-change (explored in kata_04212017); for acceleration data this is the 'jerk' measurement (the time derivative of acceleration)
    - Can we do an adaptive window function based on how many features are in a given time range?
        

### Accomplished:
- Increased 5-fold cross-validated accuracy from 78% (raw features only) to 85% by adding windowed statistics
- We noted that the rate-of-change ('jerk') features do not rank highly in the feature-importance ranking

### Load Data

In [3]:
# Collect the per-participant CSV files. glob returns matches in arbitrary
# order, so sort the paths to keep the participant ids assigned below stable
# across machines and re-runs.
path_to_files = sorted(glob.glob(config.data_dir+'*.csv'))
if not path_to_files:
    # Fail with a clear message instead of the opaque error pd.concat raises
    # when it is handed an empty list.
    raise FileNotFoundError('no CSV files found under {}'.format(config.data_dir))

dfs = []
for i,f in enumerate(path_to_files):
    # Column labels are supplied by config.cols rather than read from the file.
    df_i = pd.read_csv(f,names=config.cols)

    # create column containing participant id (position of the file in the sorted list)
    df_i['participant_id'] = i
    dfs.append(df_i)

    print('participant {} data:\t'.format(i),df_i.shape[0])

# Stack all participants; reset_index() keeps each file's original row number
# as an extra 'index' column.
df = pd.concat(dfs).reset_index()
print('\nTotal combined data: ',df.shape)

participant 0 data:	 166741
participant 1 data:	 140901
participant 2 data:	 126801
participant 3 data:	 122201
participant 4 data:	 67651
participant 5 data:	 138001
participant 6 data:	 160001
participant 7 data:	 114702
participant 8 data:	 103501
participant 9 data:	 102341
participant 10 data:	 162501
participant 11 data:	 163001
participant 12 data:	 116101
participant 13 data:	 104451
participant 14 data:	 138001

Total combined data:  (1926896, 7)


In [6]:
# Work on an independent copy of the rows belonging to participant 0,
# so later column additions do not touch the combined frame.
is_participant_0 = df['participant_id'] == 0
df_0 = df.loc[is_participant_0].copy()
print(df_0.shape)

(166741, 7)


### Feature Engineering

- Manually compute rolling-window statistics (mean/median/max/min) and the first difference for each acceleration axis

In [50]:
# Rolling-window length in samples: 3.0 * config.sampling_freq
# (presumably three seconds of data, assuming sampling_freq is in Hz).
WINDOW_SIZE = int(config.sampling_freq*3.0)
print("window size: ",WINDOW_SIZE)

AXES = ('x', 'y', 'z')
WINDOW_STATS = ('mean', 'median', 'max', 'min')

# Trailing-window statistics for every axis, stored as '<axis>_win_<stat>'.
# min_periods=1 lets the first rows use a shorter window instead of giving NaN.
# The statistic is the outer loop so the column order is
# x/y/z mean, x/y/z median, x/y/z max, x/y/z min.
for stat in WINDOW_STATS:
    for axis in AXES:
        window = df_0[axis + '_acceleration'].rolling(WINDOW_SIZE, min_periods=1)
        df_0['{}_win_{}'.format(axis, stat)] = getattr(window, stat)()

# Sample-to-sample first difference of each axis, stored as x2 / y2 / z2.
for axis in AXES:
    df_0[axis + '2'] = df_0[axis + '_acceleration'].diff()

# diff() leaves NaN in the first row, so that row is removed here.
# NOTE: df_0 is rebound, so re-running this cell without first re-running the
# participant-selection cell drops one more leading row each time.
df_0 = df_0.dropna()
df_0.head()

window size:  156


Unnamed: 0,index,sequential_number,x_acceleration,y_acceleration,z_acceleration,label,participant_id,x_win_mean,y_win_mean,z_win_mean,...,z_win_median,x_win_max,y_win_max,z_win_max,x_win_min,y_win_min,z_win_min,x2,y2,z2
1,1,1.0,2249,2677,2046,1,0,2268.0,2812.5,2117.5,...,2117.5,2287.0,2948.0,2189.0,2249.0,2677.0,2046.0,-38.0,-271.0,-143.0
2,2,2.0,2270,2568,2005,1,0,2268.666667,2731.0,2080.0,...,2046.0,2287.0,2948.0,2189.0,2249.0,2568.0,2005.0,21.0,-109.0,-41.0
3,3,3.0,2222,2565,2003,1,0,2257.0,2689.5,2060.75,...,2025.5,2287.0,2948.0,2189.0,2222.0,2565.0,2003.0,-48.0,-3.0,-2.0
4,4,4.0,2235,2571,2074,1,0,2252.6,2665.8,2063.4,...,2046.0,2287.0,2948.0,2189.0,2222.0,2565.0,2003.0,13.0,6.0,71.0
5,5,5.0,2205,2559,2075,1,0,2244.666667,2648.0,2065.333333,...,2060.0,2287.0,2948.0,2189.0,2205.0,2559.0,2003.0,-30.0,-12.0,1.0


In [51]:
# Engineered feature matrix: raw axes, then the rolling statistics grouped by
# statistic (x/y/z for each), then the first differences.
raw_cols = ['x_acceleration', 'y_acceleration', 'z_acceleration']
window_cols = ['{}_win_{}'.format(axis, stat)
               for stat in ('mean', 'median', 'max', 'min')
               for axis in ('x', 'y', 'z')]
diff_cols = ['x2', 'y2', 'z2']

X = df_0[raw_cols + window_cols + diff_cols]
# Target: the label column.
y = df_0['label']

In [52]:
# Baseline inputs: only the three unprocessed acceleration axes, taken from
# the same rows as the engineered features so the two runs are comparable.
X_raw = df_0.loc[:, ['x_acceleration', 'y_acceleration', 'z_acceleration']]
y_raw = df_0.loc[:, 'label']

### ML with cross-validation

In [25]:
from sklearn.ensemble import  RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [26]:
# Shared classifier for both experiments: a 10-tree random forest with a
# fixed seed; n_jobs=-1 asks for all available cores.
clf_rf = RandomForestClassifier(
    n_estimators=10,
    random_state=2,
    n_jobs=-1,
)

#### Using raw features

In [54]:
%%time
# 5-fold cross-validation using only the raw acceleration columns.
rf_scores = cross_val_score(clf_rf, X_raw, y_raw, cv=5, n_jobs=-1)
print(rf_scores)

# Summarise the folds: mean score with a band of two standard deviations.
mean_score = rf_scores.mean()
two_std = rf_scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, two_std))

[ 0.68364869  0.8311443   0.84298908  0.80874415  0.72489279]
Accuracy: 0.78 (+/- 0.13)
CPU times: user 164 ms, sys: 52 ms, total: 216 ms
Wall time: 4.89 s


#### Using preprocessed features

In [53]:
%%time
# 5-fold cross-validation using the raw columns plus the engineered features.
rf_scores = cross_val_score(clf_rf, X, y, cv=5, n_jobs=-1)
print(rf_scores)

# Summarise the folds: mean score with a band of two standard deviations.
mean_score = rf_scores.mean()
two_std = rf_scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)\n" % (mean_score, two_std))

[ 0.74290084  0.92989085  0.91465755  0.88017272  0.76837497]
Accuracy: 0.85 (+/- 0.15)

CPU times: user 348 ms, sys: 68 ms, total: 416 ms
Wall time: 7.65 s


### Feature Ranking

In [55]:
import numpy as np

# Fit on the full engineered feature set, then list the features from most
# to least important according to the forest's feature_importances_.
clf_rf.fit(X,y)
print("Feature ranking:")
importances = clf_rf.feature_importances_
# argsort is ascending, so reverse it to get the largest importance first.
indices = np.argsort(importances)[::-1]
for rank, col_pos in enumerate(indices, start=1):
    feature_name = X.columns[int(col_pos)]
    print("{}.\tfeature {}\t({})".format(rank, feature_name, importances[col_pos]))

Feature ranking:
1.	feature z_win_min	(0.16798239935779186)
2.	feature z_win_median	(0.14005478480057917)
3.	feature z_win_mean	(0.13210159169068286)
4.	feature y_win_min	(0.08917659279525886)
5.	feature y_win_max	(0.08777851991931403)
6.	feature z_win_max	(0.07056818296147796)
7.	feature y_win_mean	(0.0651225194739667)
8.	feature x_win_mean	(0.06453739264870412)
9.	feature x_win_max	(0.05647963016703458)
10.	feature x_win_min	(0.05226429953748055)
11.	feature x_win_median	(0.029694156601550774)
12.	feature y_win_median	(0.015236585842503944)
13.	feature x_acceleration	(0.01371711304327312)
14.	feature z_acceleration	(0.011091846627122903)
15.	feature y_acceleration	(0.003224839800244727)
16.	feature y2	(0.0003772810231072705)
17.	feature z2	(0.00031019098495536126)
18.	feature x2	(0.00028207272495125465)
