In [1]:
import config
import pandas as pd
import glob

In [2]:
# Display helpers, both taken from the public IPython.display module
# (IPython.core.display is an internal path and should not be imported from directly).
from IPython.display import HTML, display

# Inject CSS so the notebook's `.container` element spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

### Notes:

- In the previous kata we explored generating features manually and successfully increased accuracy from 78% to 85% using a RandomForestClassifier
    - We only experimented with this on one participant from the dataset
- Today we will focus on extending that approach to all participants in the dataset and comparing the results

### Accomplished:

- Explored the performance of a random forest (an ensemble of 100 trees) on each participant's dataset separately
- Cross-validated the results with 5-fold

### Load Data

In [3]:
# Read every CSV found under config.data_dir into its own DataFrame.
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the
# paths are sorted to make the file -> participant_id mapping reproducible.
path_to_files = sorted(glob.glob(config.data_dir+'*.csv'))
dfs = []
for i,f in enumerate(path_to_files):
    # The files are read with explicit column names taken from config.cols.
    df_i = pd.read_csv(f,names=config.cols)
    # Tag each row with the index of the file it came from.
    df_i['participant_id'] = i
    dfs.append(df_i)
print("number of participants", len(dfs))

number of participants 15


### Feature Engineering

- from kata_04272017
- The data seems to contain unknown labels, marked as 0; we will remove these for now

In [4]:
def create_features(df_i, window_size=None):
    """Add rolling-window statistics of the three acceleration columns.

    For each of ``x_acceleration``, ``y_acceleration`` and ``z_acceleration``
    the rolling mean, median, max and min are computed and stored in columns
    named ``<axis>_win_<stat>`` (e.g. ``x_win_mean``). Columns are appended in
    stat-major order: all means, then medians, maxes and mins.

    Parameters
    ----------
    df_i : pandas.DataFrame
        Frame containing the ``x_acceleration``, ``y_acceleration`` and
        ``z_acceleration`` columns. It is NOT modified; the features are
        written to a copy.
    window_size : int, optional
        Number of rows in each rolling window. When omitted it falls back to
        ``int(config.sampling_freq)``, looked up at call time.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the twelve feature
        columns, with every row that contains a missing value removed.
    """
    if window_size is None:
        window_size = int(config.sampling_freq)

    # Work on a copy so the caller's frame does not silently gain columns.
    features = df_i.copy()

    axes = ('x', 'y', 'z')
    # Build each rolling view once per axis and reuse it for all statistics.
    # min_periods=1 lets the first rows use a partial window instead of NaN.
    windows = {
        axis: features[axis + '_acceleration'].rolling(window_size, min_periods=1)
        for axis in axes
    }

    for stat in ('mean', 'median', 'max', 'min'):
        for axis in axes:
            features['{}_win_{}'.format(axis, stat)] = getattr(windows[axis], stat)()

    return features.dropna()

In [5]:
# Build the rolling-window features for every participant frame, using a
# window of three times config.sampling_freq (truncated to an int).
dfs_processed = list(
    map(lambda frame: create_features(frame, window_size=int(config.sampling_freq*3.0)), dfs)
)

# Keep only the rows whose label is greater than zero.
dfs_processed = [frame.loc[frame['label'] > 0] for frame in dfs_processed]

### ML with cross-validation

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [15]:
# Random forest with 100 estimators and a fixed random_state so repeated runs
# build the same ensemble; n_jobs=-1 is passed through to scikit-learn.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    random_state=2,
    n_jobs=-1,
)

In [11]:
def train_and_score(clf, X, y, cv=5, n_jobs=-1):
    """Cross-validate ``clf`` on ``(X, y)`` and return the per-fold scores.

    Parameters
    ----------
    clf : estimator
        The model handed to ``cross_val_score``.
    X, y :
        Feature matrix and target passed straight to ``cross_val_score``.
    cv : optional
        Forwarded as ``cross_val_score``'s ``cv`` argument. Defaults to 5,
        the value this notebook has always used.
    n_jobs : int, optional
        Forwarded as ``cross_val_score``'s ``n_jobs`` argument. Defaults to -1.

    Returns
    -------
    The value returned by ``cross_val_score`` (one score per fold).
    """
    return cross_val_score(clf, X, y, cv=cv, n_jobs=n_jobs)

##### Create a model and calculate score for each participant separately

In [None]:
# Feature columns: the three raw acceleration axes followed by the rolling
# statistics, grouped by statistic (mean, median, max, min) and then by axis.
X_cols = ['{}_acceleration'.format(axis) for axis in 'xyz'] + [
    '{}_win_{}'.format(axis, stat)
    for stat in ('mean', 'median', 'max', 'min')
    for axis in 'xyz'
]

# Target column.
y_cols = 'label'

# One (X, y) tuple per participant frame.
data = []
for df_i in dfs_processed:
    data.append((df_i[X_cols], df_i[y_cols]))

In [67]:
%%time
# Cross-validate the same classifier on each participant's (X, y) pair and
# report the mean fold score with a +/- two-standard-deviation spread.
for i,d in enumerate(data):
    X,y = d
    clf_score = train_and_score(clf_rf,X,y)
    participant_tag = "Participant {}\t".format(i)
    accuracy_tag = "Accuracy: %0.2f (+/- %0.2f)" % (clf_score.mean(), clf_score.std() * 2)
    print(participant_tag,accuracy_tag)

Participant 0	 Accuracy: 0.86 (+/- 0.13)
Participant 1	 Accuracy: 0.73 (+/- 0.34)
Participant 2	 Accuracy: 0.73 (+/- 0.36)
Participant 3	 Accuracy: 0.74 (+/- 0.18)
Participant 4	 Accuracy: 0.72 (+/- 0.33)
Participant 5	 Accuracy: 0.76 (+/- 0.22)
Participant 6	 Accuracy: 0.88 (+/- 0.11)
Participant 7	 Accuracy: 0.75 (+/- 0.19)
Participant 8	 Accuracy: 0.67 (+/- 0.25)
Participant 9	 Accuracy: 0.84 (+/- 0.24)
Participant 10	 Accuracy: 0.80 (+/- 0.32)
Participant 11	 Accuracy: 0.90 (+/- 0.13)
Participant 12	 Accuracy: 0.87 (+/- 0.11)
Participant 13	 Accuracy: 0.74 (+/- 0.29)
Participant 14	 Accuracy: 0.71 (+/- 0.33)
CPU times: user 4.03 s, sys: 1.17 s, total: 5.2 s
Wall time: 9min 34s


##### Run with combined dataset

In [7]:
# Feature columns: the three raw acceleration axes followed by the rolling
# statistics, grouped by statistic (mean, median, max, min) and then by axis.
X_cols = ['{}_acceleration'.format(axis) for axis in 'xyz'] + [
    '{}_win_{}'.format(axis, stat)
    for stat in ('mean', 'median', 'max', 'min')
    for axis in 'xyz'
]

# Target column.
y_cols = 'label'

# Stack all participant frames into one and build a single (X, y) tuple from it.
df_concat = pd.concat(dfs_processed)
data = (df_concat.loc[:, X_cols], df_concat.loc[:, y_cols])

In [16]:
%%time
# Cross-validate the classifier once on the (X, y) pair held in `data`.
X,y = data
clf_score = train_and_score(clf_rf,X,y)
# This score covers the pooled data, so it is labelled as such instead of
# with a participant index (no single participant applies to this result).
print("Combined dataset\t","Accuracy: %0.2f (+/- %0.2f)" % (clf_score.mean(), clf_score.std() * 2))

Participant 14	 Accuracy: 0.44 (+/- 0.05)
CPU times: user 3.56 s, sys: 180 ms, total: 3.74 s
Wall time: 5min 22s
