In [1]:
import config
import pandas as pd
import glob

#### Notes:

- Successfully improved accuracy by 10%, from 86% to 98%
- Overall Accuracy using one participant's data to train and tested on rest, accuracy jump from 44% to 96%

#### Load Data

In [90]:
data_files = glob.glob(config.data_dir+'*.csv')
dfs=[]
for f in data_files:
    df_i = pd.read_csv(f,sep=',',names=config.cols)
    dfs.append(df_i)
df_train = dfs[0].drop('sequential_number',axis=1)
df_test = dfs
print(df_train.shape)

(166741, 4)


#### Feature Engineering

Compute the following

In [113]:
def create_features(df_i, window_size=int(config.sampling_freq) ):
    df_i['x_win_mean'] = df_i[['x_acceleration']].rolling(window_size,min_periods=1).mean()
    df_i['y_win_mean'] = df_i[['y_acceleration']].rolling(window_size,min_periods=1).mean()
    df_i['z_win_mean'] = df_i[['z_acceleration']].rolling(window_size,min_periods=1).mean()

    df_i['x_win_median'] = df_i[['x_acceleration']].rolling(window_size,min_periods=1).median()
    df_i['y_win_median'] = df_i[['y_acceleration']].rolling(window_size,min_periods=1).median()
    df_i['z_win_median'] = df_i[['z_acceleration']].rolling(window_size,min_periods=1).median()

    df_i['x_win_max'] = df_i[['x_acceleration']].rolling(window_size,min_periods=1).max()
    df_i['y_win_max'] = df_i[['y_acceleration']].rolling(window_size,min_periods=1).max()
    df_i['z_win_max'] = df_i[['z_acceleration']].rolling(window_size,min_periods=1).max()

    df_i['x_win_min'] = df_i[['x_acceleration']].rolling(window_size,min_periods=1).min()
    df_i['y_win_min'] = df_i[['y_acceleration']].rolling(window_size,min_periods=1).min()
    df_i['z_win_min'] = df_i[['z_acceleration']].rolling(window_size,min_periods=1).min()
    
    df_i['x_win_var'] = df_i[['x_acceleration']].rolling(window_size,min_periods=1).var()
    df_i['y_win_var'] = df_i[['y_acceleration']].rolling(window_size,min_periods=1).var()
    df_i['z_win_var'] = df_i[['z_acceleration']].rolling(window_size,min_periods=1).var()
    
    df_i['x_win_std'] = df_i[['x_acceleration']].rolling(window_size,min_periods=1).std()
    df_i['y_win_std'] = df_i[['y_acceleration']].rolling(window_size,min_periods=1).std()
    df_i['z_win_std'] = df_i[['z_acceleration']].rolling(window_size,min_periods=1).std()
    
    df_i['prev_label'] = df_i['label'].shift(-1).fillna(0.0).astype(int)
    df_i['prev_prev_label'] = df_i['label'].shift(-2).fillna(0.0).astype(int)
    df_i['prev_prev_prev_label'] = df_i['label'].shift(-3).fillna(0.0).astype(int)
    df_i = df_i.dropna()
        
    return df_i

In [114]:
# remove zero labels
df_p = df_train[df_train['label']>0]
print('number of rows removed: ',df_train.shape[0]-df_p.shape[0])
df_p = create_features(df_train.copy())
print(df_p.shape)

number of rows removed:  3001
(166740, 25)


In [132]:
df_p.head(5)

Unnamed: 0,sequential_number,x_acceleration,y_acceleration,z_acceleration,label,x_win_mean,y_win_mean,z_win_mean,x_win_median,y_win_median,...,z_win_min,x_win_var,y_win_var,z_win_var,x_win_std,y_win_std,z_win_std,prev_label,prev_prev_label,prev_prev_prev_label
1,1.0,2045,2039,1666,1,2051.0,2041.0,1665.5,2051.0,2041.0,...,1665.0,72.0,8.0,0.5,8.485281,2.828427,0.707107,1,1,1
2,2.0,2042,2036,1662,1,2048.0,2039.333333,1664.333333,2045.0,2039.0,...,1662.0,63.0,12.333333,4.333333,7.937254,3.511885,2.081666,1,1,1
3,3.0,2045,2042,1660,1,2047.25,2040.0,1663.25,2045.0,2040.5,...,1660.0,44.25,10.0,7.583333,6.652067,3.162278,2.753785,1,1,1
4,4.0,2043,2037,1660,1,2046.4,2039.4,1662.6,2045.0,2039.0,...,1660.0,36.8,9.3,7.8,6.0663,3.04959,2.792848,1,1,1
5,5.0,2045,2036,1661,1,2046.166667,2038.833333,1662.333333,2045.0,2038.0,...,1660.0,29.766667,9.366667,6.666667,5.455884,3.060501,2.581989,1,1,1


In [116]:
X_cols = ['x_acceleration','y_acceleration','z_acceleration',
        'x_win_mean','y_win_mean','z_win_mean',
        'x_win_median','y_win_median','z_win_median',
        'x_win_max','y_win_max','z_win_max',
        'x_win_min','y_win_min','z_win_min',
        'x_win_var','y_win_var','z_win_var',
        'x_win_std','y_win_std','z_win_std',
         'prev_label','prev_prev_label','prev_prev_label']
X,y = df_p[X_cols], df_p['label']
print(X.shape,y.shape)

(166740, 24) (166740,)


#### Training and CV

In [117]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [124]:
clf = RandomForestClassifier(n_jobs=-1,n_estimators=200,random_state=2)

In [125]:
clf_score = cross_val_score(clf,X,y,cv=5,n_jobs=-1)
print("Accuracy: %0.2f (+/- %0.2f)" % (clf_score.mean(), clf_score.std() * 2))

Accuracy: 0.98 (+/- 0.09)


#### Test Data

In [126]:
clf.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=-1, oob_score=False, random_state=2,
            verbose=0, warm_start=False)

In [127]:
# Process test data
df_test_p = []
for d in df_test:
    # remove zero labels
    df_p = d[d['label']>0]
#     print('number of rows containing 0 label: ',d.shape[0]-df_p.shape[0])
    df_p = create_features(d.copy())
    df_test_p.append(df_p)

In [128]:
scores = []
for i,d in enumerate(df_test_p):
    X_test_i,y_test_i = d[X_cols], d['label']
    score_i = (i,clf.score(X_test_i,y_test_i))
    scores.append(score_i)
    print("Participant",i,"\t Accuracy: %0.2f " % (score_i[1]))

Participant 0 	 Accuracy: 1.00 
Participant 1 	 Accuracy: 0.93 
Participant 2 	 Accuracy: 0.91 
Participant 3 	 Accuracy: 0.95 
Participant 4 	 Accuracy: 0.98 
Participant 5 	 Accuracy: 0.92 
Participant 6 	 Accuracy: 0.97 
Participant 7 	 Accuracy: 0.96 
Participant 8 	 Accuracy: 0.98 
Participant 9 	 Accuracy: 0.95 
Participant 10 	 Accuracy: 0.99 
Participant 11 	 Accuracy: 0.98 
Participant 12 	 Accuracy: 1.00 
Participant 13 	 Accuracy: 0.98 
Participant 14 	 Accuracy: 0.97 


In [129]:
# Mean Accuracy (ignoring the first one)
def compute_mean(a): return sum(a)/len(a)
vals = [v[1] for v in scores if v[0]!=0]
print(compute_mean(vals)) 

0.961934111028
