## Actitracker labeled dataset NN model
This model will use keras to predict activity from sessionized accelerometer data, via the actitracker dataset (http://www.cis.fordham.edu/wisdm/dataset.php). 

### Load libraries

In [109]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy as sp
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler

### Data load

In [139]:
DATA_FOLDER = 'Data/WISDM_ar_v1.1/'
DATA_FILE = 'raw_data_fixed.txt'
actitracker = pd.read_csv(
    DATA_FOLDER+DATA_FILE ,
    sep=',' ,
    lineterminator=';' ,
    header=None ,
)
actitracker.columns = [
    'user' ,
    'activity' ,
    'timestamp' ,
    'x-accel' ,
    'y-accel' ,
    'z-accel' ,
    'NA' ,
]
del actitracker['NA']
accel_cols = ['x-accel','y-accel','z-accel']

### Sessionize data

In [140]:
# re-calculate time in seconds
actitracker['time_seconds'] = actitracker['timestamp']*10e-9

# sort by user and time 
actitracker = actitracker.sort_values(by=['user','time_seconds'])
actitracker['seq'] = xrange(actitracker.shape[0])
actitracker['session'] = actitracker.groupby(['user','activity'])['seq'].apply(lambda x: x%500 == 0).fillna(0).cumsum()

### Gather labels

In [141]:
# get session_labels 
ohe = OneHotEncoder(sparse=False); le = LabelEncoder()
labels = actitracker.groupby(['user','session'])['activity'].apply(lambda x: max(x))
ohe_labels = ohe.fit_transform(le.fit_transform(labels).reshape(-1,1))
Y = pd.DataFrame(ohe_labels,index=labels.index)

### Calculate features 

In [142]:
# group by user and session
accel_cols = ['x-accel','y-accel','z-accel']
g = actitracker.loc[:,accel_cols+['user','session']].groupby(['user','session'])

# IQR function
def iqr(x):
    ''' calculate IQR from array
    '''
    q75, q25 = np.percentile(x, [75,25])
    return q75-q25

# calculate model cols 
means = g[accel_cols].apply(lambda x: np.mean(x))
sds = g[accel_cols].apply(lambda x: np.std(x))
median_1 = g[accel_cols[0]].apply(lambda x: np.median(x))
median_2 = g[accel_cols[1]].apply(lambda x: np.median(x))
median_3 = g[accel_cols[2]].apply(lambda x: np.median(x))
iqr_1 = g[accel_cols[0]].apply(lambda x: iqr(x))
iqr_2 = g[accel_cols[1]].apply(lambda x: iqr(x))
iqr_3 = g[accel_cols[2]].apply(lambda x: iqr(x))
mins = g[accel_cols].apply(lambda x: np.min(x))
maxs = g[accel_cols].apply(lambda x: np.max(x))
kurtosis_1 = g[accel_cols[0]].apply(lambda x: sp.stats.kurtosis(x))
kurtosis_2 = g[accel_cols[1]].apply(lambda x: sp.stats.kurtosis(x))
kurtosis_3 = g[accel_cols[2]].apply(lambda x: sp.stats.kurtosis(x))
skew_1 = g[accel_cols[0]].apply(lambda x: sp.stats.skew(x))
skew_2 = g[accel_cols[1]].apply(lambda x: sp.stats.skew(x))
skew_3 = g[accel_cols[2]].apply(lambda x: sp.stats.skew(x))
percentiles = []
for i in range(10,100,10):
    for e in range(1,4):
        percentiles.append(eval('g[accel_cols['+str(e-1)+']].apply(lambda x: sp.percentile(x,'+str(i)+'))'))

# concat columns
X = pd.concat([means,
               sds,
               median_1,
               median_2,
               median_3,
               iqr_1,
               iqr_2,
               iqr_3,
               mins,
               maxs,
               kurtosis_1,
               kurtosis_2,
               kurtosis_3,
               skew_1,
               skew_2,
               skew_3,
              ]+percentiles
              ,axis=1)

### MLP Neural Network Model

In [132]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

#### Compile model

In [147]:
model = Sequential()

indim = X.shape[1]
model.add(Dense(512*8, input_dim=indim, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(512*12, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(6, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

#### Execute model

In [148]:
X_train, X_test, Y_train, Y_test = train_test_split(X.as_matrix(), Y.as_matrix(), test_size=0.33, random_state=22)
model.fit(X_train, Y_train,
          nb_epoch=20,
          batch_size=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x110b74b90>

In [149]:
predictions = model.predict(X_test)
print 'Accuracy: {}'.format(accuracy_score(np.argmax(Y_test, axis=1), np.argmax(predictions,axis=1)) )
print 'Log-loss: {}'.format(log_loss(Y_test, predictions))

Accuracy: 0.841248303935
Log-loss: 0.597517882918
