In [71]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.grid_search import GridSearchCV
from sklearn import metrics

In [51]:
# Jupyter viewing options
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
pd.set_option('display.max_columns', 500)

In [4]:
df = pd.read_csv("W207_Group_Project/ob_data_w207_filtered.csv")

In [28]:
print("Column count:", len(df.columns))
print("Column names:", df.columns)
print()
print("clean label:",df.exercise_clean.unique())
print("clean label2:", df.exercise_clean2.unique())
print("dataset size:", len(df))

Column count: 45
Column names: Index(['_id', 'deleted', 'endTime', 'exercise', 'initialStartTime', 'metric',
       'removed_set', 'rpe', 'setID', 'setNumber', 'startTime', 'tags',
       'userID', 'videoFileURL', 'videoType', 'weight', 'workoutID',
       'RepCount', 'isValid', 'removed_rep', 'hardware', 'appVersion',
       'deviceName', 'deviceIdentifier', 'time', 'StartMessg', 'RepN',
       'AvgVel', 'ROM', 'PeakVel', 'PeakVelLoc', 'StartData', 'RepDur',
       'TimeBWReps', 'TimeRepComp', 'TimeRepWait', 'SlowAllow', 'Backlight',
       'MinAllow', 'PeakAccel', '_merge', 'rpe_num', 'weight_lbs',
       'exercise_clean', 'exercise_clean2'],
      dtype='object')

clean label: ['bench' 'squat other' 'squat' 'deadlift' 'bench other' 'deadlift other']
clean label2: ['bench' 'bench other' 'squat other' 'squat' 'deadlift' 'deadlift other']
dataset size: 292910


In [48]:
df.head(1)

Unnamed: 0,_id,deleted,endTime,exercise,initialStartTime,metric,removed_set,rpe,setID,setNumber,startTime,tags,userID,videoFileURL,videoType,weight,workoutID,RepCount,isValid,removed_rep,hardware,appVersion,deviceName,deviceIdentifier,time,StartMessg,RepN,AvgVel,ROM,PeakVel,PeakVelLoc,StartData,RepDur,TimeBWReps,TimeRepComp,TimeRepWait,SlowAllow,Backlight,MinAllow,PeakAccel,_merge,rpe_num,weight_lbs,exercise_clean,exercise_clean2
0,000051e0-54b2-48d8-98d2-79f65505c3e6,,,Bench,2018-01-03 01:29:46.904,kgs,0.0,7.5,000051e0-54b2-48d8-98d2-79f65505c3e6,3.0,,[],5a2e998b05cbc9b8ef26ac7e,assets-library://asset/asset.mov?id=CFFFFAC8-0...,lift,87.5,5bac5444-7c37-49d0-be8e-32be1a09fbb1,0.0,True,False,ios,3.2.3,OB 6666,709A15B7-B43C-AF0E-953A-6D4CEFEB4C83,2018-01-03 01:29:46.902,-3456.0,11.0,0.25846,252.0,0.308408,37.0,,978699.0,0.0,1752134000.0,0.0,130000.0,10000.0,150000.0,11.07361,both,7.5,192.90425,bench,bench


In [58]:
df['rpe'] = pd.to_numeric(df['rpe'], errors='coerce')  # imports as a string

# subset of columns that are likely to be interesting / predictive
features = df[['rpe', 'RepCount', 'AvgVel', 'ROM', 'PeakVel', 'PeakVelLoc', 'RepDur', 'TimeBWReps', 'TimeRepComp', 'TimeRepWait', 'PeakAccel', 'weight_lbs']]
labels = df['exercise_clean']

In [59]:
train_data = features[:200000].copy()
train_labels = labels[:200000].copy()
test_data = features[200000:].copy()
test_labels = labels[200000:].copy()

print(train_data.shape)
print(train_labels.shape)
print(test_data.shape)
print(test_labels.shape)

(200000, 12)
(200000,)
(92910, 12)
(92910,)


In [61]:
# too many NaN
# fill with column mean
# move this to the pipeline?   sklearn Imputer

train_data.fillna(train_data.mean(),inplace=True)
test_data.fillna(test_data.mean(), inplace=True)

In [72]:
pipeline = Pipeline(steps=[
    ('scale', preprocessing.StandardScaler()),
    ('PCA', PCA()),
    ('randomforest', RandomForestClassifier())])

parameters = {
    "PCA__n_components":[2,4,6],
    "randomforest__n_estimators":[2,4,6,8],
}

# scoring method - F1
f1 = metrics.make_scorer(metrics.f1_score, average="micro")

In [69]:
pipeline.fit(train_data, train_labels)
pipeline.score(test_data, test_labels)

0.7324507587988376

In [74]:
grid_search = GridSearchCV(pipeline, parameters, scoring=f1)
grid_search.fit(train_data, train_labels)
grid_search.score(test_data, test_labels)

0.6693897319987084