In [24]:
import numpy as np
import pandas as pd
import sklearn.cross_validation as cv
import sklearn.metrics as metrics

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the two lookup tables that ship with the cleaned dataset; the first CSV
# column of each file becomes the frame's index.
activities = pd.read_csv('ucihar/clean_labels.csv', index_col=0)
readings = pd.read_csv('ucihar/cleaned_features.csv', index_col=0)

# Feature names as a plain array; later cells pass it as `names=` when reading X.
column_names = readings.loc[:, 'reading'].values

In [3]:
# Load the training split: feature matrix, activity labels, and subject ids.
# None of the files has a header row, so column names are supplied explicitly.
X_train = pd.read_csv('ucihar/train/X_train.csv', header=None, names=column_names)
y_train = pd.read_table('ucihar/train/y_train.txt', header=None, names=['activity'])
subj_train = pd.read_table('ucihar/train/subject_train.txt', header=None, names=['subject'])

# Shift the default 0-based row index to start at 1
# (original note: "match up the index with readings").
for train_frame in (X_train, y_train, subj_train):
    train_frame.index += 1

# Disabled: join the activity descriptions onto the labels and features.
# activities_train = y_train.join(activities, on='activity', lsuffix='_number', rsuffix='_desc')
# df_train         = X_train.join(activities_train)

In [22]:
# Read testing data: feature matrix, activity labels, and subject ids.
# None of the files has a header row, so column names are supplied explicitly.
X_test           = pd.read_csv('ucihar/test/X_test.csv', header=None, names=column_names)
y_test           = pd.read_table('ucihar/test/y_test.txt', header=None, names=['activity'])
# Fix: this previously re-read 'ucihar/train/subject_train.txt', so subj_test held
# the training subjects instead of the test ones. The path below mirrors the
# naming of the other test-split files.
subj_test        = pd.read_table('ucihar/test/subject_test.txt', header=None, names=['subject'])
X_test.index    += 1 # match up the index with readings
y_test.index    += 1 # match up the index with readings
subj_test.index += 1 # match up the index with readings

# Disabled: join the activity descriptions onto the labels and features.
# activities_test = y_test.join(activities, on='activity', lsuffix='_L', rsuffix='_R')
# df_test         = X_test.join(activities_test)

In [8]:
# Fit a 500-tree random forest on the training features to predict the activity label.
# random_state pins the forest's randomness so that a Restart & Run All reproduces
# the same feature importances and scores; without it every run gives different numbers.
clf = RandomForestClassifier(n_estimators=500, random_state=0)
# fit() returns the estimator itself; assigning it back keeps the cell free of output.
clf = clf.fit(X_train, y_train['activity'])

In [20]:
# Sanity check: the forest should report one importance per named feature,
# so both lengths in the displayed tuple are expected to be equal.
tuple(map(len, (clf.feature_importances_, column_names)))

(561, 561)

In [19]:
# Pair each feature name with the importance the fitted forest assigned to it.
importance_data = dict(name=column_names, importance=clf.feature_importances_)
importances = pd.DataFrame(importance_data, columns=['name', 'importance'])

# Display the ten features with the highest importance, largest first.
importances.sort_values('importance', ascending=False).iloc[:10]

Unnamed: 0,name,importance
40,tGravityAcc_mean_X,0.031922
56,tGravityAcc_energy_X,0.030986
52,tGravityAcc_min_X,0.02941
558,angle(X_gravityMean),0.029327
41,tGravityAcc_mean_Y,0.026968
559,angle(Y_gravityMean),0.024744
50,tGravityAcc_max_Y,0.02292
53,tGravityAcc_min_Y,0.022398
49,tGravityAcc_max_X,0.021746
57,tGravityAcc_energy_Y,0.017397


In [23]:
# Mean accuracy of the fitted forest on the test split.
test_accuracy = clf.score(X_test, y_test['activity'])
test_accuracy

0.92840176450627754

In [30]:
# 10-fold cross-validated weighted precision and recall, computed on the test split.
# NOTE(review): cross_val_score works on clones of clf that it refits on folds of
# X_test, so these numbers do not describe the forest fitted on X_train — confirm
# that this is the intended evaluation.
# NOTE(review): sklearn.cross_validation (imported as cv) was removed in
# scikit-learn 0.20; sklearn.model_selection provides cross_val_score there.
test_labels = y_test['activity']
precision, recall = (
    cv.cross_val_score(clf, X_test, test_labels, scoring=scoring_name, cv=10)
    for scoring_name in ('precision_weighted', 'recall_weighted')
)
'precision {} recall {}'.format(precision, recall)

'precision [ 0.93581159  0.94565779  0.88905973  0.90296581  0.98328983  0.93299219\n  0.93915961  0.9277514   0.92619662  0.9835007 ] recall [ 0.9295302   0.93243243  0.89830508  0.88813559  0.98644068  0.93559322\n  0.91496599  0.90784983  0.90784983  0.98634812]'