In [32]:
import config 
import glob
import pandas as pd

In [34]:
# modules used for easier display of data
from IPython.display import display
from IPython.core.display import HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [20]:
# pandas display formatting
pd.options.display.float_format = '{:g}'.format

### Load Data

In [2]:
!ls {config.data_dir}

1.csv  11.csv 13.csv 15.csv 3.csv  5.csv  7.csv  9.csv
10.csv 12.csv 14.csv 2.csv  4.csv  6.csv  8.csv  README


In [50]:
data_files = glob.glob(config.data_dir+'*.csv')
print("Number of files: ",len(data_files))
dfs = pd.concat([pd.read_csv(f,names=config.cols) for f in data_files])
print(dfs.shape)

Number of files:  15
(1926896, 5)


In [38]:
dfs.head(3)

Unnamed: 0,sequential_number,x_acceleration,y_acceleration,z_acceleration,label
0,0,1502,2215,2153,1
1,1,1667,2072,2047,1
2,2,1611,1957,1906,1


### Preprocess Data

In [51]:
dfs[['x_acceleration','y_acceleration','z_acceleration']].describe()

Unnamed: 0,x_acceleration,y_acceleration,z_acceleration
count,1926900.0,1926900.0,1926900.0
mean,1987.65,2382.52,1970.6
std,111.358,100.315,94.4589
min,282.0,2.0,1.0
25%,1904.0,2337.0,1918.0
50%,1992.0,2367.0,1988.0
75%,2076.0,2413.0,2032.0
max,3828.0,4095.0,4095.0


#### Notes:
- Sensor values range between 1-4095, which correspond to a 12-bit sensor reading, from lowest to highest. 

- We will create the following features:
    - rate-of-change in each direction

In [52]:
#  compute rate-of-change for each degree-of-freedom
dfs['x2'] = dfs['x_acceleration'].diff()
dfs['y2'] = dfs['y_acceleration'].diff()
dfs['z2'] = dfs['z_acceleration'].diff()
dfs['x3'] = dfs['x2'].diff()
dfs['y3'] = dfs['y2'].diff()
dfs['z3'] = dfs['z2'].diff()
display(dfs.head(2))
print(dfs.shape[0])

Unnamed: 0,sequential_number,x_acceleration,y_acceleration,z_acceleration,label,x2,y2,z2,x3,y3,z3
0,0,1502,2215,2153,1,,,,,,
1,1,1667,2072,2047,1,165.0,-143.0,-106.0,,,


1926896


In [53]:
# we will remove the first row since it contains nan, after our previous calc
data = dfs.copy().dropna()
X = data[['x_acceleration','y_acceleration','z_acceleration','x2','y2','z2','x3','y3','z3']]
y = data['label']

In [54]:
print(data.shape)
data.head(2)

(1926894, 11)


Unnamed: 0,sequential_number,x_acceleration,y_acceleration,z_acceleration,label,x2,y2,z2,x3,y3,z3
2,2,1611,1957,1906,1,-56,-115,-141,-221,28,-35
3,3,1601,1939,1831,1,-10,-18,-75,46,97,66


### ML: AdaBoost

In [40]:
from sklearn.ensemble import  AdaBoostClassifier

In [41]:
# default estimator is DecisionTreeClassifier for AdaBoost
clf = AdaBoostClassifier(base_estimator=None,n_estimators=50,
                         learning_rate=1.0,algorithm='SAMME.R',
                        random_state=None)

In [55]:
%%time
clf.fit(X,y)

CPU times: user 2min 46s, sys: 9.06 s, total: 2min 55s
Wall time: 3min


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [46]:
# This may be overfitting
clf.score(X,y)

0.61168356345312014

#### Cross-Validation

In [56]:
from sklearn.model_selection import cross_val_score

In [57]:
scores = cross_val_score(clf, X, y, cv=5, n_jobs =-1)
scores

array([ 0.18256479,  0.52384264,  0.51150034,  0.55083723,  0.29644891])

In [58]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.41 (+/- 0.29)


### ML: RandomForest

In [59]:
from sklearn.ensemble import  RandomForestClassifier

In [67]:
# default estimator is DecisionTreeClassifier for AdaBoost
clf_rf = RandomForestClassifier(n_jobs=-1,n_estimators=10,random_state=2)

In [68]:
%%time
clf_rf.fit(X,y)

CPU times: user 2min 13s, sys: 2.48 s, total: 2min 16s
Wall time: 43.3 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=2,
            verbose=0, warm_start=False)

In [69]:
# This may be overfitting
clf_rf.score(X,y)

0.98984842964895836

#### Cross-Validation

In [70]:
from sklearn.model_selection import cross_val_score

In [71]:
rf_scores = cross_val_score(clf_rf, X, y, cv=5, n_jobs =-1)
rf_scores

array([ 0.3034763 ,  0.39947481,  0.3040246 ,  0.48200593,  0.29639442])

In [72]:
print("Accuracy: %0.2f (+/- %0.2f)" % (rf_scores.mean(), rf_scores.std() * 2))

Accuracy: 0.36 (+/- 0.15)


In [73]:
clf_rf.feature_importances_

array([ 0.19284607,  0.15819454,  0.2262735 ,  0.08118543,  0.0995663 ,
        0.07141578,  0.05470633,  0.06203515,  0.05377688])

In [79]:
import numpy as np

print("Feature ranking:")
importances = clf_rf.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

Feature ranking:
1. feature 2 (0.226274)
2. feature 0 (0.192846)
3. feature 1 (0.158195)
4. feature 4 (0.099566)
5. feature 3 (0.081185)
6. feature 5 (0.071416)
7. feature 7 (0.062035)
8. feature 6 (0.054706)
9. feature 8 (0.053777)


#### Notes:
- seems like rate-of-change of acceleration is highly ranked as a feature, but any further rate-of-changes calcs do not significantly affect the model