In [8]:
import db.knowhere_db as kdb
import pipeline.pipeline as pipeline
import pandas as pd
import numpy as np
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier

In [9]:
reader = kdb.Reader('knowhere')

In [13]:
glen_H_data_raw = reader.get_dataframe_pivoted(collection='iphone_test3', username='glen', commute=True)
glen_H_data_raw

Unnamed: 0,GPS Horizontal Accuracy,GPS Longitude,GPS Vertical Accuracy,Altimeter (Barometer) Pressure,GPS Latitude,GPS Altitude,Altimeter (Barometer) Relative Altitude,Magnetometer z,Magnetometer y,Magnetometer x,...,Gravity y,Gravity x,Gyrometer y,Gyrometer z,Acceleration z,Acceleration x,Acceleration y,Gyrometer x,Microphone Left Channel Level,Microphone Right Channel Level
2017-03-20 07:33:32,65.0,-73.753674,10.0,,40.736696,49.925198,,,,,...,,,,,,,,,,
2017-03-20 07:33:33,65.0,-73.753620,10.0,101.280930,40.736642,49.617031,0.000000,-448.443573,138.553955,-53.052246,...,-0.819343,0.069859,-0.212767,0.489268,-1.372254,-0.152821,0.563072,-1.204461,,
2017-03-20 07:33:34,,,,101.281799,,,-0.072419,,,,...,,,,,,,,,,
2017-03-20 07:33:35,,,,101.282677,,,-0.145844,,,,...,,,,,,,,,,
2017-03-20 07:33:36,,,,101.284218,,,-0.274084,,,,...,,,,,,,,,,
2017-03-20 07:33:37,50.0,-73.753751,8.0,101.285233,40.736369,45.059113,-0.358571,,,,...,-0.970466,-0.240759,,,-0.714468,0.377050,0.415137,,,
2017-03-20 07:33:38,10.0,-73.753784,3.0,101.286613,40.736889,48.031158,-0.473734,,,,...,,,-0.004173,-0.025536,,,,-0.010636,-2.53365,-2.53365
2017-03-20 07:33:39,10.0,-73.753771,3.0,,40.736817,48.538971,,,,,...,,,,,,,,,,
2017-03-20 07:33:40,10.0,-73.753745,3.0,101.287239,40.736894,48.434418,-0.525531,-439.615295,118.981277,-50.648590,...,,,,,,,,,,
2017-03-20 07:33:41,10.0,-73.753743,3.0,101.287621,40.736893,48.664825,-0.556710,,,,...,,,,,,,,,,


In [14]:
glen_H_data = glen_H_data_raw[['Acceleration x','Acceleration y','Acceleration z', 'Altimeter (Barometer) Pressure',\
                               'Microphone Left Channel Level', 'Microphone Right Channel Level','Magnetometer x',\
                               'Magnetometer y','Magnetometer z','Gyrometer x','Gyrometer y',\
                               'Gyrometer z']]
glen_H_data = glen_H_data.dropna().astype(float)

In [15]:
glen_H_data['Acceleration'] =  np.sqrt(glen_H_data['Acceleration x']**2 + glen_H_data['Acceleration y']**2 +\
                                       glen_H_data['Acceleration z']**2)
glen_H_data['Magnetometer'] =  np.sqrt(glen_H_data['Magnetometer x']**2 + glen_H_data['Magnetometer y']**2 +\
                                       glen_H_data['Magnetometer z']**2)
glen_H_data['Gyrometer']    =  np.sqrt(glen_H_data['Gyrometer x']**2 + glen_H_data['Gyrometer y']**2 +\
                                       glen_H_data['Gyrometer z']**2)
glen_H_data['Microphone']    =  (glen_H_data['Microphone Left Channel Level'] +\
                                glen_H_data['Microphone Right Channel Level'])/2
glen_H_data = glen_H_data[['Acceleration','Magnetometer','Gyrometer','Microphone','Altimeter (Barometer) Pressure']]

In [16]:
# set window
window = 5
# Rolling Means
glen_H_data['RollingMeanAcceleration'] = pd.rolling_mean(glen_H_data['Acceleration'], window)
glen_H_data['RollingMeanMagnetometer'] = pd.rolling_mean(glen_H_data['Magnetometer'], window)
glen_H_data['RollingMeanGyrometer'] = pd.rolling_mean(glen_H_data['Gyrometer'], window)
glen_H_data['RollingMeanMicrophone'] = pd.rolling_mean(glen_H_data['Microphone'], window)
glen_H_data['RollingMeanAltimeter'] = pd.rolling_mean(glen_H_data['Altimeter (Barometer) Pressure'], window)
# Rolling 75th percentile
glen_H_data['Rolling75thAcceleration'] = pd.rolling_quantile(glen_H_data['Acceleration'], window, 0.75)
glen_H_data['Rolling75thMagnetometer'] = pd.rolling_quantile(glen_H_data['Magnetometer'], window, 0.75)
glen_H_data['Rolling75thGyrometer'] = pd.rolling_quantile(glen_H_data['Gyrometer'], window, 0.75)
glen_H_data['Rolling75thMicrophone'] = pd.rolling_quantile(glen_H_data['Microphone'], window, 0.75)
glen_H_data['Rolling75thAltimeter'] = pd.rolling_quantile(glen_H_data['Altimeter (Barometer) Pressure'], window, 0.75)
# Rolling Max
glen_H_data['RollingMaxAcceleration'] = pd.rolling_max(glen_H_data['Acceleration'], window)
glen_H_data['RollingMaxMagnetometer'] = pd.rolling_max(glen_H_data['Magnetometer'], window)
glen_H_data['RollingMaxGyrometer'] = pd.rolling_max(glen_H_data['Gyrometer'], window)
glen_H_data['RollingMaxMicrophone'] = pd.rolling_max(glen_H_data['Microphone'], window)
glen_H_data['RollingMaxAltimeter'] = pd.rolling_max(glen_H_data['Altimeter (Barometer) Pressure'], window)
# Rolling Min
glen_H_data['RollingMinAcceleration'] = pd.rolling_min(glen_H_data['Acceleration'], window)
glen_H_data['RollingMinMagnetometer'] = pd.rolling_min(glen_H_data['Magnetometer'], window)
glen_H_data['RollingMinGyrometer'] = pd.rolling_min(glen_H_data['Gyrometer'], window)
glen_H_data['RollingMinMicrophone'] = pd.rolling_min(glen_H_data['Microphone'], window)
glen_H_data['RollingMinAltimeter'] = pd.rolling_min(glen_H_data['Altimeter (Barometer) Pressure'], window)
# Rolling st dev
glen_H_data['RollingSDAcceleration'] = pd.rolling_std(glen_H_data['Acceleration'], window)
glen_H_data['RollingSDMagnetometer'] = pd.rolling_std(glen_H_data['Magnetometer'], window)
glen_H_data['RollingSDGyrometer'] = pd.rolling_std(glen_H_data['Gyrometer'], window)
glen_H_data['RollingSDMicrophone'] = pd.rolling_std(glen_H_data['Microphone'], window)
glen_H_data['RollingSDAltimeter'] = pd.rolling_std(glen_H_data['Altimeter (Barometer) Pressure'], window)
# Rolling var
glen_H_data['RollingVarAcceleration'] = pd.rolling_var(glen_H_data['Acceleration'], window)
glen_H_data['RollingVarMagnetometer'] = pd.rolling_var(glen_H_data['Magnetometer'], window)
glen_H_data['RollingVarGyrometer'] = pd.rolling_var(glen_H_data['Gyrometer'], window)
glen_H_data['RollingVarMicrophone'] = pd.rolling_var(glen_H_data['Microphone'], window)
glen_H_data['RollingVarAltimeter'] = pd.rolling_var(glen_H_data['Altimeter (Barometer) Pressure'], window)

	Series.rolling(window=5,center=False).mean()
	Series.rolling(window=5,center=False).mean()
	Series.rolling(window=5,center=False).mean()
	Series.rolling(window=5,center=False).mean()
	Series.rolling(window=5,center=False).mean()
	Series.rolling(window=5,center=False).quantile(quantile=0.75)
	Series.rolling(window=5,center=False).quantile(quantile=0.75)
	Series.rolling(window=5,center=False).quantile(quantile=0.75)
	Series.rolling(window=5,center=False).quantile(quantile=0.75)
	Series.rolling(window=5,center=False).quantile(quantile=0.75)
	Series.rolling(window=5,center=False).max()
	Series.rolling(window=5,center=False).max()
	Series.rolling(window=5,center=False).max()
	Series.rolling(window=5,center=False).max()
	Series.rolling(window=5,center=False).max()
	Series.rolling(window=5,center=False).min()
	Series.rolling(window=5,center=False).min()
	Series.rolling(window=5,center=False).min()
	Series.rolling(window=5,center=False).min()
	Series.rolling(window=5,center=False).min()
	Seri

In [18]:
glen_H_data = glen_H_data.dropna()

In [8]:
# load dataset
dataset = dataframe.values
X = dataset[:,0:35].astype(float)
Y = dataset[:,36]
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# One Hot Encode
lb = LabelBinarizer()
lb.fit(encoded_Y)
dummy_y = lb.fit_transform(encoded_Y)

In [None]:
# build a classifier
RF_Class = RandomForestClassifier(n_estimators=100)
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
param_dist = {"max_depth": [2, 3, None],
              "max_features": sp_randint(1,3),
              "min_samples_split": sp_randint(2, 3),
              "min_samples_leaf": sp_randint(1, 3),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 10
random_search = RandomizedSearchCV(RF_Class, param_distributions=param_dist, n_iter=n_iter_search)

start = time()
random_search.fit(X, dummy_y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

In [None]:
# logistic regression
logreg = sklearn.linear_model.ElasticNetCV(l1_ratio = [0, .25, .5, .75, 1.], cv=3)
logreg.fit(X, dummy_y)

In [None]:
# build a classifier
ET_Class = ExtraTreesClassifier(n_estimators=100)
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
param_dist = {"max_depth": [2, 3, None],
              "max_features": sp_randint(1,3),
              "min_samples_split": sp_randint(2, 3),
              "min_samples_leaf": sp_randint(1, 3),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 10
random_search = RandomizedSearchCV(ET_Class, param_distributions=param_dist, n_iter=n_iter_search)

start = time()
random_search.fit(X, dummy_y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

In [None]:
# Do the Voting Classifier
VCM = VotingClassifier(estimators=[('RF_Class', RF_Class), ('logreg', logreg), ('ET_Class', ET_Class)], voting='hard')
VCM.fit(X, dummy_y)
VCM.fit_transform(X, dummy_y)