In [14]:
import db.knowhere_db as kdb
import pipeline.pipeline as pipeline
import pandas as pd
import numpy as np
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from mlxtend.classifier import StackingClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [15]:
reader = kdb.Reader('knowhere')

In [131]:
glen_H_data_raw = reader.get_dataframe_pivoted(collection='iphone_test3', username='glen', commute=True,\
                                               min_date='2017-3-20', max_date='2017-03-23')

In [132]:
glen_3_20_morning = [("7:33:22", "D"),("7:41:55", "W"),("7:49:01", "D"),("7:52:45","W"),("7:55:45", "S_D"),
                    ("7:55:53", "P_W"),("7:56:53", "P_S"),("8:00:23", "T_B"), ("8:16:15", "T_T"),("8:21:05", "T_B"),
                    ("8:30:30", "T_E"),("8:31:15", "S_U"),("8:31:20","W"),("8:32:05", "S_U"),("8:32:10","W"),
                    ("8:32:35", "S_U"),("8:32:50", "W"),("8:36:40", "E_U"),("8:37:50", "W"),("8:30:10", "W")]

In [133]:
glen_3_20_evening = [("20:05:55","W"),("20:06:30","E_D"),("20:07:05","W"),("20:13:00","S"),("20:27:25","T_D"),
                    ("20:36:50","T_S"),("20:38:14","T_D"),("20:44:10","T_S"),("20:44:10","T_S"),("20:45:15","T_D"),
                    ("20:46:51","T_S"),("20:47:50","T_D"),("20:49:15","T_S"),("20:50:50","T_D"),("20:51:15","T_S"),
                     ("20:54:04","T_D"),("20:53:59","T_E"),("20:54:00","W"),("20:55:05","W"),("20:57:05","I_C"),
                     ("21:08:45","W")]

In [134]:
glen_3_21_morning = [("7:33:10",  "W"),("7:34:15","I_C"),("7:48:15", "W"),("7:50:30","I_C"),
                     ("7:55:30",  "W"),("8:01:05","T_D"),("8:17:15","T_S"),("8:20:15","T_D"),
                     ("8:28:20","T_E"),("8:28:20",  "W"),("8:35:30","E_U"),("8:35:50", "W")]

In [135]:
glen_3_21_evening = [("19:00:00", "W"), ("19:08:05", "S"), ("19:15:27", "T_D"), ("19:24:05", "T_S"),
                    ("19:24:50", "T_D"), ("19:32:39", "T_S"), ("19:33:49", "T_D"), ("19:35:28", "T_S"),
                    ("19:36:10", "T_D"), ("19:37:44", "T_S"), ("19:38:38", "T_D"), ("19:39:59", "T_S"),
                    ("19:40:36", "T_D"), ("19:42:21", "T_S"), ("19:42:26", "W"), ("19:48:27", "IC"),
                     ("19:59:55", "W")]

In [136]:
glen_3_22_morning = [("7:40:20", "D"),("7:41:55", "IC"),("7:52:52", "W"),("7:55:18","S"),("8:01:20", "T_D"),
                    ("8:14:00", "T_T"),("8:17:00", "T_D"),("8:25:59", "T_E"), ("8:26:04", "W"),("8:33:27", "E_U"),
                    ("8:34:31", "W")]

In [137]:
glen_3_22_evening = [("20:29:15", "W"),("20:32:40", "E_D"),("20:33:15","W"),("20:49:16","T_D"),("20:58:37","T_S"),
                    ("20:59:05","T_D"),("21:06:08","T_S"),("21:07:03","T_D"),("21:08:50","T_S"),("21:09:28","T_D"),
                    ("21:10:48","T_S"),("21:11:21","T_D"),("21:12:50","T_S"),("21:13:24","T_D"),("21:15:09","T_S"),
                    ("21:16:35", "C"), ("21:32:45", "W")]

In [138]:
glen_3_23_morning = [("07:22:41","W"), ("8:01:15","T_D"), ("8:13:31", "T_S"), ("8:17:44", "T_D"),
                    ("8:27:46", "T_E"), ("8:27:51", "W"), ("8:33:29", "E_U"), ("8:34:05", "W")]

In [139]:
glen_commute_labels = {'2017-3-20': glen_3_20_morning + glen_3_20_evening, '2017-3-21': glen_3_21_morning +\
                       glen_3_21_evening, '2017-3-22': glen_3_22_morning + glen_3_22_evening, '2017-3-23':\
                       glen_3_23_morning}

In [144]:
def add_acceleration_magnitude(df):
    "Adds Acceleration magnitude to a dataframe. First 3 columns must be acceleration vectors. Must be 3D."
    from math import sqrt

    accel_mag = []
    
    for row in range(len(df)):
        accel_mag.append(sqrt(df.iloc[row, 0]**2 + df.iloc[row, 1]**2 + df.iloc[row, 2]**2))

    df['accel_mag'] = accel_mag
    return 'finished!'

def rel_time_to_real_time(df, day, start_time):
    from pandas.tseries.offsets import DateOffset
    df.index = [pd.Timestamp(day + " " + start_time) + DateOffset(seconds=x) for x in df.index]
    return "Finished!"

def add_classifications(df, day, events):    
    '''Pass events as a list of tuples (datetime, eventcode) to classify different events'''
    for days_events in events.iterkeys():
        if pd.to_datetime(day) == pd.to_datetime(days_events):
            for event in events[days_events]:
                df.loc[day + " " + event[0], 'classification'] = event[1]

    df = df.fillna(method='ffill', inplace = True)
    
    return "Finished!"

def label_raw_iphone(df, day, events, username, accel_mag = False):
    df.index = pd.to_datetime(df.index)
    if accel_mag == True:
        add_acceleration_magnitude(df)
    df['classification'] = None
    add_classifications(df, day, events)
    df.to_csv("Labeled_" + username + day)
    return df

In [150]:
label_raw_iphone(glen_H_data_raw, "2017-03-22", glen_commute_labels , username = 'glen')

Unnamed: 0,GPS Horizontal Accuracy,GPS Longitude,GPS Vertical Accuracy,Altimeter (Barometer) Pressure,GPS Latitude,GPS Altitude,Altimeter (Barometer) Relative Altitude,Magnetometer z,Magnetometer y,Magnetometer x,...,Gravity x,Gyrometer y,Gyrometer z,Acceleration z,Acceleration x,Acceleration y,Gyrometer x,Microphone Left Channel Level,Microphone Right Channel Level,classification
2017-03-20 07:33:32,65.0,-73.753674,10.0,,40.736696,49.925198,,,,,...,,,,,,,,,,
2017-03-20 07:33:33,65.0,-73.753620,10.0,101.280930,40.736642,49.617031,0.000000,-448.443573,138.553955,-53.052246,...,0.069859,-0.212767,0.489268,-1.372254,-0.152821,0.563072,-1.204461,,,
2017-03-20 07:33:34,65.0,-73.753620,10.0,101.281799,40.736642,49.617031,-0.072419,-448.443573,138.553955,-53.052246,...,0.069859,-0.212767,0.489268,-1.372254,-0.152821,0.563072,-1.204461,,,
2017-03-20 07:33:35,65.0,-73.753620,10.0,101.282677,40.736642,49.617031,-0.145844,-448.443573,138.553955,-53.052246,...,0.069859,-0.212767,0.489268,-1.372254,-0.152821,0.563072,-1.204461,,,
2017-03-20 07:33:36,65.0,-73.753620,10.0,101.284218,40.736642,49.617031,-0.274084,-448.443573,138.553955,-53.052246,...,0.069859,-0.212767,0.489268,-1.372254,-0.152821,0.563072,-1.204461,,,
2017-03-20 07:33:37,50.0,-73.753751,8.0,101.285233,40.736369,45.059113,-0.358571,-448.443573,138.553955,-53.052246,...,-0.240759,-0.212767,0.489268,-0.714468,0.377050,0.415137,-1.204461,,,
2017-03-20 07:33:38,10.0,-73.753784,3.0,101.286613,40.736889,48.031158,-0.473734,-448.443573,138.553955,-53.052246,...,-0.240759,-0.004173,-0.025536,-0.714468,0.377050,0.415137,-0.010636,-2.53365,-2.53365,
2017-03-20 07:33:39,10.0,-73.753771,3.0,101.286613,40.736817,48.538971,-0.473734,-448.443573,138.553955,-53.052246,...,-0.240759,-0.004173,-0.025536,-0.714468,0.377050,0.415137,-0.010636,-2.53365,-2.53365,
2017-03-20 07:33:40,10.0,-73.753745,3.0,101.287239,40.736894,48.434418,-0.525531,-439.615295,118.981277,-50.648590,...,-0.240759,-0.004173,-0.025536,-0.714468,0.377050,0.415137,-0.010636,-2.53365,-2.53365,
2017-03-20 07:33:41,10.0,-73.753743,3.0,101.287621,40.736893,48.664825,-0.556710,-439.615295,118.981277,-50.648590,...,-0.240759,-0.004173,-0.025536,-0.714468,0.377050,0.415137,-0.010636,-2.53365,-2.53365,


In [124]:
glen_H_data = glen_H_data_raw[['Acceleration x','Acceleration y','Acceleration z', 'Altimeter (Barometer) Pressure',\
                               'Microphone Left Channel Level', 'Microphone Right Channel Level','Magnetometer x',\
                               'Magnetometer y','Magnetometer z','Gyrometer x','Gyrometer y',\
                               'Gyrometer z', 'classification']]
glen_H_data = glen_H_data.dropna()
glen_H_data.iloc[:,0:11].astype(float)


Unnamed: 0,Acceleration x,Acceleration y,Acceleration z,Altimeter (Barometer) Pressure,Microphone Left Channel Level,Microphone Right Channel Level,Magnetometer x,Magnetometer y,Magnetometer z,Gyrometer x,Gyrometer y


In [21]:
glen_H_data.groupby("classification").size()

classification
D       559
E_U      70
P_S     210
P_W      60
S_D       8
S_U      25
T_B    1494
T_E      45
T_T     290
W      4725
dtype: int64

In [22]:
glen_H_data['Acceleration'] =  np.sqrt(glen_H_data['Acceleration x']**2 + glen_H_data['Acceleration y']**2 +\
                                       glen_H_data['Acceleration z']**2)
glen_H_data['Magnetometer'] =  np.sqrt(glen_H_data['Magnetometer x']**2 + glen_H_data['Magnetometer y']**2 +\
                                       glen_H_data['Magnetometer z']**2)
glen_H_data['Gyrometer']    =  np.sqrt(glen_H_data['Gyrometer x']**2 + glen_H_data['Gyrometer y']**2 +\
                                       glen_H_data['Gyrometer z']**2)
glen_H_data['Microphone']    =  (glen_H_data['Microphone Left Channel Level'] +\
                                glen_H_data['Microphone Right Channel Level'])/2
glen_H_data = glen_H_data[['Acceleration','Magnetometer','Gyrometer','Microphone','Altimeter (Barometer) Pressure', 'classification']]

In [23]:
# set window
window = 5
# Rolling Means
glen_H_data['RollingMeanAcceleration'] = pd.rolling_mean(glen_H_data['Acceleration'], window)
glen_H_data['RollingMeanMagnetometer'] = pd.rolling_mean(glen_H_data['Magnetometer'], window)
glen_H_data['RollingMeanGyrometer'] = pd.rolling_mean(glen_H_data['Gyrometer'], window)
glen_H_data['RollingMeanMicrophone'] = pd.rolling_mean(glen_H_data['Microphone'], window)
glen_H_data['RollingMeanAltimeter'] = pd.rolling_mean(glen_H_data['Altimeter (Barometer) Pressure'], window)
# Rolling 75th percentile
glen_H_data['Rolling75thAcceleration'] = pd.rolling_quantile(glen_H_data['Acceleration'], window, 0.75)
glen_H_data['Rolling75thMagnetometer'] = pd.rolling_quantile(glen_H_data['Magnetometer'], window, 0.75)
glen_H_data['Rolling75thGyrometer'] = pd.rolling_quantile(glen_H_data['Gyrometer'], window, 0.75)
glen_H_data['Rolling75thMicrophone'] = pd.rolling_quantile(glen_H_data['Microphone'], window, 0.75)
glen_H_data['Rolling75thAltimeter'] = pd.rolling_quantile(glen_H_data['Altimeter (Barometer) Pressure'], window, 0.75)
# Rolling Max
glen_H_data['RollingMaxAcceleration'] = pd.rolling_max(glen_H_data['Acceleration'], window)
glen_H_data['RollingMaxMagnetometer'] = pd.rolling_max(glen_H_data['Magnetometer'], window)
glen_H_data['RollingMaxGyrometer'] = pd.rolling_max(glen_H_data['Gyrometer'], window)
glen_H_data['RollingMaxMicrophone'] = pd.rolling_max(glen_H_data['Microphone'], window)
glen_H_data['RollingMaxAltimeter'] = pd.rolling_max(glen_H_data['Altimeter (Barometer) Pressure'], window)
# Rolling Min
glen_H_data['RollingMinAcceleration'] = pd.rolling_min(glen_H_data['Acceleration'], window)
glen_H_data['RollingMinMagnetometer'] = pd.rolling_min(glen_H_data['Magnetometer'], window)
glen_H_data['RollingMinGyrometer'] = pd.rolling_min(glen_H_data['Gyrometer'], window)
glen_H_data['RollingMinMicrophone'] = pd.rolling_min(glen_H_data['Microphone'], window)
glen_H_data['RollingMinAltimeter'] = pd.rolling_min(glen_H_data['Altimeter (Barometer) Pressure'], window)
# Rolling st dev
glen_H_data['RollingSDAcceleration'] = pd.rolling_std(glen_H_data['Acceleration'], window)
glen_H_data['RollingSDMagnetometer'] = pd.rolling_std(glen_H_data['Magnetometer'], window)
glen_H_data['RollingSDGyrometer'] = pd.rolling_std(glen_H_data['Gyrometer'], window)
glen_H_data['RollingSDMicrophone'] = pd.rolling_std(glen_H_data['Microphone'], window)
glen_H_data['RollingSDAltimeter'] = pd.rolling_std(glen_H_data['Altimeter (Barometer) Pressure'], window)
# Rolling var
glen_H_data['RollingVarAcceleration'] = pd.rolling_var(glen_H_data['Acceleration'], window)
glen_H_data['RollingVarMagnetometer'] = pd.rolling_var(glen_H_data['Magnetometer'], window)
glen_H_data['RollingVarGyrometer'] = pd.rolling_var(glen_H_data['Gyrometer'], window)
glen_H_data['RollingVarMicrophone'] = pd.rolling_var(glen_H_data['Microphone'], window)
glen_H_data['RollingVarAltimeter'] = pd.rolling_var(glen_H_data['Altimeter (Barometer) Pressure'], window)

	Series.rolling(window=5,center=False).mean()
	Series.rolling(window=5,center=False).mean()
	Series.rolling(window=5,center=False).mean()
	Series.rolling(window=5,center=False).mean()
	Series.rolling(window=5,center=False).mean()
	Series.rolling(window=5,center=False).quantile(quantile=0.75)
	Series.rolling(window=5,center=False).quantile(quantile=0.75)
	Series.rolling(window=5,center=False).quantile(quantile=0.75)
	Series.rolling(window=5,center=False).quantile(quantile=0.75)
	Series.rolling(window=5,center=False).quantile(quantile=0.75)
	Series.rolling(window=5,center=False).max()
	Series.rolling(window=5,center=False).max()
	Series.rolling(window=5,center=False).max()
	Series.rolling(window=5,center=False).max()
	Series.rolling(window=5,center=False).max()
	Series.rolling(window=5,center=False).min()
	Series.rolling(window=5,center=False).min()
	Series.rolling(window=5,center=False).min()
	Series.rolling(window=5,center=False).min()
	Series.rolling(window=5,center=False).min()
	Seri

In [24]:
glen_H_data = glen_H_data.dropna()
glen_H_data_re = glen_H_data.iloc[:,range(0,5) + range(6,36) + [5]]

In [25]:
# load dataset
dataset = glen_H_data_re.values
X = dataset[:,0:34]
Y = dataset[:,35]
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# One Hot Encode
#lb = LabelBinarizer()
#lb.fit(encoded_Y)
#dummy_y = lb.fit_transform(encoded_Y)

In [26]:
# build a classifier
RF_Class = RandomForestClassifier(n_estimators=200)
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
param_dist = {"max_depth": [2, 3, 2, 10, 13, 27, None],
              "max_features": sp_randint(1,35),
              "min_samples_split": sp_randint(2, 100),
              "min_samples_leaf": sp_randint(1, 100),
              "bootstrap": [True],
              "criterion": ["gini"]}

# run randomized search
n_iter_search = 5
random_search = RandomizedSearchCV(RF_Class, param_distributions=param_dist, n_iter=n_iter_search)

start = time()
random_search.fit(X, Y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

RandomizedSearchCV took 61.09 seconds for 5 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.713 (std: 0.130)
Parameters: {'bootstrap': True, 'min_samples_leaf': 14, 'min_samples_split': 82, 'criterion': 'gini', 'max_features': 7, 'max_depth': 3}

Model with rank: 2
Mean validation score: 0.711 (std: 0.138)
Parameters: {'bootstrap': True, 'min_samples_leaf': 54, 'min_samples_split': 94, 'criterion': 'gini', 'max_features': 19, 'max_depth': 27}

Model with rank: 3
Mean validation score: 0.695 (std: 0.106)
Parameters: {'bootstrap': True, 'min_samples_leaf': 63, 'min_samples_split': 43, 'criterion': 'gini', 'max_features': 34, 'max_depth': 2}



In [27]:
# CV the Random Firest Model with the best paramerters
Rand_Forest = RandomForestClassifier(n_estimators=1000, bootstrap=True,min_samples_leaf=83,min_samples_split=11,\
                                     criterion='gini', max_features=18,max_depth=27)
scores = cross_val_score(Rand_Forest, X, Y, cv=2)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.69 (+/- 0.27)


In [28]:
GBM = GradientBoostingClassifier(n_estimators=1000)
scores = cross_val_score(GBM, X, Y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.79 (+/- 0.43)


In [29]:
KNN = KNeighborsClassifier(n_neighbors=3)
GNB = GaussianNB()
lr = LogisticRegression()
NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

In [30]:
stacked_class = StackingCVClassifier(classifiers=[GBM, Rand_Forest], 
#                                     use_probas=True,
                                     meta_classifier=lr)
print('5-fold cross validation:\n')

for clf, label in zip([GBM, Rand_Forest, stacked_class], 
                      ['GBM', 
                       'Rand_Forest',
                       'StackingClassifier']):


    scores = model_selection.cross_val_score(clf, X, encoded_Y, cv=5, scoring='neg_log_loss')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))


5-fold cross validation:

Accuracy: -1.39 (+/- 1.77) [GBM]
Accuracy: -1.46 (+/- 2.12) [Rand_Forest]
Accuracy: -2.29 (+/- 2.26) [StackingClassifier]


In [31]:
x_train,x_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.33, random_state=41010392)
lr.fit(x_train, y_train)
Y_pred = lr.predict(x_test)
print np.unique(Labels)
confusion_matrix(y_test,Y_pred)

NameError: name 'Labels' is not defined

In [None]:
x_train,x_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.33, random_state=41010392)
NN.fit(x_train, y_train)
Y_pred = NN.predict(x_test)
print np.unique(Labels)
confusion_matrix(y_test,Y_pred)

In [None]:
x_train,x_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.33, random_state=54535353)
Rand_Forest.fit(x_train, y_train)
Y_pred = Rand_Forest.predict(x_test)
print np.unique(Labels)
confusion_matrix(y_test,Y_pred)

In [213]:
x_train,x_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.33, random_state=41010392)
GBM.fit(x_train, y_train)
Y_pred = GBM.predict(x_test)
print np.unique(Labels)
confusion_matrix(y_test,Y_pred)

['D' 'E_U' 'P_S' 'P_W' 'S_D' 'S_U' 'T_B' 'T_E' 'T_T' 'W']


array([[ 181,    0,    0,    0,    0,    0,    0,    0,    0,    8],
       [   0,   21,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,   71,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,   18,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    2,    0,    0,    0,    0,    1],
       [   0,    0,    0,    0,    0,    7,    0,    0,    0,    1],
       [   0,    0,    1,    0,    0,    0,  490,    0,    0,    1],
       [   0,    0,    0,    0,    0,    1,    0,   11,    0,    2],
       [   0,    0,    0,    0,    0,    0,    1,    0,   90,    0],
       [   3,    0,    0,    0,    0,    0,    2,    0,    0, 1558]])

In [194]:
Labels =encoder.inverse_transform(encoded_Y)
print np.unique(Labels)
confusion_matrix(encoded_Y,Y_pred)

['D' 'E_U' 'P_S' 'P_W' 'S_D' 'S_U' 'T_B' 'T_E' 'T_T' 'W']


array([[ 555,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,   70,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,  210,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,   60,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    8,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,   25,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0, 1494,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,   44,    0,    1],
       [   0,    0,    0,    0,    0,    0,    0,    0,  290,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0, 4725]])