Libraries required

In [80]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import pickle
import time
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy.stats import randint, uniform

Functions block

In [81]:
def dispersion(s):
    return s.max() - s.min()


def fill_missing(x):
    """x is a group after group by `number`"""
    return x.reindex(
               list((x.name, v) for v in range(x.index[0][1], x.index[-1][1]+1))
           )


def build_feature(tele, tele_iqr, win_mean_std, pca_gyro, cluster_win):
    """
    Build 3 type of features from raw telematics data. 
        Type 1: Statistical summary of telematics data, including mean, 
                median and standard deviation. 
        Type 2: Count outlying driving behaviours based on telematics 
                readings. For example, the number of times a driver exceed 
                speed limit on highway (110 kmph). 
        Type 3: Sliding windows aggregated features. We slide over the 
                telematics using window size of 8 and compute the 
                corresponding statistical summary for these windows. Then, 
                we clusters these windows using K-means clustering algorithm. 
                Each of these clusters represent certain driving behaviour, such as 
                harsh braking and hard acceleration. The number of occurence 
                for each of these clusters (behaviour) is then used as
                feature for the trip.
    
    Prior to building features, we perform data cleaning and data transformation.
        Data Cleaning: 
            1. Remove observations with inaccurate GPS data, as suggested by 
               'Accuracy' feature.
            2. Remove observations with Speed = -1.
        Data Transformation:
            1. Transform triaxial accelerometer readings into one by finding
               the magnitue. 
                   (Magnitude = sqrt(acc_x ** 2 + acc_y ** 2 + acc_z ** 2)
            2. Transform the gyroscopre readings into its first principal 
               component using Principal Component Analysis (PCA).
    
    Parameters
    ----------
    tele: DataFrame, columns = ['bookingID', 'Accuracy', 'Bearing', 
                                'acceleration_x','acceleration_y', 
                                'acceleration_z', 'gyro_x', 'gyro_y',
                                'gyro_z', 'second', 'Speed']
        Telematics data in raw format (same as given by GRAB).
        
    tele_iqr: dictionary, keys = ['acceleration_z', 'acceleration_x', 
                                  'acceleration_y', 'gyro_y', 'Speed', 
                                  'second', 'gyro_z', 'gyro_x']
        The 25th and 75th percentile of telematics data. 
        Calculated from training data.
    
    win_mean_std: dictionary, keys = ['acceleration_std', 'Speed_median', 
                                      'acceleration_mean', 'gyro_median', 
                                      'acceleration_median', 'Speed_std', 
                                      'gyro_std', 'Speed_mean', 'gyro_mean']
        The mean and standard deviation of sliding window aggregated features. 
        Calculated from training data.
    
    pca_gyro: sklearn.decomposition.PCA model
        Pre-trained PCA model to transform triaxial gyroscope readings into 
        first principal component. Trained using training data.
        
    """
    # required column names
    COL_ACCE = ('acceleration_x', 'acceleration_y', 'acceleration_z')
    COL_GYRO = ('gyro_x', 'gyro_y', 'gyro_z')
    COL_TELE = ('bookingID', 'Accuracy', 'Bearing', 'second', 'Speed', 'acceleration_x', 'acceleration_y', 'acceleration_z', 'gyro_x', 'gyro_y', 'gyro_z')
    
    bid = tele.bookingID.unique()
    
    #### STAGE 0: Data Validation ####
    if not sorted(tele.columns) == sorted(COL_TELE):
        raise Exception('Input columns mismatched! Expected: \n {}'.format(COL_TELE))
     
    # sort according to bookingID & seconds
    tele = tele.sort_values(['bookingID', 'second']).reset_index(drop=True)
    
    
    #### STAGE 1: Data Cleaning ####
    
    print("... (1/5) cleaning data ... ")
    # filter out inaccurate GPS data and speed = -1
    tele = tele.loc[(tele.Accuracy <= 16) & (tele.Speed != -1)]
    
    # drop 'Accuracy' & 'Bearing' to save memory. we don't need these anymore. 
    tele.drop(['Accuracy', 'Bearing'], axis=1, inplace=True)
    
    
    #### STAGE 2: Data Transformation ####
    
    print("... (2/5) transforming data ... ")
    # calculate magnitude of acceleration sqrt(acc_x^2 + acc_y^2 + acc_z^2)
    tele['acceleration'] = np.sqrt((tele.loc[:, COL_ACCE] ** 2).sum(axis=1))
    
    # transform triaxial gyro readings into its first principal components
    tele['gyro'] = pca_gyro.transform(tele.loc[:, COL_GYRO])
    
    
    #### STAGE 3A: Generating Feature (Type 1: Statistical Description) ####
    
    print("... (3/5) generating feature (Type 1: Statistical Description) ... ")
    feature1 = tele.groupby('bookingID')['acceleration', 'gyro', 'Speed', 'second'].agg(['mean', 'median', 'std', dispersion]).fillna(0)
    feature1.columns = ['_'.join(col) for col in feature1.columns] # rename columns
    feature1.reset_index(inplace=True)
    
    
    #### STAGE 3B: Generating Feature (Type 2: Detecting Outlying Behaviours) ####
    
    print("... (4/5) generating feature (Type 2: Counting Outlying Behaviours) ... ")
    feature2 = pd.DataFrame()
    
    # use 75th percentile only
    feature2['over_Speed'] = tele.groupby('bookingID')['Speed'].apply(lambda x: sum(x > tele_iqr['Speed'][1]))
    feature2['over_second'] = tele.groupby('bookingID')['second'].apply(lambda x: sum(x > tele_iqr['second'][1]))
    
    # use 25th and 75th percentile
    for col in (COL_ACCE + COL_GYRO):
        feature2['over_{}'.format(col)] = tele.groupby('bookingID')[col].apply(lambda x: sum((x < tele_iqr[col][0]) | (x > tele_iqr[col][1])))
    
    feature2.reset_index(inplace=True)
    
    
    #### STAGE 3C: Generating Feature (Type 3: Sliding Window)
    
    print("... (5/5) generating feature (Type 3: Sliding Window) ... ")
    print("    (WARNING! This process may take up many RAM memory. Please allocate enough memory.)")
    print('    Side Note: This may take awhile, please be patient. :)')
    
    # groupby object
    agg_win_feat = tele.loc[:, ['bookingID', 'Speed', 'acceleration', 'gyro']].groupby('bookingID')
    
    # calculate aggregate features for rolling windows of size 8, overlapped 50% 
    agg_win_feat = agg_win_feat.rolling(8).agg(['mean', 'median', 'std']).dropna()[::4]
    
    # minor adjustments towards output rows and columns
    agg_win_feat = agg_win_feat.drop('bookingID', axis=1)

    agg_win_feat.columns = ['_'.join(col) for col in agg_win_feat.columns]
    
    # standardize the data before clustering algorithm
    agg_win_feat = agg_win_feat.apply(lambda x: (x - win_mean_std[x.name][0]) / (win_mean_std[x.name][1]))
    
    # cluster into different groups (different driving behaviour, e.g: harsh braking, hard acceleration)
    agg_win_feat['cluster'] = cluster_win.predict(agg_win_feat)
    agg_win_feat = agg_win_feat.droplevel(1).reset_index()
    
    # count the occurrence of each actions during a trip
    feature3 = pd.crosstab(agg_win_feat.bookingID, agg_win_feat.cluster)
    feature3.reset_index(inplace=True)
    
    # handle missing clusters
    exp_clust = set(range(cluster_win.n_clusters)) # expected clusters
    out_clust = set(feature3.columns) # outputed clusters
    for col in exp_clust - out_clust:
        feature3[col] = 0

    feature3.columns = feature3.columns.astype(str)
    
    # join all 3 features
    output = pd.DataFrame(bid, columns=['bookingID'])
    output = output.merge(feature1, how='left', on='bookingID')
    output = output.merge(feature2, how='left', on='bookingID')
    output = output.merge(feature3, how='left', on='bookingID')
    output = output.fillna(0)
    
    print('Done!')
    
    return output

In [82]:
#Uncomment below for full dataset
#file_0 = pd.read_csv("safety/features/part-00000-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv")
file_1 = pd.read_csv("safety/features/part-00001-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv")
#file_2 = pd.read_csv("safety/features/part-00002-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv")
#file_3 = pd.read_csv("safety/features/part-00003-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv")
#file_4 = pd.read_csv("safety/features/part-00004-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv")
#file_5 = pd.read_csv("safety/features/part-00005-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv")
#file_6 = pd.read_csv("safety/features/part-00006-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv")
#file_7 = pd.read_csv("safety/features/part-00007-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv")
#file_8 = pd.read_csv("safety/features/part-00008-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv")
#file_9 = pd.read_csv("safety/features/part-00009-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv")
#labels_0 = pd.read_csv("C:/Users/tanji/Desktop/safety/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv")

In [83]:
labels_0 = pd.read_csv("safety/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv")
#Test for duplicated rows
#test1 = labels_0[labels_0.duplicated(['bookingID'])]
#test1 = test1.sort_values(by=['label', 'bookingID'])
#test1.size
#test1
labels_0 = labels_0.groupby('bookingID', group_keys=False, as_index=False).apply(lambda x: x.loc[x.label.idxmax()])

In [84]:
labels_0.head()

Unnamed: 0,bookingID,label
0,0,0
1,1,1
2,2,1
3,4,1
4,6,0


In [85]:
# Uncomment line below for full dataset.

frames = [file_1]
#frames = [file_0, file_1, file_2, file_3, file_4, file_5, file_6, file_7, file_8, file_9]
main_df = pd.concat(frames)

In [86]:
bookingList = main_df['bookingID'].unique()

In [87]:
bookingListTrain, bookingListTest = train_test_split(bookingList, test_size=0.33)
train_df = main_df[main_df['bookingID'].isin(bookingListTrain)]
test_df = main_df[main_df['bookingID'].isin(bookingListTest)]
train_df = train_df.merge(labels_0, on='bookingID')

In [88]:
print(len(bookingListTrain))
print(len(bookingListTest))

13400
6600


In [89]:
#print(train_df.size)
train_df.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed,label
0,1194000908341,4.0,203.0,1.001973,9.745,-0.790087,-0.066046,0.019175,0.044741,246.0,6.48,1
1,1194000908341,6.836,0.0,0.307056,9.586384,-0.613514,-0.007457,-0.003196,-0.007457,52.0,0.0,1
2,1194000908341,3.0,0.0,0.171185,9.595362,-0.685939,-0.007457,-0.001065,-0.005326,1048.0,0.0,1
3,1194000908341,4.0,72.0,-0.128688,9.509171,-1.723825,0.015979,0.006392,-0.028762,422.0,5.48,1
4,1194000908341,3.9,203.0,0.466869,8.41083,-0.347758,-0.045806,-0.073503,0.04048,237.0,10.02,1


In [90]:
main_df = train_df

In [91]:
main_df = main_df.sort_values(by=['bookingID', 'second'])
dangerous = main_df[main_df['label'] == 1]
safe = main_df[main_df['label'] == 0]
print(len(dangerous))
print(len(safe))
print(len(main_df))

331191
748938
1080129


In [92]:
print(len(train_df['bookingID'].unique()))
print(len(test_df['bookingID'].unique()))

13400
6600


In [93]:
smallBookingList = main_df['bookingID'].value_counts().index.tolist()
safeList =safe['bookingID'].value_counts().index.tolist()
dangerList =dangerous['bookingID'].value_counts().index.tolist()


In [94]:
print(len(smallBookingList))
print(len(safeList))
print(len(dangerList))

13400
10043
3357


In [97]:
smallBookingList = smallBookingList[0:200]
#testList = [400:600]
safeList = safeList[0:200]
dangerList = dangerList[0:200]

In [98]:
main_df = main_df[main_df['bookingID'].isin(smallBookingList)]
safe200 = safe[safe['bookingID'].isin(safeList)]
dangerous200 = dangerous[dangerous['bookingID'].isin(dangerList)]
#testList = main_df[main_df['bookingID'].isin(testList)]

In [99]:
#testList.head()

In [100]:
safe200.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed,label
194078,25769803839,16.0,116.175636,0.489133,-9.549826,1.786807,-0.075622,0.065748,-0.001761,3.0,0.017811,0
194121,25769803839,16.0,109.492393,1.443771,-9.830655,1.352554,-0.10916,0.237674,-0.067216,4.0,0.017811,0
194027,25769803839,12.0,291.326538,-0.680688,-9.173145,0.985593,-0.050735,-0.069789,0.008725,27.0,4.323752,0
194043,25769803839,12.0,347.419647,-3.348563,-8.741284,2.1759,-0.014571,-0.50322,0.056381,30.0,3.489704,0
194132,25769803839,7.0,297.993225,0.394327,-10.064081,2.204462,-0.001866,0.113382,0.017042,45.0,6.063122,0


In [101]:
main_df = main_df.sort_values(by=['bookingID', 'second'])
safe200 = safe200.sort_values(by=['bookingID', 'second'])
dangerous200 = dangerous200.sort_values(by=['bookingID', 'second'])
#testList = testList.sort_values(by=['bookingID', 'second'])
#main_df.head()

In [102]:
safe200.head()


Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed,label
194078,25769803839,16.0,116.175636,0.489133,-9.549826,1.786807,-0.075622,0.065748,-0.001761,3.0,0.017811,0
194121,25769803839,16.0,109.492393,1.443771,-9.830655,1.352554,-0.10916,0.237674,-0.067216,4.0,0.017811,0
194027,25769803839,12.0,291.326538,-0.680688,-9.173145,0.985593,-0.050735,-0.069789,0.008725,27.0,4.323752,0
194043,25769803839,12.0,347.419647,-3.348563,-8.741284,2.1759,-0.014571,-0.50322,0.056381,30.0,3.489704,0
194132,25769803839,7.0,297.993225,0.394327,-10.064081,2.204462,-0.001866,0.113382,0.017042,45.0,6.063122,0


In [103]:
dangerous200.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed,label
270410,86,4.0,132.0,-0.551049,10.039137,-3.290045,-0.100479,0.030107,-0.069516,1.0,9.802209,1
270452,86,4.0,123.0,0.521993,8.86366,-2.473794,-0.147655,0.215513,-0.201865,4.0,10.088593,1
270429,86,4.0,115.0,-0.158563,8.702761,-3.572101,-0.166574,0.049148,-0.060877,7.0,10.954933,1
270449,86,4.0,109.0,-0.832311,9.695693,-0.464092,-0.043354,0.036565,0.00576,12.0,6.876282,1
270447,86,4.0,95.0,-0.100844,9.631854,-1.727302,0.011589,0.00075,0.023248,17.0,5.716552,1


Create a temp dataframe with the required columns for appending of the data during the groupby function below.

## For safe

In [109]:
# Single run for a single booking id
main_array = []
run = safe200[safe200['bookingID']== 25769803839]
df = run.reset_index().set_index('second')
s = 1
a = np.arange(df.index.min(),df.index.max() + s, step=s)
df = df.reindex(a, tolerance=s/2., method='nearest')
df = df.interpolate(method='linear')
df.head()
agg_win_feat = df[['Bearing', 'acceleration_x', 'acceleration_y', 
                   'acceleration_z', 'gyro_x', 'gyro_y',
                   'gyro_z', 'Speed']].rolling(8).agg(['mean', 'median', 'std'], as_index=False).reset_index().dropna()[::4]

agg_win_feat.columns = ["_".join(x) for x in agg_win_feat.columns.ravel()]
windows_df = agg_win_feat
windows_df['bookingID'] = '25769803867'
windows_df = windows_df.iloc[0:0]
#print(windows_df.columns)
agg_win_feat.head(10)

Unnamed: 0,second_,Bearing_mean,Bearing_median,Bearing_std,acceleration_x_mean,acceleration_x_median,acceleration_x_std,acceleration_y_mean,acceleration_y_median,acceleration_y_std,...,gyro_y_mean,gyro_y_median,gyro_y_std,gyro_z_mean,gyro_z_median,gyro_z_std,Speed_mean,Speed_median,Speed_std,bookingID
7,10.0,131.080609,129.256974,16.919795,1.081976,1.120484,0.302505,-9.720509,-9.730599,0.089584,...,0.181092,0.190886,0.05373,-0.050367,-0.05566,0.02072,0.50925,0.485848,0.423826,25769803867
11,14.0,160.880304,160.880304,19.365255,0.843381,0.843381,0.226254,-9.644837,-9.644837,0.070025,...,0.150782,0.150782,0.032745,-0.045754,-0.045754,0.008088,1.234708,1.234708,0.458581,25769803867
15,18.0,192.503633,192.503633,19.365255,0.473909,0.473909,0.226254,-9.530487,-9.530487,0.070025,...,0.09731,0.09731,0.032745,-0.032547,-0.032547,0.008088,1.983567,1.983567,0.458581,25769803867
19,22.0,224.126963,224.126963,19.365255,0.104438,0.104438,0.226254,-9.416137,-9.416137,0.070025,...,0.043839,0.043839,0.032745,-0.01934,-0.01934,0.008088,2.732426,2.732426,0.458581,25769803867
23,26.0,255.750292,255.750292,19.365255,-0.265033,-0.265033,0.226254,-9.301788,-9.301788,0.070025,...,-0.009633,-0.009633,0.032745,-0.006133,-0.006133,0.008088,3.481285,3.481285,0.458581,25769803867
27,30.0,295.467525,287.373622,30.781118,-1.232197,-0.634505,1.127222,-9.100914,-9.187438,0.197446,...,-0.161437,-0.063105,0.181422,0.016511,0.007074,0.021957,3.881221,3.858521,0.284584,25769803867
31,34.0,329.277501,335.886815,19.224565,-2.369687,-2.529628,0.868007,-8.959482,-8.96175,0.142698,...,-0.343479,-0.369321,0.140807,0.041188,0.047202,0.016444,3.912668,3.918607,0.276176,25769803867
35,38.0,332.591721,332.591721,8.071301,-2.225696,-2.225696,0.611211,-9.138123,-9.138123,0.216012,...,-0.31824,-0.31824,0.100691,0.044579,0.044579,0.006424,4.261729,4.261729,0.420237,25769803867
39,42.0,319.411341,319.411341,8.071301,-1.227592,-1.227592,0.611211,-9.490869,-9.490869,0.216012,...,-0.153812,-0.153812,0.100691,0.034089,0.034089,0.006424,4.947974,4.947974,0.420237,25769803867
43,46.0,306.637316,306.230962,7.466933,-0.266402,-0.229488,0.557488,-9.826755,-9.843615,0.19246,...,0.004609,0.010615,0.091935,0.023672,0.023598,0.006306,5.634949,5.634219,0.421433,25769803867


In [110]:
groups = safe200.groupby('bookingID')
#main_array = []
#windows_df = pd.DataFrame()
for name, group in groups:
    #run = test_df[test_df['bookingID']== name]
    df = group.reset_index().set_index('second')
    #reindex floatindex
    s = 1
    a = np.arange(df.index.min(),df.index.max() + s, step=s)
    df = df.reindex(a, tolerance=s/2., method='nearest')
    df = df.interpolate(method='linear')
    df.head()
    agg_win_feat = df[['Bearing', 'acceleration_x', 'acceleration_y', 
                       'acceleration_z', 'gyro_x', 'gyro_y',
                       'gyro_z', 'Speed']].rolling(8).agg(['mean', 'median', 'std'], as_index=False).reset_index().dropna()[::4]
    #print(agg_win_feat.columns)
    #agg_win_feat.columns = agg_win_feat.columns.droplevel(0)
    agg_win_feat.columns = ["_".join(x) for x in agg_win_feat.columns.ravel()]
    agg_win_feat['bookingID'] = name
    windows_df = windows_df.append(agg_win_feat)

    #print(agg_win_feat.columns)
    #agg_win_feat.head(10)
    #for row in agg_win_feat.iterrows():
        #main_array.append((name,row))


In [111]:
# Reordering of the columns
#windows_df.columns = windows_df[['bookingID','second_', 'Bearing_mean', 'Bearing_median', 'Bearing_std',
#       'acceleration_x_mean', 'acceleration_x_median', 'acceleration_x_std',
#       'acceleration_y_mean', 'acceleration_y_median', 'acceleration_y_std',
#       'acceleration_z_mean', 'acceleration_z_median', 'acceleration_z_std',
#       'gyro_x_mean', 'gyro_x_median', 'gyro_x_std', 'gyro_y_mean',
#       'gyro_y_median', 'gyro_y_std', 'gyro_z_mean', 'gyro_z_median',
#      'gyro_z_std', 'Speed_mean', 'Speed_median', 'Speed_std']]

safe_df = windows_df


In [112]:
safe_df.head()

Unnamed: 0,second_,Bearing_mean,Bearing_median,Bearing_std,acceleration_x_mean,acceleration_x_median,acceleration_x_std,acceleration_y_mean,acceleration_y_median,acceleration_y_std,...,gyro_y_mean,gyro_y_median,gyro_y_std,gyro_z_mean,gyro_z_median,gyro_z_std,Speed_mean,Speed_median,Speed_std,bookingID
7,10.0,131.080609,129.256974,16.919795,1.081976,1.120484,0.302505,-9.720509,-9.730599,0.089584,...,0.181092,0.190886,0.05373,-0.050367,-0.05566,0.02072,0.50925,0.485848,0.423826,25769803839
11,14.0,160.880304,160.880304,19.365255,0.843381,0.843381,0.226254,-9.644837,-9.644837,0.070025,...,0.150782,0.150782,0.032745,-0.045754,-0.045754,0.008088,1.234708,1.234708,0.458581,25769803839
15,18.0,192.503633,192.503633,19.365255,0.473909,0.473909,0.226254,-9.530487,-9.530487,0.070025,...,0.09731,0.09731,0.032745,-0.032547,-0.032547,0.008088,1.983567,1.983567,0.458581,25769803839
19,22.0,224.126963,224.126963,19.365255,0.104438,0.104438,0.226254,-9.416137,-9.416137,0.070025,...,0.043839,0.043839,0.032745,-0.01934,-0.01934,0.008088,2.732426,2.732426,0.458581,25769803839
23,26.0,255.750292,255.750292,19.365255,-0.265033,-0.265033,0.226254,-9.301788,-9.301788,0.070025,...,-0.009633,-0.009633,0.032745,-0.006133,-0.006133,0.008088,3.481285,3.481285,0.458581,25769803839


## For test

In [159]:
# Single run for a single booking id
test_df = test_df.sort_values(by=['bookingID', 'second'])
main_array = []
run = test_df[test_df['bookingID']== 1099511627891]
df = run.reset_index().set_index('second')
s = 1
a = np.arange(df.index.min(),df.index.max() + s, step=s)
df = df.reindex(a, tolerance=s/2., method='nearest')
df = df.interpolate(method='linear')
df.head()
agg_win_feat = df[['Bearing', 'acceleration_x', 'acceleration_y', 
                   'acceleration_z', 'gyro_x', 'gyro_y',
                   'gyro_z', 'Speed']].rolling(8).agg(['mean', 'median', 'std'], as_index=False).reset_index().dropna()[::4]

agg_win_feat.columns = ["_".join(x) for x in agg_win_feat.columns.ravel()]
windows_df = agg_win_feat
windows_df['bookingID'] = '1099511627891'
windows_df = windows_df.iloc[0:0]
#print(windows_df.columns)
#agg_win_feat.head(10)

In [160]:
groups = test_df.groupby('bookingID')
#main_array = []
#windows_df = pd.DataFrame()
for name, group in groups:
    #run = test_df[test_df['bookingID']== name]
    df = group.reset_index().set_index('second')
    #reindex floatindex
    s = 1
    a = np.arange(df.index.min(),df.index.max() + s, step=s)
    df = df.reindex(a, tolerance=s/2., method='nearest')
    df = df.interpolate(method='linear')
    df.head()
    agg_win_feat = df[['Bearing', 'acceleration_x', 'acceleration_y', 
                       'acceleration_z', 'gyro_x', 'gyro_y',
                       'gyro_z', 'Speed']].rolling(8).agg(['mean', 'median', 'std'], as_index=False).reset_index().dropna()[::4]
    #print(agg_win_feat.columns)
    #agg_win_feat.columns = agg_win_feat.columns.droplevel(0)
    agg_win_feat.columns = ["_".join(x) for x in agg_win_feat.columns.ravel()]
    agg_win_feat['bookingID'] = name
    windows_df = windows_df.append(agg_win_feat)

    #print(agg_win_feat.columns)
    #agg_win_feat.head(10)
    #for row in agg_win_feat.iterrows():
        #main_array.append((name,row))


In [161]:
test_df = windows_df
#print(testList['bookingID'].unique())
test_df.head()

Unnamed: 0,second_,Bearing_mean,Bearing_median,Bearing_std,acceleration_x_mean,acceleration_x_median,acceleration_x_std,acceleration_y_mean,acceleration_y_median,acceleration_y_std,...,gyro_y_mean,gyro_y_median,gyro_y_std,gyro_z_mean,gyro_z_median,gyro_z_std,Speed_mean,Speed_median,Speed_std,bookingID
7,20.0,301.5,301.5,9.448032,0.098165,0.098165,0.671933,9.901523,9.901523,0.096349,...,0.053264,0.053264,0.039973,-0.001596,-0.001596,0.009192,2.657388,2.657388,1.859784,2
11,24.0,288.942857,288.0,6.932057,0.488813,0.378124,0.319572,9.842496,9.854425,0.042995,...,0.013099,0.00803,0.016098,0.001935,0.002157,0.006092,0.803036,0.691088,0.832531,2
15,28.0,274.35,275.0,11.772001,0.237003,0.189148,0.174043,9.853459,9.853399,0.021016,...,0.00057,0.001555,0.0063,-0.006124,-0.006788,0.005493,1.635171,1.557306,1.410201,2
19,32.0,254.2,254.2,12.737347,0.097208,0.097208,0.056302,9.827541,9.827541,0.021113,...,-0.010663,-0.010663,0.007631,-0.01363,-0.01363,0.00419,4.048996,4.048996,1.525842,2
23,36.0,240.696196,237.934783,5.514542,0.290895,0.230808,0.268831,9.57696,9.626182,0.256053,...,-0.015193,-0.015397,0.005455,0.002728,0.002192,0.021292,6.085611,6.190788,1.061632,2


## For dangerous

In [114]:
# Single run for a single booking id
main_array = []
run = dangerous200[dangerous200['bookingID']== 86]
df = run.reset_index().set_index('second')
s = 1
a = np.arange(df.index.min(),df.index.max() + s, step=s)
df = df.reindex(a, tolerance=s/2., method='nearest')
df = df.interpolate(method='linear')
df.head()
agg_win_feat = df[['Bearing', 'acceleration_x', 'acceleration_y', 
                   'acceleration_z', 'gyro_x', 'gyro_y',
                   'gyro_z', 'Speed']].rolling(8).agg(['mean', 'median', 'std'], as_index=False).reset_index().dropna()[::4]

agg_win_feat.columns = ["_".join(x) for x in agg_win_feat.columns.ravel()]
windows_df = agg_win_feat
windows_df['bookingID'] = '8589934761'
windows_df = windows_df.iloc[0:0]
#print(windows_df.columns)
agg_win_feat.head(10)

Unnamed: 0,second_,Bearing_mean,Bearing_median,Bearing_std,acceleration_x_mean,acceleration_x_median,acceleration_x_std,acceleration_y_mean,acceleration_y_median,acceleration_y_std,...,gyro_y_mean,gyro_y_median,gyro_y_std,gyro_z_mean,gyro_z_median,gyro_z_std,Speed_mean,Speed_median,Speed_std,bookingID
7,8.0,122.1,121.666667,6.586952,-0.01832,-0.045137,0.346572,9.122015,8.882504,0.486547,...,0.10646,0.098256,0.065364,-0.114241,-0.110753,0.054193,10.239908,10.113898,0.399014,8589934761
11,12.0,113.75,113.2,3.822033,-0.326149,-0.360687,0.383832,9.095223,9.000641,0.368623,...,0.065225,0.045373,0.04417,-0.053512,-0.040886,0.053955,9.317146,9.731338,1.490737,8589934761
15,16.0,106.4,107.6,5.362302,-0.548382,-0.551268,0.189485,9.530793,9.651006,0.220217,...,0.029499,0.032983,0.013247,0.000136,0.007508,0.019275,7.198147,6.760309,1.20125,8589934761
19,20.0,94.825,96.4,9.032442,-0.271764,-0.17399,0.242574,9.626744,9.638238,0.047406,...,0.021167,0.019137,0.014515,0.017246,0.017815,0.00455,6.059923,5.939401,0.325536,8589934761
23,24.0,77.85,77.85,12.0025,-0.045092,-0.045092,0.039018,9.533527,9.533527,0.068815,...,0.054244,0.054244,0.037438,0.015643,0.015643,0.005323,5.96593,5.96593,0.174528,8589934761
27,28.0,57.9875,58.25,12.445933,-0.032475,0.002695,0.126096,9.482487,9.449246,0.145072,...,0.113073,0.115379,0.03409,0.004993,0.006951,0.009646,6.33374,6.250933,0.364027,8589934761
31,32.0,36.4,35.5,13.612074,-0.47399,-0.530857,0.482229,9.730555,9.610304,0.565364,...,0.113229,0.1412,0.059454,-0.005691,-0.000262,0.029826,7.459481,7.529615,1.020892,8589934761
35,36.0,21.5625,19.65,4.93412,-0.617171,-0.666765,0.35399,9.507044,9.05424,0.693192,...,0.030733,-0.016949,0.077062,0.023108,0.049585,0.045711,8.34106,8.352774,0.214043,8589934761
39,40.0,16.85,16.85,1.714643,0.019464,0.019464,0.437901,9.067867,9.067867,0.033379,...,-0.030367,-0.030367,0.008217,0.062989,0.062989,0.008208,8.26107,8.26107,0.074876,8589934761
43,44.0,27.296875,15.45,25.031974,0.57788,0.584918,0.292667,9.103563,9.104106,0.02553,...,-0.037026,-0.037076,0.009124,0.067579,0.069691,0.012625,7.914927,8.138799,0.502426,8589934761


In [115]:
groups = dangerous200.groupby('bookingID')
#main_array = []
#windows_df = pd.DataFrame()
for name, group in groups:
    #run = test_df[test_df['bookingID']== name]
    df = group.reset_index().set_index('second')
    #reindex floatindex
    s = 1
    a = np.arange(df.index.min(),df.index.max() + s, step=s)
    df = df.reindex(a, tolerance=s/2., method='nearest')
    df = df.interpolate(method='linear')
    df.head()
    agg_win_feat = df[['Bearing', 'acceleration_x', 'acceleration_y', 
                       'acceleration_z', 'gyro_x', 'gyro_y',
                       'gyro_z', 'Speed']].rolling(8).agg(['mean', 'median', 'std'], as_index=False).reset_index().dropna()[::4]
    #print(agg_win_feat.columns)
    #agg_win_feat.columns = agg_win_feat.columns.droplevel(0)
    agg_win_feat.columns = ["_".join(x) for x in agg_win_feat.columns.ravel()]
    agg_win_feat['bookingID'] = name
    windows_df = windows_df.append(agg_win_feat)

    #print(agg_win_feat.columns)
    #agg_win_feat.head(10)
    #for row in agg_win_feat.iterrows():
        #main_array.append((name,row))


In [116]:
dangerous_df = windows_df
dangerous_df.head()

Unnamed: 0,second_,Bearing_mean,Bearing_median,Bearing_std,acceleration_x_mean,acceleration_x_median,acceleration_x_std,acceleration_y_mean,acceleration_y_median,acceleration_y_std,...,gyro_y_mean,gyro_y_median,gyro_y_std,gyro_z_mean,gyro_z_median,gyro_z_std,Speed_mean,Speed_median,Speed_std,bookingID
7,8.0,122.1,121.666667,6.586952,-0.01832,-0.045137,0.346572,9.122015,8.882504,0.486547,...,0.10646,0.098256,0.065364,-0.114241,-0.110753,0.054193,10.239908,10.113898,0.399014,86
11,12.0,113.75,113.2,3.822033,-0.326149,-0.360687,0.383832,9.095223,9.000641,0.368623,...,0.065225,0.045373,0.04417,-0.053512,-0.040886,0.053955,9.317146,9.731338,1.490737,86
15,16.0,106.4,107.6,5.362302,-0.548382,-0.551268,0.189485,9.530793,9.651006,0.220217,...,0.029499,0.032983,0.013247,0.000136,0.007508,0.019275,7.198147,6.760309,1.20125,86
19,20.0,94.825,96.4,9.032442,-0.271764,-0.17399,0.242574,9.626744,9.638238,0.047406,...,0.021167,0.019137,0.014515,0.017246,0.017815,0.00455,6.059923,5.939401,0.325536,86
23,24.0,77.85,77.85,12.0025,-0.045092,-0.045092,0.039018,9.533527,9.533527,0.068815,...,0.054244,0.054244,0.037438,0.015643,0.015643,0.005323,5.96593,5.96593,0.174528,86


In [117]:
len(dangerous_df)
len(dangerous_df['bookingID'].unique())

200

In [118]:
window_frames = [safe_df, dangerous_df]
total_windows = pd.concat(window_frames)

In [119]:
len(total_windows['bookingID'].unique())


400

In [120]:
print(total_windows.columns)
total_windows.head()

Index(['second_', 'Bearing_mean', 'Bearing_median', 'Bearing_std',
       'acceleration_x_mean', 'acceleration_x_median', 'acceleration_x_std',
       'acceleration_y_mean', 'acceleration_y_median', 'acceleration_y_std',
       'acceleration_z_mean', 'acceleration_z_median', 'acceleration_z_std',
       'gyro_x_mean', 'gyro_x_median', 'gyro_x_std', 'gyro_y_mean',
       'gyro_y_median', 'gyro_y_std', 'gyro_z_mean', 'gyro_z_median',
       'gyro_z_std', 'Speed_mean', 'Speed_median', 'Speed_std', 'bookingID'],
      dtype='object')


Unnamed: 0,second_,Bearing_mean,Bearing_median,Bearing_std,acceleration_x_mean,acceleration_x_median,acceleration_x_std,acceleration_y_mean,acceleration_y_median,acceleration_y_std,...,gyro_y_mean,gyro_y_median,gyro_y_std,gyro_z_mean,gyro_z_median,gyro_z_std,Speed_mean,Speed_median,Speed_std,bookingID
7,10.0,131.080609,129.256974,16.919795,1.081976,1.120484,0.302505,-9.720509,-9.730599,0.089584,...,0.181092,0.190886,0.05373,-0.050367,-0.05566,0.02072,0.50925,0.485848,0.423826,25769803839
11,14.0,160.880304,160.880304,19.365255,0.843381,0.843381,0.226254,-9.644837,-9.644837,0.070025,...,0.150782,0.150782,0.032745,-0.045754,-0.045754,0.008088,1.234708,1.234708,0.458581,25769803839
15,18.0,192.503633,192.503633,19.365255,0.473909,0.473909,0.226254,-9.530487,-9.530487,0.070025,...,0.09731,0.09731,0.032745,-0.032547,-0.032547,0.008088,1.983567,1.983567,0.458581,25769803839
19,22.0,224.126963,224.126963,19.365255,0.104438,0.104438,0.226254,-9.416137,-9.416137,0.070025,...,0.043839,0.043839,0.032745,-0.01934,-0.01934,0.008088,2.732426,2.732426,0.458581,25769803839
23,26.0,255.750292,255.750292,19.365255,-0.265033,-0.265033,0.226254,-9.301788,-9.301788,0.070025,...,-0.009633,-0.009633,0.032745,-0.006133,-0.006133,0.008088,3.481285,3.481285,0.458581,25769803839


In [121]:
X = []
norm_total_windows = (total_windows-total_windows.min())/(total_windows.max()-total_windows.min())
for index, row in norm_total_windows.iterrows():
    temp_arr = []
    element_list = ['Bearing_mean', 'Bearing_median', 'Bearing_std',
       'acceleration_x_mean', 'acceleration_x_median', 'acceleration_x_std',
       'acceleration_y_mean', 'acceleration_y_median', 'acceleration_y_std',
       'acceleration_z_mean', 'acceleration_z_median', 'acceleration_z_std',
       'gyro_x_mean', 'gyro_x_median', 'gyro_x_std', 'gyro_y_mean',
       'gyro_y_median', 'gyro_y_std', 'gyro_z_mean', 'gyro_z_median',
       'gyro_z_std', 'Speed_mean', 'Speed_median', 'Speed_std']
    for item in element_list:
        temp_arr.append(row[item])
    X.append(temp_arr)

In [122]:
X = np.array(X)
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)
print(kmeans.labels_)


[0 1 1 ... 0 0 0]


In [123]:
# Attaching k-means labels back to total_windows dataframe
total_windows['k_label'] = kmeans.labels_
total_windows.head()

Unnamed: 0,second_,Bearing_mean,Bearing_median,Bearing_std,acceleration_x_mean,acceleration_x_median,acceleration_x_std,acceleration_y_mean,acceleration_y_median,acceleration_y_std,...,gyro_y_median,gyro_y_std,gyro_z_mean,gyro_z_median,gyro_z_std,Speed_mean,Speed_median,Speed_std,bookingID,k_label
7,10.0,131.080609,129.256974,16.919795,1.081976,1.120484,0.302505,-9.720509,-9.730599,0.089584,...,0.190886,0.05373,-0.050367,-0.05566,0.02072,0.50925,0.485848,0.423826,25769803839,0
11,14.0,160.880304,160.880304,19.365255,0.843381,0.843381,0.226254,-9.644837,-9.644837,0.070025,...,0.150782,0.032745,-0.045754,-0.045754,0.008088,1.234708,1.234708,0.458581,25769803839,1
15,18.0,192.503633,192.503633,19.365255,0.473909,0.473909,0.226254,-9.530487,-9.530487,0.070025,...,0.09731,0.032745,-0.032547,-0.032547,0.008088,1.983567,1.983567,0.458581,25769803839,1
19,22.0,224.126963,224.126963,19.365255,0.104438,0.104438,0.226254,-9.416137,-9.416137,0.070025,...,0.043839,0.032745,-0.01934,-0.01934,0.008088,2.732426,2.732426,0.458581,25769803839,1
23,26.0,255.750292,255.750292,19.365255,-0.265033,-0.265033,0.226254,-9.301788,-9.301788,0.070025,...,-0.009633,0.032745,-0.006133,-0.006133,0.008088,3.481285,3.481285,0.458581,25769803839,1


In [124]:
k_count = total_windows[['bookingID','k_label', 'Bearing_mean']].groupby(['bookingID', 'k_label']).agg(['count']).reset_index().pivot(index='bookingID', columns='k_label').reset_index()
k_count.head()

Unnamed: 0_level_0,bookingID,Bearing_mean,Bearing_mean,Bearing_mean,Bearing_mean
Unnamed: 0_level_1,Unnamed: 1_level_1,count,count,count,count
k_label,Unnamed: 1_level_2,0,1,2,3
0,86,281.0,91.0,46.0,61.0
1,8589934712,,227.0,299.0,
2,8589934761,146.0,231.0,192.0,16.0
3,17179869346,127.0,223.0,118.0,121.0
4,25769803831,44.0,95.0,38.0,306.0


In [125]:
k_count.columns = k_count.columns.droplevel()
k_count.columns = k_count.columns.droplevel()

In [126]:
k_count.columns = ['bookingID', 'count1', 'count2', 'count3', 'count4']

In [127]:
k_count.head()

Unnamed: 0,bookingID,count1,count2,count3,count4
0,86,281.0,91.0,46.0,61.0
1,8589934712,,227.0,299.0,
2,8589934761,146.0,231.0,192.0,16.0
3,17179869346,127.0,223.0,118.0,121.0
4,25769803831,44.0,95.0,38.0,306.0


In [128]:
window_frames_200 = [safe200, dangerous200]
final_processed_df = pd.concat(window_frames_200)
print(len(final_processed_df))
print(final_processed_df.columns)
final_processed_df.head()

79846
Index(['bookingID', 'Accuracy', 'Bearing', 'acceleration_x', 'acceleration_y',
       'acceleration_z', 'gyro_x', 'gyro_y', 'gyro_z', 'second', 'Speed',
       'label'],
      dtype='object')


Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed,label
194078,25769803839,16.0,116.175636,0.489133,-9.549826,1.786807,-0.075622,0.065748,-0.001761,3.0,0.017811,0
194121,25769803839,16.0,109.492393,1.443771,-9.830655,1.352554,-0.10916,0.237674,-0.067216,4.0,0.017811,0
194027,25769803839,12.0,291.326538,-0.680688,-9.173145,0.985593,-0.050735,-0.069789,0.008725,27.0,4.323752,0
194043,25769803839,12.0,347.419647,-3.348563,-8.741284,2.1759,-0.014571,-0.50322,0.056381,30.0,3.489704,0
194132,25769803839,7.0,297.993225,0.394327,-10.064081,2.204462,-0.001866,0.113382,0.017042,45.0,6.063122,0


In [142]:
final_processed = final_processed_df[['bookingID', 'Bearing','acceleration_x', 'acceleration_y','acceleration_z', 'gyro_x', 'gyro_y', 'gyro_z', 'second', 'Speed']].groupby(['bookingID']).agg(['mean', 'median', 'std'])
final_processed.columns = ["_".join(x) for x in final_processed.columns.ravel()]
final_processed.head()


Unnamed: 0_level_0,Bearing_mean,Bearing_median,Bearing_std,acceleration_x_mean,acceleration_x_median,acceleration_x_std,acceleration_y_mean,acceleration_y_median,acceleration_y_std,acceleration_z_mean,...,gyro_y_std,gyro_z_mean,gyro_z_median,gyro_z_std,second_mean,second_median,second_std,Speed_mean,Speed_median,Speed_std
bookingID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
86,135.085,88.5,131.540841,-0.178098,-0.216814,1.004369,9.244298,9.324803,0.609398,-2.646253,...,0.085401,-0.00406,-0.001292,0.058582,917.545,913.0,560.885846,6.455656,4.153853,6.071196
8589934712,217.559931,218.343483,12.171263,0.335627,0.350214,0.788608,-9.711407,-9.711325,0.605855,-0.533632,...,0.107812,-0.007176,-0.001068,0.102533,1196.810526,1088.0,542.643262,16.804474,18.88,9.927667
8589934761,201.424797,267.422729,133.13784,-0.951618,-0.939087,0.840215,-5.849765,-5.950716,0.981014,-7.665871,...,0.061526,0.015428,0.004057,0.077941,1220.453271,1239.0,674.459613,8.331353,4.776841,8.28863
17179869346,192.617512,211.0,118.995823,-0.581278,-0.568096,0.961073,8.66193,8.635426,0.769494,4.667895,...,0.084045,-0.00024,0.000716,0.058637,1197.829493,1242.0,708.232341,10.718477,9.203191,8.462308
25769803831,146.635,95.0,113.898755,0.161738,0.224677,0.719654,9.950146,9.959663,0.46971,0.692206,...,0.057346,-0.004963,-0.004913,0.027353,963.275,902.0,588.294203,11.758677,14.304212,6.925621


In [143]:
final_processed = final_processed.merge(k_count, on='bookingID')
final_processed = final_processed.merge(labels_0, on='bookingID')

In [144]:
del final_processed['label']
final_processed = final_processed.dropna()
final_processed.head()

Unnamed: 0,bookingID,Bearing_mean,Bearing_median,Bearing_std,acceleration_x_mean,acceleration_x_median,acceleration_x_std,acceleration_y_mean,acceleration_y_median,acceleration_y_std,...,second_mean,second_median,second_std,Speed_mean,Speed_median,Speed_std,count1,count2,count3,count4
0,86,135.085,88.5,131.540841,-0.178098,-0.216814,1.004369,9.244298,9.324803,0.609398,...,917.545,913.0,560.885846,6.455656,4.153853,6.071196,281.0,91.0,46.0,61.0
2,8589934761,201.424797,267.422729,133.13784,-0.951618,-0.939087,0.840215,-5.849765,-5.950716,0.981014,...,1220.453271,1239.0,674.459613,8.331353,4.776841,8.28863,146.0,231.0,192.0,16.0
3,17179869346,192.617512,211.0,118.995823,-0.581278,-0.568096,0.961073,8.66193,8.635426,0.769494,...,1197.829493,1242.0,708.232341,10.718477,9.203191,8.462308,127.0,223.0,118.0,121.0
4,25769803831,146.635,95.0,113.898755,0.161738,0.224677,0.719654,9.950146,9.959663,0.46971,...,963.275,902.0,588.294203,11.758677,14.304212,6.925621,44.0,95.0,38.0,306.0
5,25769803839,172.643556,153.889572,61.424392,0.083653,0.055179,0.823304,-9.723296,-9.699362,0.58116,...,796.158192,882.0,432.636103,11.527405,13.638386,7.484059,98.0,97.0,46.0,138.0


In [145]:
bookingID_unique = final_processed['bookingID'].unique()
small_label = labels_0[labels_0['bookingID'].isin(bookingID_unique)]

In [146]:
final_processed = final_processed.sort_values(by='bookingID')
small_label = small_label.sort_values(by='bookingID')

In [148]:
print(len(small_label['bookingID'].unique()))
print(len(final_processed['bookingID'].unique()))
print(len(small_label))
print(len(final_processed))
small_label.head()


368
368
368
368


Unnamed: 0,bookingID,label
54,86,1
222,8589934761,1
252,17179869346,1
298,25769803831,1
302,25769803839,0


### Ensemble Modelling
(Initial Stage): Identify strong and weak learners using cross validation. 

(STAGE 1 Ensemble): Check correlation between predictions made by initial stage models. Choose only models with low correlation as candidate models. 

(STAGE 2 Ensemble): Train the weak learners. Average their outputs and use it as new features to train the strong learners. 

(STAGE 3 Ensemble): Use the output from Stage 2 stong learners to train a final 'meta-learner' model. 

In [150]:
# Models used for ensemble
model = {
    'logistic' : LogisticRegression(max_iter=500),
    'lda' : LinearDiscriminantAnalysis(),
    'svc' : SVC(kernel='rbf'),
    'naivebayes': GaussianNB(),
    'rf' : RandomForestClassifier(n_estimators=100),
    'xgboost' : XGBClassifier(),
    'mlp' : MLPClassifier(max_iter=500)
}

In [151]:
# hyperparameters grid for models that we want to test
model_params = {
    'logistic': {
        'solver' : ['liblinear', 'saga'],
        'C' : [1e-3, 1e-2, 0.1, 1, 10, 100]
    },
    'lda': {
        'solver': ['svd', 'lsqr'],
         'tol': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
    },
    'svc': {
        'gamma': [0.1, 1, 10, 100],
        'C': [0.1, 1, 10, 100, 1000]
    },
    'naivebayes': {
        'var_smoothing': [1e-11, 1e-10, 1e-09, 1e-08, 1e-7]
    },
    'rf': {
        'max_depth': randint(10, 100),
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': randint(1, 4),
        'min_samples_split': randint(2, 10),
        'bootstrap': [True, False]
    },
    'xgboost': {
        'max_depth': randint(1,6),
        'min_child_weight': randint(0,6),
        'subsample': uniform(loc=0.6, scale=0.4),
        'colsample_bytree': uniform(loc=0.6, scale=0.4),
        'gamma': [i/10.0 for i in range(0,5)],
        'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
    }, 
    'mlp': {
        'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.05],
        'learning_rate': ['constant','adaptive']
    }
}

In [153]:
model_config = {}

col_sample = 0.7
n_init = 5

final_processed = final_processed.dropna()
start_time = time.time()

# for every models
for key in model:
    print("Currently computing for", key)
    
    best_match = {}
    # randomized search through the hyperparameters grid
    rand_search = RandomizedSearchCV(estimator = model[key], 
                                     param_distributions=model_params[key], 
                                     scoring='roc_auc', 
                                     n_iter=5, 
                                     iid=False, 
                                     cv=5, 
                                     n_jobs=-1)
    rand_search.fit(final_processed, small_label['label'])
    
print('Training Done! Time Used:', time.time() - start_time)

Currently computing for logistic
Currently computing for lda
Currently computing for svc
Currently computing for naivebayes
Currently computing for rf
Currently computing for xgboost
Currently computing for mlp
Training Done! Time Used: 16.558780670166016




In [154]:
test_df.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
1,1099511627891,3.0,311.0,0.452518,9.273026,3.098191,0.005382,0.002155,-0.001436,66.0,0.340365
2,962072674446,10.0,142.807999,0.018542,-8.041156,-4.960114,0.004223,-0.014875,0.007397,1552.0,2.05
3,1142461300867,8.0,171.205292,-0.290698,-8.295367,-3.049042,0.033618,0.046224,0.030559,277.0,17.608448
4,412316860548,19.379,0.0,0.675613,8.84613,1.852508,-0.008652,0.003693,0.00116,148.0,0.0
6,738734375093,16.0,290.901459,0.881815,-9.852637,1.785162,0.076716,0.012401,-0.022666,641.0,2.596329


In [155]:
logistic = LogisticRegression(max_iter=500)
logistic.fit(final_processed, small_label['label'])
pred_logistic = logistic.predict_proba(test_df)



ValueError: X has 11 features per sample; expecting 32