In [112]:
import pandas as pd
%matplotlib inline
import glob
import os
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [2]:
DATA_PATH = 'data/safety/safety'

## Data Fields Description

|      Field      |               Description               |
|:---------------:|:---------------------------------------:|
|    bookingID    |                 trip id                 |
|     Accuracy    |    accuracy inferred by GPS in meters   |
|     Bearing     |          GPS bearing in degree          |
|  acceleration_x |  accelerometer reading at x axis (m/s2) |
|  acceleration_y |  accelerometer reading at y axis (m/s2) |
|  acceleration_z |  accelerometer reading at z axis (m/s2) |
|      gyro_x     |   gyroscope reading in x axis (rad/s)   |
|      gyro_y     |   gyroscope reading in y axis (rad/s)   |
|      gyro_z     |   gyroscope reading in z axis (rad/s)   |
|      second     | time of the record by number of seconds |
|      Speed      |       speed measured by GPS in m/s      |



In [3]:
# load labels
labels_path = '{}/labels'.format(DATA_PATH)
labels = pd.read_csv(glob.glob('{}/*.csv'.format(labels_path))[0])
labels = labels.sort_values(by='bookingID')

In [4]:
print(labels.shape)
labels.head()

(20018, 2)


Unnamed: 0,bookingID,label
15035,0,0
13312,1,1
996,2,1
2328,4,1
5192,6,0


In [5]:
features_path = '{}/features'.format(DATA_PATH)
features = pd.DataFrame()
for f in glob.glob('{}/*.csv'.format(features_path)):
    print('loading feature: ', f)
    temp = pd.read_csv(f)
    features = pd.concat([features, temp], axis=0)
features = features.sort_values(by=['bookingID', 'second'])


loading feature:  data/safety/safety/features\part-00000-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00001-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00002-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00003-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00004-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00005-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00006-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00007-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00008-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00009-e6120af0-10c2-42

In [6]:
print(features.shape)
features.head()

(16135561, 11)


Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
1153972,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991
712971,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454
167611,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454
436147,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454
1423207,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454


In [7]:
features.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bookingID,16135561.0,818481900000.0,495247600000.0,0.0,377957100000.0,807453900000.0,1254130000000.0,1709397000000.0
Accuracy,16135561.0,11.61035,86.91672,0.75,3.9,4.25,8.0,6070.101
Bearing,16135561.0,168.9684,107.2966,0.0,78.0,168.87,262.984,359.9995
acceleration_x,16135561.0,0.06914529,1.424161,-78.419685,-0.5081238,0.06137085,0.635062,66.87346
acceleration_y,16135561.0,4.468976,8.130664,-72.994119,-2.022476,9.081485,9.709778,75.05589
acceleration_z,16135561.0,0.8942974,3.251549,-78.44842,-0.9316101,0.775745,2.750938,78.05576
gyro_x,16135561.0,-0.001710847,0.1445619,-48.45575,-0.02678888,-0.0006432411,0.02330639,39.83975
gyro_y,16135561.0,0.0002800733,0.340063,-74.88861,-0.02993851,0.0002645046,0.03143085,80.31496
gyro_z,16135561.0,-0.0002569688,0.1480824,-53.55445,-0.018765,-3.542004e-05,0.01823425,66.30078
second,16135561.0,3803.484,1436686.0,0.0,241.0,520.0,863.0,1495797000.0


In [8]:
# For each feature, generate aggregated column with 'mean', 'min', 'max', 'std', 25th percentile, median and 75th percentile

def percentile25(x):
    return x.quantile(0.25)

def percentile50(x):
    return x.median()

def percentile75(x):
    return x.quantile(0.75)

aggregate_functions = ['mean', 'min', 'max', 'std', percentile25, percentile50, percentile75]
agg_columns_excluded = ['bookingID', 'second']
agg_dict = {c: aggregate_functions for c in features.columns if c not in agg_columns_excluded}
agg_dict['second'] = ['max']

In [10]:
features_agg = features.groupby(['bookingID'], as_index=True).agg(agg_dict)
features_agg.columns = features_agg.columns.map('_'.join)
features_agg = features_agg.reset_index(drop=False)

In [23]:
features_agg.shape

(20000, 65)

In [62]:
labels_no_duplicate = labels.drop_duplicates(subset='bookingID')

In [65]:
features_agg = pd.merge(features_agg, labels_no_duplicate, how='left', on='bookingID')

In [67]:
features_agg.head(10)

Unnamed: 0,bookingID,Accuracy_mean,Accuracy_min,Accuracy_max,Accuracy_std,Accuracy_percentile25,Accuracy_percentile50,Accuracy_percentile75,Bearing_mean,Bearing_min,...,gyro_z_percentile75,Speed_mean,Speed_min,Speed_max,Speed_std,Speed_percentile25,Speed_percentile50,Speed_percentile75,second_max,label
0,0,10.165339,4.0,48.0,3.855898,8.0,8.0,12.0,176.526099,0.037464,...,0.020893,8.994822,-1.0,22.946083,7.199919,1.490348,8.503366,15.645498,1589.0,0
1,1,3.718763,3.0,7.709,0.597933,3.0,3.9,4.0,124.19859,0.0,...,0.013334,7.881588,-1.0,21.882141,7.059362,0.0,6.904588,13.747929,1034.0,1
2,2,3.930626,3.0,8.0,1.117354,3.0,3.634,4.0,173.794872,1.0,...,0.013371,3.157213,0.0,9.360483,2.897762,0.0,2.998761,5.299983,825.0,1
3,4,10.0,10.0,10.0,0.0,10.0,10.0,10.0,151.807013,2.271227,...,0.031477,6.150996,0.0,19.780001,5.595901,2.19,3.31,10.2225,1094.0,1
4,6,4.586721,3.0,12.0,1.329545,3.9,4.004,4.9385,197.812785,0.0,...,0.022755,4.628921,0.0,16.394695,5.314844,0.0,1.936962,9.21706,1094.0,0
5,7,3.681034,3.0,3.9,0.377849,3.626,3.9,3.9,101.562698,0.0,...,0.048876,12.176386,0.0,25.230654,8.680455,3.545921,13.017325,20.886913,959.0,0
6,8,7.008253,3.0,18.204,3.153024,4.551,6.068,9.102,172.803618,0.0,...,0.007507,5.351266,-1.0,18.27,5.661732,0.0,3.5,10.64,462.0,0
7,10,3.815,3.0,9.0,0.846416,3.017,3.9,3.9,120.605333,0.0,...,0.010696,8.702027,0.0,20.05,7.002632,0.0,9.58,15.445,374.0,0
8,11,4.22236,3.0,8.0,1.049047,3.28,3.9,4.91,140.8,6.0,...,0.034948,6.659024,0.0,17.876741,6.019429,0.429369,5.192059,12.496186,299.0,0
9,13,11.157522,3.0,1251.564,67.183017,3.9,6.0,8.0,212.998371,0.0,...,0.022099,15.521918,-1.0,26.152094,9.09648,7.237872,20.151251,23.439113,1379.0,1


In [108]:
feature_columns = [c for c in features_agg.columns.values if c not in ['bookingID', 'label']]
label_column = 'label'

In [119]:
NUM_SPLITS = 10
splits = StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True).split(features_agg[feature_columns], features_agg[label_column])
out_of_fold_predictions = np.zeros(len(features_agg))

In [118]:
a = np.array([0.3, 0.5, 0.6])
(a >= 0.5).astype(int)

array([0, 1, 1])

In [120]:
for i, (train_idx, valid_idx) in enumerate(splits):
    cf = RandomForestRegressor(n_estimators=1000)
    train_X = features_agg[feature_columns].iloc[train_idx]
    train_y = features_agg[label_column].iloc[train_idx]
    val_X = features_agg[feature_columns].iloc[valid_idx]
    val_y = features_agg[label_column].iloc[valid_idx]
    
    cf = RandomForestClassifier()
    cf.fit(train_X, train_y)
    
    train_preds = (cf.predict(train_X) >= 0.5).astype(int)
    val_preds = (cf.predict(val_X) >= 0.5).astype(int)
    
    out_of_fold_predictions[valid_idx] = val_preds
    
    print('Fold:{}\ttrain AUC: {}\tvalid AUC: {}'.format(i + 1, roc_auc_score(train_preds, train_y), roc_auc_score(val_preds, val_y)))

Fold:1	train AUC: 0.985670796137679	valid AUC: 0.6575304381171508
Fold:2	train AUC: 0.9847404173420996	valid AUC: 0.6562292308621837
Fold:3	train AUC: 0.9858742037447124	valid AUC: 0.7025057632234403
Fold:4	train AUC: 0.9845479042851093	valid AUC: 0.6497322728217925
Fold:5	train AUC: 0.9865323553386326	valid AUC: 0.6513529322418958
Fold:6	train AUC: 0.9850736971398492	valid AUC: 0.6915326016410501
Fold:7	train AUC: 0.9860745611889037	valid AUC: 0.6881816424765077
Fold:8	train AUC: 0.9860262472362079	valid AUC: 0.6514717419328083
Fold:9	train AUC: 0.9864718495034597	valid AUC: 0.6753879718899152
Fold:10	train AUC: 0.9855664385913119	valid AUC: 0.6973502257873869


In [122]:
print('out of fold AUC score: ', roc_auc_score(out_of_fold_predictions, features_agg[label_column]))

out of fold AUC score:  0.672476533843384
