In [185]:
import pandas as pd
%matplotlib inline
import glob
import os
import numpy as np
import gc

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [2]:
DATA_PATH = 'data/safety/safety'

## Data Fields Description

|      Field      |               Description               |
|:---------------:|:---------------------------------------:|
|    bookingID    |                 trip id                 |
|     Accuracy    |    accuracy inferred by GPS in meters   |
|     Bearing     |          GPS bearing in degree          |
|  acceleration_x |  accelerometer reading at x axis (m/s2) |
|  acceleration_y |  accelerometer reading at y axis (m/s2) |
|  acceleration_z |  accelerometer reading at z axis (m/s2) |
|      gyro_x     |   gyroscope reading in x axis (rad/s)   |
|      gyro_y     |   gyroscope reading in y axis (rad/s)   |
|      gyro_z     |   gyroscope reading in z axis (rad/s)   |
|      second     | time of the record by number of seconds |
|      Speed      |       speed measured by GPS in m/s      |



In [3]:
# load labels
labels_path = '{}/labels'.format(DATA_PATH)
labels = pd.read_csv(glob.glob('{}/*.csv'.format(labels_path))[0])
labels = labels.sort_values(by='bookingID')

In [4]:
print(labels.shape)
labels.head()

(20018, 2)


Unnamed: 0,bookingID,label
15035,0,0
13312,1,1
996,2,1
2328,4,1
5192,6,0


In [5]:
features_path = '{}/features'.format(DATA_PATH)
features = pd.DataFrame()
for f in glob.glob('{}/*.csv'.format(features_path)):
    print('loading feature: ', f)
    temp = pd.read_csv(f)
    features = pd.concat([features, temp], axis=0)
features = features.sort_values(by=['bookingID', 'second'])

loading feature:  data/safety/safety/features\part-00000-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00001-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00002-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00003-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00004-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00005-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00006-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00007-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00008-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loading feature:  data/safety/safety/features\part-00009-e6120af0-10c2-42

In [6]:
print(features.shape)
features.head()

(16135561, 11)


Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
1153972,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991
712971,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454
167611,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454
436147,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454
1423207,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454


In [375]:
def get_stopping_statistics(df):
    # gets every vehicle stop in a trip and returns its start_time, end_time and diff
    
    # make sure all runs of ones are well-bounded
    bounded = np.hstack(([1], df.Speed.values, [1]))

    log = (bounded < 0.5) * 1
    
    # get 1 at run starts and -1 at run ends
    diffs = np.diff(log)    
  
    # get indices if starts and ends
    run_starts = np.where(diffs > 0)[0]
    run_ends = np.where(diffs < 0)[0]
    
    interval = 7
    end_stops = np.array([run_starts,run_ends,run_ends-run_starts]).T
    end_stops = end_stops.astype(int)[:-1,1]
    end_stops = end_stops[end_stops + interval < len(test.Speed.values) - 1]  
    
    n_stops = len(end_stops)
    
    if n_stops > 1:
        hit = np.zeros(shape=(1 ,n_stops))
        for i in range(n_stops):
            # slope at acceleration    
            start = end_stops[i]
            hit[0, i] =  np.diff([test.Speed.values[start], test.Speed.values[start + interval]])
    else:
        hit = np.array([0])
  
    return [n_stops, hit.mean(), hit.max(), hit.std()]

def get_naive_distance(df):
    return (((df['second'].shift(-1) - df['second'])).fillna(0) * df['Speed']).sum()

def get_other_features(df):
    n_stops, hit_mean, hit_max, hit_std = get_stopping_statistics(df)
    naive_dist = get_naive_distance(df)
    
    d = {
        'n_stops': n_stops,
        'hit_mean': hit_mean,
        'hit_max': hit_max,
        'hit_std': hit_std,
        'naive_distance': naive_dist,
    }
    return pd.Series(d, index=['n_stops', 'hit_mean', 'hit_max', 'hit_std', 'naive_distance']) 

In [378]:
def percentile25(x):
    return x.quantile(0.25)

def percentile50(x):
    return x.median()

def percentile75(x):
    return x.quantile(0.75)

aggregate_functions = ['mean', 'min', 'max', 'std', percentile25, percentile50, percentile75]
agg_columns_excluded = ['bookingID', 'second']
agg_dict = {c: aggregate_functions for c in features.columns if c not in agg_columns_excluded}
agg_dict['second'] = ['max']

other_features = features.groupby('bookingID', as_index=True).apply(get_other_features)
other_features = other_features.reset_index()

In [379]:
features_agg = features.groupby(['bookingID'], as_index=True).agg(agg_dict)
features_agg.columns = features_agg.columns.map('_'.join)
features_agg = features_agg.reset_index(drop=False)

features_agg = pd.merge(features_agg, other_features, how='left', on='bookingID')
labels_no_duplicate = labels.drop_duplicates(subset='bookingID')
features_agg = pd.merge(features_agg, labels_no_duplicate, how='left', on='bookingID')

In [380]:
feature_columns = [c for c in features_agg.columns.values if c not in ['bookingID', 'label']]
label_column = 'label'

In [396]:
NUM_SPLITS = 10
splits = StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True).split(features_agg[feature_columns], features_agg[label_column])
out_of_fold_predictions = np.zeros(len(features_agg))

In [397]:
for i, (train_idx, valid_idx) in enumerate(splits):
    train_X = features_agg[feature_columns].iloc[train_idx]
    train_y = features_agg[label_column].iloc[train_idx]
    val_X = features_agg[feature_columns].iloc[valid_idx]
    val_y = features_agg[label_column].iloc[valid_idx]
    
    cf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
    cf.fit(train_X, train_y)
    
    train_preds = cf.predict(train_X)
    val_preds = cf.predict(val_X)
    
    out_of_fold_predictions[valid_idx] = val_preds
    
    print('Fold:{}\ttrain AUC: {}\tvalid AUC: {}'.format(i + 1, roc_auc_score(train_preds, train_y), roc_auc_score(val_preds, val_y)))

Fold:1	train AUC: 1.0	valid AUC: 0.7628582732547916
Fold:2	train AUC: 1.0	valid AUC: 0.7562793650793651
Fold:3	train AUC: 1.0	valid AUC: 0.7707392331614262
Fold:4	train AUC: 1.0	valid AUC: 0.7495959144370314
Fold:5	train AUC: 1.0	valid AUC: 0.7436805754084165
Fold:6	train AUC: 1.0	valid AUC: 0.7290522732264441
Fold:7	train AUC: 1.0	valid AUC: 0.7816552250190695
Fold:8	train AUC: 1.0	valid AUC: 0.792783052169577
Fold:9	train AUC: 1.0	valid AUC: 0.770313867016623
Fold:10	train AUC: 1.0	valid AUC: 0.7941111020282116


In [398]:
print('out of fold AUC score: ', roc_auc_score(out_of_fold_predictions, features_agg[label_column]))

out of fold AUC score:  0.7644650842793258


In [399]:
importances = []
for col, importance in zip(feature_columns, cf.feature_importances_):
    importances.append((col, importance))
importances = sorted(importances, key=lambda tup: tup[1], reverse=True)

In [400]:
print('Feature importances: ')
for col, imp in importances:
    print('{}: {}'.format(col, imp))

Feature importances: 
second_max: 0.06627653612093
naive_distance: 0.025802470541310088
Speed_max: 0.021090629460040703
acceleration_z_std: 0.020411697531842674
Bearing_std: 0.019350405025617873
Speed_percentile50: 0.01829747550873446
Speed_mean: 0.018124527157413146
acceleration_x_std: 0.01787351473875824
Speed_std: 0.017235378148340547
Speed_percentile75: 0.01710184609422914
acceleration_x_max: 0.016886161959808292
gyro_y_max: 0.016759439734397908
acceleration_y_std: 0.016615894169450723
acceleration_x_min: 0.016113836058687478
gyro_y_min: 0.01604753187642035
acceleration_z_max: 0.015602162649182707
gyro_x_std: 0.015331311228906753
gyro_z_max: 0.015272940405678316
gyro_x_max: 0.01503813648381193
gyro_x_min: 0.015032923175026199
gyro_z_std: 0.014978218691825855
acceleration_z_min: 0.014558327488749434
gyro_z_min: 0.01446102131847175
acceleration_y_min: 0.014304428517580289
gyro_y_percentile50: 0.014119476988505766
acceleration_y_max: 0.014082401793579445
gyro_z_percentile75: 0.0139277