In [37]:
import pandas as pd
%matplotlib inline
import numpy as np
from dataloader import load_ride_safety_train

# sklearn
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge

from features.features import RideSafetyFeaturesAggregator
from models.lightgbm import LGBWrapper
from models.xgboost import XGBWrapper
from models.sklearn import SklearnWrapper
from utils import timer

from models.helpers import train_cv, train_cv_predict_test


In [16]:
DATA_PATH = 'data/safety/safety'


## Data Fields Description

|      Field      |               Description               |
|:---------------:|:---------------------------------------:|
|    bookingID    |                 trip id                 |
|     Accuracy    |    accuracy inferred by GPS in meters   |
|     Bearing     |          GPS bearing in degree          |
|  acceleration_x |  accelerometer reading at x axis (m/s2) |
|  acceleration_y |  accelerometer reading at y axis (m/s2) |
|  acceleration_z |  accelerometer reading at z axis (m/s2) |
|      gyro_x     |   gyroscope reading in x axis (rad/s)   |
|      gyro_y     |   gyroscope reading in y axis (rad/s)   |
|      gyro_z     |   gyroscope reading in z axis (rad/s)   |
|      second     | time of the record by number of seconds |
|      Speed      |       speed measured by GPS in m/s      |



In [17]:
features, labels = load_ride_safety_train('{}/{}'.format(DATA_PATH, 'features'), '{}/{}'.format(DATA_PATH, 'labels'))


loaded feature file:  data/safety/safety/features\part-00000-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/safety/features\part-00001-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/safety/features\part-00002-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/safety/features\part-00003-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/safety/features\part-00004-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/safety/features\part-00005-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/safety/features\part-00006-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/safety/features\part-00007-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/safety/features\part-00008-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/saf

In [20]:
features.shape


(16135561, 11)

In [22]:
print(features.shape)
features.head()


(16135561, 11)


Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
1153972,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991
712971,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454
167611,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454
436147,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454
1423207,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454


In [24]:
with timer('Aggregating features: '):
    feature_aggregator = RideSafetyFeaturesAggregator(features)
    features_agg = feature_aggregator.get_aggregated_features()


----Aggregating features:  started
----Aggregating features:  done in 270 seconds


In [30]:
labels_no_duplicate = labels.drop_duplicates(subset='bookingID')
features_agg = pd.merge(features_agg, labels_no_duplicate, how='left', on='bookingID')


In [32]:
features_agg = features_agg.drop(['label_y'], axis=1)


In [27]:
feature_columns = [c for c in features_agg.columns.values if c not in ['bookingID', 'label']]
label_column = 'label'


In [31]:
features_agg.head().T


Unnamed: 0,0,1,2,3,4
bookingID,0.000000,1.000000,2.000000,4.000000,6.000000
Accuracy_mean,10.165339,3.718763,3.930626,10.000000,4.586721
Accuracy_min,4.000000,3.000000,3.000000,10.000000,3.000000
Accuracy_max,48.000000,7.709000,8.000000,10.000000,12.000000
Accuracy_std,3.855898,0.597933,1.117354,0.000000,1.329545
Accuracy_percentile25,8.000000,3.000000,3.000000,10.000000,3.900000
Accuracy_percentile50,8.000000,3.900000,3.634000,10.000000,4.004000
Accuracy_percentile75,12.000000,4.000000,4.000000,10.000000,4.938500
Bearing_mean,176.526099,124.198590,173.794872,151.807013,197.812785
Bearing_min,0.037464,0.000000,1.000000,2.271227,0.000000


## Modelling Part


### Ensemble of ridge regression on top of stacked LightGBM, XGBoost, RandomForest and ExtraTrees


In [38]:
NUM_SPLITS = 10


In [39]:
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'nrounds': 50000,
    'early_stop_rounds': 2000,
    # trainable params
    'max_depth': 4,
    'num_leaves': 46,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.9,
    'bagging_freq': 8,
    'learning_rate': 0.019,
    'verbose': 0
}
lgb_wrapper = LGBWrapper(lgbm_params)

with timer('Training LightGBM'):
    lgb_oof_train = train_cv(lgb_wrapper, num_splits=NUM_SPLITS, X=features_agg[feature_columns], y=features_agg[label_column])


----Training LightGBM started
Training for fold:  1
Training until validation scores don't improve for 2000 rounds.
[1000]	training's rmse: 0.356393	valid_1's rmse: 0.397818
[2000]	training's rmse: 0.324814	valid_1's rmse: 0.400048
Early stopping, best iteration is:
[416]	training's rmse: 0.37694	valid_1's rmse: 0.396919
Training for fold:  2
Training until validation scores don't improve for 2000 rounds.
[1000]	training's rmse: 0.355517	valid_1's rmse: 0.398797
[2000]	training's rmse: 0.325105	valid_1's rmse: 0.399669
Early stopping, best iteration is:
[884]	training's rmse: 0.359499	valid_1's rmse: 0.398554
Training for fold:  3
Training until validation scores don't improve for 2000 rounds.
[1000]	training's rmse: 0.354959	valid_1's rmse: 0.402171
[2000]	training's rmse: 0.324645	valid_1's rmse: 0.405086
Early stopping, best iteration is:
[280]	training's rmse: 0.382283	valid_1's rmse: 0.399594
Training for fold:  4
Training until validation scores don't improve for 2000 rounds.
[10

In [40]:
# XGB
xgb_params = {
    'eval_metric': 'rmse',
    'device': 'cpu',
    'silent': 1,
    'seed': 1337,
    'nrounds': 60000,
    'early_stop_rounds': 2000,
    # trainable params
    'eta': 0.025,
    'subsample': 0.8,
    'colsample_bytree': 0.6000000000000001,
    'gamma': 0.65,
    'max_depth': 5,
    'min_child_weight': 5.0,
    'n_estimators': 500,
}

xgbWrapper = XGBWrapper(xgb_params)

with timer('Training Xgboost'):
    xgb_oof_train = train_cv(xgbWrapper, num_splits=NUM_SPLITS, X=features_agg[feature_columns], y=features_agg[label_column])


----Training Xgboost started
Training for fold:  1
[0]	train-rmse:0.495986	valid-rmse:0.496144
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 2000 rounds.
[1000]	train-rmse:0.310271	valid-rmse:0.399176
[2000]	train-rmse:0.264811	valid-rmse:0.401426
Stopping. Best iteration:
[240]	train-rmse:0.366753	valid-rmse:0.397454

Training for fold:  2
[0]	train-rmse:0.496019	valid-rmse:0.496043
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 2000 rounds.
[1000]	train-rmse:0.311868	valid-rmse:0.39332
[2000]	train-rmse:0.26583	valid-rmse:0.395369
Stopping. Best iteration:
[349]	train-rmse:0.357771	valid-rmse:0.391815

Training for fold:  3
[0]	train-rmse:0.496045	valid-rmse:0.496229
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 2000 rounds.


  if getattr(data, 'base', None) is not None and \


In [41]:
# random forest
rf_params = {
    'n_jobs': -1,
    'n_estimators': 150,
    'max_features': 'auto',
    'min_samples_leaf': 2,
}

random_forest_clf = SklearnWrapper(clf=RandomForestRegressor, seed=1337, params=rf_params)

with timer('Training random forest'):
    rf_oof_train = train_cv(random_forest_clf, num_splits=NUM_SPLITS, X=features_agg[feature_columns], y=features_agg[label_column])


----Training random forest started
Training for fold:  1
Training for fold:  2
Training for fold:  3
Training for fold:  4
Training for fold:  5
Training for fold:  6
Training for fold:  7
Training for fold:  8
Training for fold:  9
Training for fold:  10
----Training random forest done in 168 seconds


In [42]:
# extra trees classifier
et_params = {
    'n_jobs': -1,
    'n_estimators': 150,
    'max_features': 'auto',
    'max_depth': 5,
    'min_samples_leaf': 2,
}
et = SklearnWrapper(clf=ExtraTreesRegressor, seed=1220, params=et_params)

with timer('Training Extra trees'):
    et_oof_train = train_cv(et, num_splits=NUM_SPLITS, X=features_agg[feature_columns], y=features_agg[label_column])


----Training Extra trees started
Training for fold:  1
Training for fold:  2
Training for fold:  3
Training for fold:  4
Training for fold:  5
Training for fold:  6
Training for fold:  7
Training for fold:  8
Training for fold:  9
Training for fold:  10
----Training Extra trees done in 8 seconds


In [43]:
x_train_ensemble = pd.DataFrame(np.concatenate((et_oof_train, rf_oof_train, lgb_oof_train, xgb_oof_train), axis=1))


In [44]:
# Stack, combine and train ridge regressor
ridge_params = {
    'alpha':50.0, 
    'fit_intercept':True, 
    'normalize':False, 
    'copy_X':True,
    'max_iter':None, 
    'tol':0.001, 
    'solver':'auto', 
    'random_state':1337
}
ridge = SklearnWrapper(clf=Ridge, seed=1337, params=ridge_params)

with timer('Training stacked ridge regression'):
    final_oof_train, final_oof_test = train_cv_predict_test(ridge, num_splits=NUM_SPLITS, X=x_train_ensemble, y=features_agg[label_column], X_test=x_train_ensemble)


----Training stacked ridge regression started
----Training stacked ridge regression done in 0 seconds


In [48]:
final_oof_train[:10]


array([[0.52103183],
       [0.25830568],
       [0.35511164],
       [0.21800488],
       [0.20140592],
       [0.18011301],
       [0.16558963],
       [0.09473403],
       [0.10493071],
       [0.1916465 ]])

In [47]:
max_auc = 0
max_thres = 0.1
for thres in np.linspace(0.1, 0.99, 89):
    binarized_oof = (final_oof_train >= thres).astype(int)
    if roc_auc_score(features_agg[label_column].values, binarized_oof) > max_auc:
        max_auc = roc_auc_score(features_agg[label_column].values, binarized_oof)
        max_thres = thres
    
print('out of fold AUC score: ', max_auc)
print('max thres: ', max_thres)



out of fold AUC score:  0.6641729111994971
max thres:  0.2415909090909091
