In [1]:
%load_ext autoreload
%autoreload 2
import json
import pandas as pd
import numpy as np
from dataloader import load_features_and_labels

# sklearn
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import StratifiedKFold

# private modules
from features.features import RideSafetyFeaturesAggregator
from models.wrappers.lightgbm import LGBWrapper
from models.wrappers.xgboost import XGBWrapper
from models.wrappers.sklearn import SklearnWrapper
from utils.utils import timer, json_to_dict, find_best_threshold_naive


In [2]:
DATA_PATH = 'data/safety'

## Data Fields Description

|      Field      |               Description               |
|:---------------:|:---------------------------------------:|
|    bookingID    |                 trip id                 |
|     Accuracy    |    accuracy inferred by GPS in meters   |
|     Bearing     |          GPS bearing in degree          |
|  acceleration_x |  accelerometer reading at x axis (m/s2) |
|  acceleration_y |  accelerometer reading at y axis (m/s2) |
|  acceleration_z |  accelerometer reading at z axis (m/s2) |
|      gyro_x     |   gyroscope reading in x axis (rad/s)   |
|      gyro_y     |   gyroscope reading in y axis (rad/s)   |
|      gyro_z     |   gyroscope reading in z axis (rad/s)   |
|      second     | time of the record by number of seconds |
|      Speed      |       speed measured by GPS in m/s      |



In [3]:
features, labels = load_features_and_labels('{}/{}'.format(DATA_PATH, 'features'), '{}/{}'.format(DATA_PATH, 'labels'))

loaded feature file:  data/safety/features\part-00000-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/features\part-00001-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/features\part-00002-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/features\part-00003-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/features\part-00004-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/features\part-00005-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/features\part-00006-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/features\part-00007-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/features\part-00008-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
loaded feature file:  data/safety/features\part-00009-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv


In [4]:
features.shape

(16135561, 11)

In [5]:
print(features.shape)
features.head()

(16135561, 11)


Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
1153972,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991
712971,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454
167611,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454
436147,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454
1423207,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454


In [6]:
with timer('Aggregating features: '):
    feature_aggregator = RideSafetyFeaturesAggregator(features)
    features_agg = feature_aggregator.get_aggregated_features()

----Aggregating features:  started
----Aggregating features:  done in 286 seconds


In [7]:
labels_no_duplicate = labels.drop_duplicates(subset='bookingID')
features_agg = pd.merge(features_agg, labels_no_duplicate, how='left', on='bookingID')

In [8]:
features_agg.head().T

Unnamed: 0,0,1,2,3,4
bookingID,0.000000,1.000000,2.000000,4.000000,6.000000
Accuracy_mean,10.165339,3.718763,3.930626,10.000000,4.586721
Accuracy_min,4.000000,3.000000,3.000000,10.000000,3.000000
Accuracy_max,48.000000,7.709000,8.000000,10.000000,12.000000
Accuracy_std,3.855898,0.597933,1.117354,0.000000,1.329545
Accuracy_percentile25,8.000000,3.000000,3.000000,10.000000,3.900000
Accuracy_percentile50,8.000000,3.900000,3.634000,10.000000,4.004000
Accuracy_percentile75,12.000000,4.000000,4.000000,10.000000,4.938500
Bearing_mean,176.526099,124.198590,173.794872,151.807013,197.812785
Bearing_min,0.037464,0.000000,1.000000,2.271227,0.000000


In [9]:
feature_columns = [c for c in features_agg.columns.values if c not in ['bookingID', 'label']]
label_column = 'label'

## Modelling Part


### Ensemble of Ridge Regression on top of stacked LightGBM, XGBoost, RandomForest and ExtraTrees


In [10]:
NUM_SPLITS = 5
X = features_agg[feature_columns].values
y = features_agg[label_column].values


In [11]:
lgbm_params = json_to_dict('models/params/lightgbm.json')

with timer('Training LightGBM'):
    lgb_oof_train = np.zeros((len(X),))

    for i, (train_index, val_index) in enumerate(StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True).split(X, y)):
        print('Training for fold: ', i + 1)

        clf = LGBWrapper(lgbm_params)
        x_tr = X[train_index, :]
        y_tr = y[train_index]
        x_val = X[val_index, :]
        y_val = y[val_index]

        clf.train(x_tr, y_tr, x_val=x_val, y_val=y_val)

        lgb_oof_train[val_index] = clf.predict(x_val)
        clf.save('models/saved_models/lightgbm/{}.pkl'.format(i))

----Training LightGBM started
Training for fold:  1
training with validation dataset...
self param:  {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'rmse', 'max_depth': 2, 'num_leaves': 46, 'feature_fraction': 0.6, 'bagging_fraction': 0.9, 'bagging_freq': 8, 'learning_rate': 0.019, 'verbose': 0}
Training until validation scores don't improve for 2000 rounds.
[1000]	training's rmse: 0.386425	valid_1's rmse: 0.396639
[2000]	training's rmse: 0.378922	valid_1's rmse: 0.396887
[3000]	training's rmse: 0.372448	valid_1's rmse: 0.397797
Early stopping, best iteration is:
[1286]	training's rmse: 0.384081	valid_1's rmse: 0.396508
Training for fold:  2
training with validation dataset...
self param:  {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'rmse', 'max_depth': 2, 'num_leaves': 46, 'feature_fraction': 0.6, 'bagging_fraction': 0.9, 'bagging_freq': 8, 'learning_rate': 0.019, 'verbose': 0}
Training until validation scores don't 

In [12]:
# XGB
xgb_params = json_to_dict('models/params/xgboost.json')
with timer('Training XGBoost'):
    xgb_oof_train = np.zeros((len(X),))

    for i, (train_index, val_index) in enumerate(StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True).split(X, y)):
        print('Training for fold: ', i + 1)

        clf = XGBWrapper(xgb_params)
        x_tr = X[train_index, :]
        y_tr = y[train_index]
        x_val = X[val_index, :]
        y_val = y[val_index]

        clf.train(x_tr, y_tr, x_val=x_val, y_val=y_val)

        xgb_oof_train[val_index] = clf.predict(x_val)
        clf.save('models/saved_models/xgboost/{}.pkl'.format(i))

----Training XGBoost started
Training for fold:  1
training with validation dataset
[0]	train-rmse:0.496142	valid-rmse:0.496132
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 2000 rounds.
[1000]	train-rmse:0.382926	valid-rmse:0.394225
[2000]	train-rmse:0.37214	valid-rmse:0.395229
Stopping. Best iteration:
[912]	train-rmse:0.383999	valid-rmse:0.394145

Training for fold:  2
training with validation dataset
[0]	train-rmse:0.496272	valid-rmse:0.496308
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 2000 rounds.
[1000]	train-rmse:0.381897	valid-rmse:0.398526
[2000]	train-rmse:0.371262	valid-rmse:0.399318
Stopping. Best iteration:
[669]	train-rmse:0.38612	valid-rmse:0.398391

Training for fold:  3
training with validation dataset
[0]	train-rmse:0.496108	valid-rmse:0.496152
Multiple eval metrics have been passed: 'valid-r

In [13]:
# random forest
rf_params = json_to_dict('models/params/rf.json')

with timer('Training random forest'):
    rf_oof_train = np.zeros((len(X),))

    for i, (train_index, val_index) in enumerate(StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True).split(X, y)):
        print('Training for fold: ', i + 1)

        clf = SklearnWrapper(RandomForestRegressor, seed=1337, params=rf_params)
        x_tr = X[train_index, :]
        y_tr = y[train_index]
        x_val = X[val_index, :]
        y_val = y[val_index]

        clf.train(x_tr, y_tr, x_val=x_val, y_val=y_val)

        rf_oof_train[val_index] = clf.predict(x_val)
        clf.save('models/saved_models/rf/{}.pkl'.format(i))

----Training random forest started
Training for fold:  1
Training for fold:  2
Training for fold:  3
Training for fold:  4
Training for fold:  5
----Training random forest done in 12 seconds


In [14]:
rf_clf = clf.clf


In [15]:
# extra trees classifier
et_params = json_to_dict('models/params/et.json')

with timer('Training Extra Trees regressor'):
    et_oof_train = np.zeros((len(X),))

    for i, (train_index, val_index) in enumerate(StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True).split(X, y)):
        print('Training for fold: ', i + 1)

        clf = SklearnWrapper(ExtraTreesRegressor, seed=1337, params=et_params)
        x_tr = X[train_index, :]
        y_tr = y[train_index]
        x_val = X[val_index, :]
        y_val = y[val_index]

        clf.train(x_tr, y_tr, x_val=x_val, y_val=y_val)

        et_oof_train[val_index] = clf.predict(x_val)
        clf.save('models/saved_models/et/{}.pkl'.format(i))

----Training Extra Trees regressor started
Training for fold:  1
Training for fold:  2
Training for fold:  3
Training for fold:  4
Training for fold:  5
----Training Extra Trees regressor done in 3 seconds


In [16]:
# ensembling order: et --> rf --> lgb --> xgb
x_train_ensemble = np.concatenate((et_oof_train.reshape(-1, 1), rf_oof_train.reshape(-1, 1), lgb_oof_train.reshape(-1, 1), xgb_oof_train.reshape(-1, 1)), axis=1)

In [17]:
# Stack, combine and train ridge regressor
ridge_params = json_to_dict('models/params/ridge.json')
ridge = SklearnWrapper(clf=Ridge, seed=1337, params=ridge_params)

In [18]:
with timer('Training final stacked ridge regression'):
    final_oof_train = np.zeros((len(X),))

    for i, (train_index, val_index) in enumerate(StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True).split(x_train_ensemble, y)):
        print('Training for fold: ', i + 1)

        clf = SklearnWrapper(Ridge, seed=1336, params=ridge_params)
        x_tr = x_train_ensemble[train_index, :]
        y_tr = y[train_index]
        x_val = x_train_ensemble[val_index, :]
        y_val = y[val_index]

        clf.train(x_tr, y_tr, x_val=x_val, y_val=y_val)
        
        clf.save('models/saved_models/ridge/{}.pkl'.format(i))
        final_oof_train[val_index] = clf.predict(x_val)

----Training final stacked ridge regression started
Training for fold:  1
Training for fold:  2
Training for fold:  3
Training for fold:  4
Training for fold:  5
----Training final stacked ridge regression done in 0 seconds


In [19]:
final_oof_train[:10]

array([0.49918049, 0.28266139, 0.35426647, 0.22015504, 0.24911391,
       0.19513402, 0.15722006, 0.15925522, 0.10416355, 0.24482504])

In [20]:
best_threshold = find_best_threshold_naive(y, final_oof_train)
final_train_predictions = (final_oof_train >= best_threshold).astype(int)
    
print('out of fold AUC score: ', roc_auc_score(y, final_train_predictions))
print('max thres: ', best_threshold)
general_params = {
    'best_threshold': best_threshold
}


out of fold AUC score:  0.665455987575857
max thres:  0.25


In [21]:
with open('models/params/general.json', 'w') as f:  
    json.dump(general_params, f)

In [22]:
data = []
for (col, imp) in zip(feature_columns, rf_clf.feature_importances_):
    data.append((col, imp))

In [23]:
data = sorted(data, key=lambda tup: tup[1], reverse=True)

In [24]:
for (col, imp) in data:
    print('col: {}, imp: {}'.format(col, imp))


col: second_max, imp: 0.9191272115301357
col: acceleration_z_std, imp: 0.05867889502451782
col: acceleration_x_std, imp: 0.011820964746787451
col: acceleration_y_std, imp: 0.0036416981247743897
col: gyro_z_std, imp: 0.003207331287051644
col: Speed_mean, imp: 0.0013741152562031932
col: Bearing_std, imp: 0.0013528461024069536
col: acceleration_z_max, imp: 0.0007969379281229096
col: Accuracy_mean, imp: 0.0
col: Accuracy_min, imp: 0.0
col: Accuracy_max, imp: 0.0
col: Accuracy_std, imp: 0.0
col: Accuracy_percentile25, imp: 0.0
col: Accuracy_percentile50, imp: 0.0
col: Accuracy_percentile75, imp: 0.0
col: Bearing_mean, imp: 0.0
col: Bearing_min, imp: 0.0
col: Bearing_max, imp: 0.0
col: Bearing_percentile25, imp: 0.0
col: Bearing_percentile50, imp: 0.0
col: Bearing_percentile75, imp: 0.0
col: acceleration_x_mean, imp: 0.0
col: acceleration_x_min, imp: 0.0
col: acceleration_x_max, imp: 0.0
col: acceleration_x_percentile25, imp: 0.0
col: acceleration_x_percentile50, imp: 0.0
col: acceleration_x