### Reminder
#### Notes before modelling:
Model 1) 

Columns — scaled: passenger_count, trip_distance, PUSize, DOSize; one-hot encoded: trip_type

Model 2) 

Columns — scaled: passenger_count, trip_distance, PUSize, DOSize, PULocationID (optional), DOLocationID (optional), speed; one-hot encoded: trip_type, season, PUBorough, DOBorough

In [8]:
import pandas as pd
import numpy as np
import os
os.chdir('E:/R_files/yellowcab_analysis/src')
from toolkit.etl_toolkit import ingest_data, preprocess_data, engineering_toolkit
from toolkit.analysis_toolkit import evaluate_xgb, mape
from generators.drivetime_generator import batch_generator, drivetime_data_generator
from models.drivetime_model import generate_drivetime_model

### Drivetime model section

XGBoost.
Columns — scaled: passenger_count, trip_distance, PUSize, DOSize; one-hot encoded: trip_type

In [None]:
# Taxi zone lookup table, downloaded as CSV from the nyc-tlc S3 bucket.
# It is passed to preprocess_data, engineering_toolkit and
# drivetime_data_generator further down.
ZONE_LOOKUP_URL = 'https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv'
zone_lookup = pd.read_csv(ZONE_LOOKUP_URL)

In [None]:
# Trip-record columns handed to preprocess_data as the set to keep,
# grouped here by theme: locations, timestamps, trip facts, payment.
columns_to_keep = [
    'PULocationID', 'DOLocationID',
    'tpep_pickup_datetime', 'tpep_dropoff_datetime',
    'passenger_count', 'trip_distance',
    'payment_type', 'fare_amount', 'extra', 'tip_amount',
]

# Columns handed to preprocess_data as `cat_columns`
# (presumably the ones treated as categorical — confirm in etl_toolkit).
cat_columns = ['payment_type']

# Load a single fixed-size batch of the June 2019 trip records and preprocess it.
batch_size = 150000  # number of rows wanted in the batch
ind = 0              # zero-based index of the batch to load

# `.loc` slices by label and includes BOTH endpoints, so the stop label has to
# be (ind + 1) * batch_size - 1; using (ind + 1) * batch_size would select
# batch_size + 1 labels and make consecutive batches share a boundary row.
# NOTE(review): this assumes ingest_data returns a frame with a unique,
# increasing integer index. If the index restarts per partition, the slice
# matches rows in every partition — confirm against ingest_data.
batch_start = ind * batch_size
batch_stop = (ind + 1) * batch_size - 1
raw_batch = ingest_data('2019', '06').loc[batch_start:batch_stop, :].compute()

yellow_06_19 = preprocess_data(raw_batch, zone_lookup, columns_to_keep, cat_columns)

In [None]:
# Sanity check: show how many records the preprocessed batch contains.
len(yellow_06_19)

In [None]:
# Re-bind yellow_06_19 to the output of engineering_toolkit for the
# 'borough_size' and 'trip_type' feature groups.
# NOTE(review): presumably this adds the PUSize / DOSize / trip_type columns
# selected in the next cell — confirm in etl_toolkit.
# The cell overwrites its own input, so re-running it feeds already-engineered
# data back into the helper.
yellow_06_19 = engineering_toolkit(yellow_06_19, ['borough_size', 'trip_type'], zone_lookup)

In [None]:
# Narrow the frame to the six columns used by the drivetime model section.
# NOTE(review): 'drivetime' is presumably the regression target — confirm in
# generate_drivetime_model.
drivetime_model_columns = [
    'passenger_count',
    'trip_distance',
    'PUSize',
    'DOSize',
    'trip_type',
    'drivetime',
]
yellow_06_19 = yellow_06_19[drivetime_model_columns]

In [None]:
# Baseline fit on the whole batch: no explicit `params`, no hyper-parameter
# search, no incremental learning, RMSE as metric, early stopping after 100 rounds.
quick_model_options = {
    'eval_metric': 'rmse',
    'incremental_learning': False,
    'grid_search': False,
    'early_stopping_rounds': 100,
}
quick_model, evals_result = generate_drivetime_model(batch=yellow_06_19, **quick_model_options)

In [None]:
# Build a train/test split from the first 10,000 rows of the batch; the split
# is reused by the later MAPE cells of this section.
# NOTE(review): quick_model was fitted on the whole of yellow_06_19, so this
# split may contain rows the model was trained on — confirm how
# generate_drivetime_model splits internally.
(X_train, y_train), (X_test, y_test) = batch_generator(yellow_06_19.head(10000))

# Additional metric - MAPE on the training part
train_predictions = quick_model.predict(xgb.DMatrix(X_train, y_train))
train_mape = mape(train_predictions, y_train)
print(f'Training set. Mean absolute percentage error: {train_mape}')

# Additional metric - MAPE on the held-out part
test_predictions = quick_model.predict(xgb.DMatrix(X_test, y_test))
test_mape = mape(test_predictions, y_test)
print(f'Validation set. Mean absolute percentage error: {test_mape}')

In [None]:
# Give the booster readable feature names, then plot feature importance.
# NOTE(review): 'day', 'rush_hour' and 'night' are presumably the one-hot
# levels of trip_type; confirm the order matches the training matrix.
importance_feature_names = [
    'passenger_count',
    'trip_distance',
    'PUSize',
    'DOSize',
    'day',
    'rush_hour',
    'night',
]
quick_model.feature_names = importance_feature_names
xgb.plot_importance(quick_model)

In [None]:
# Inspect the metric history in evals_result; when cells run top to bottom
# this is the history returned together with quick_model.
# NOTE(review): evaluate_xgb is a project helper, presumably plotting the
# train/eval curves — confirm in analysis_toolkit.
evaluate_xgb(evals_result)

In [None]:
# Same history, narrowed through the helper's second and third arguments.
# NOTE(review): (100, 800) is presumably a window of boosting rounds and
# 'train' the series to show — confirm against evaluate_xgb.
quick_zoom_window = (100, 800)
evaluate_xgb(evals_result, quick_zoom_window, 'train')

In [None]:
# Same setup as the baseline fit, but with early stopping relaxed to 2000
# rounds; evals_result is overwritten with this run's history.
prolonged_model_options = {
    'eval_metric': 'rmse',
    'incremental_learning': False,
    'early_stopping_rounds': 2000,
    'grid_search': False,
}
prolonged_model, evals_result = generate_drivetime_model(batch=yellow_06_19, **prolonged_model_options)

In [None]:
# Additional metric - MAPE of prolonged_model.
# Relies on X_train / y_train / X_test / y_test created by the earlier
# batch_generator cell.
prolonged_train_predictions = prolonged_model.predict(xgb.DMatrix(X_train, y_train))
prolonged_train_mape = mape(prolonged_train_predictions, y_train)
print(f'Training set. Mean absolute percentage error: {prolonged_train_mape}')

prolonged_test_predictions = prolonged_model.predict(xgb.DMatrix(X_test, y_test))
prolonged_test_mape = mape(prolonged_test_predictions, y_test)
print(f'Validation set. Mean absolute percentage error: {prolonged_test_mape}')

In [None]:
# Inspect the prolonged run's history through the helper's window arguments.
# NOTE(review): (900, 1200) is presumably a window of boosting rounds and
# 'train' the series to show — confirm against evaluate_xgb.
prolonged_zoom_window = (900, 1200)
evaluate_xgb(evals_result, prolonged_zoom_window, 'train')

In [None]:
# Define parameter space
# Hyperopt search space handed to generate_drivetime_model(grid_search=True).
# Continuous parameters use hp.uniform; 'n_round' and 'max_depth' use
# hp.quniform wrapped in scope.int.
param_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    n_round=scope.int(hp.quniform('n_round', 200, 3000, 100)),
    max_depth=scope.int(hp.quniform('max_depth', 5, 16, 1)),
    gamma=hp.uniform('gamma', 0, 10),
    min_child_weight=hp.uniform('min_child_weight', 0, 10),
    subsample=hp.uniform('subsample', 0.1, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.1, 1),
)

In [None]:
# Run the hyper-parameter search over param_space with 1000 trials.
# With grid_search=True the helper's two return values are bound to the best
# parameters and the trials object used by the cells below.
search_options = {
    'incremental_learning': False,
    'grid_search': True,
    'param_space': param_space,
    'n_trials': 1000,
}
optimal_hp, trials = generate_drivetime_model(batch=yellow_06_19, **search_options)

In [None]:
# Split the best hyper-parameters from the search into booster parameters
# (params_) and the number of boosting rounds (n_rounds_best).
#
# Work on a copy of optimal_hp: aliasing it and deleting 'n_round' in place
# mutated the search result, so re-running this cell on its own raised
# KeyError on optimal_hp['n_round'].
params_ = dict(optimal_hp)
params_['max_depth'] = int(params_['max_depth'])  # search result may hold it as a float
n_rounds_best = int(params_.pop('n_round'))       # not kept in params_ at this point
print(params_)
print(n_rounds_best)

In [None]:
# Scatter plot of the loss recorded for each hyperopt trial, against the trial id.
fig, ax = plt.subplots(1)

trial_ids = []
trial_losses = []
for trial in trials.trials:
    trial_ids.append(trial['tid'])
    trial_losses.append(trial['result']['loss'])

ax.scatter(trial_ids, trial_losses, s=20, linewidth=0.01, alpha=0.75)
ax.set_title('loss over time', fontsize=18)
ax.set_xlabel('trials', fontsize=16)
ax.set_ylabel('loss', fontsize=16)

In [None]:
# Refit with the tuned hyper-parameters.
# `params` is deliberately the same dict object as `params_` (an alias, not a
# copy): the three extra keys are written into that shared dict, and the bulk
# section further down reads it through the name `params`.
params_.update(verbosity=0, eval_metric='rmse', n_round=n_rounds_best)
params = params_

tuned_model_options = {
    'grid_search': False,
    'early_stopping_rounds': 10,
    'params': params_,
    'incremental_learning': False,
}
model, evals_result = generate_drivetime_model(batch=yellow_06_19, **tuned_model_options)

In [None]:
# Additional metric - MAPE of the tuned model.
# Relies on X_train / y_train / X_test / y_test created by the earlier
# batch_generator cell.
tuned_train_predictions = model.predict(xgb.DMatrix(X_train, y_train))
tuned_train_mape = mape(tuned_train_predictions, y_train)
print(f'Training set. Mean absolute percentage error: {tuned_train_mape}')

tuned_test_predictions = model.predict(xgb.DMatrix(X_test, y_test))
tuned_test_mape = mape(tuned_test_predictions, y_test)
print(f'Validation set. Mean absolute percentage error: {tuned_test_mape}')

In [None]:
# Inspect the metric history in evals_result; when cells run top to bottom
# this is the history returned together with the tuned `model`.
# NOTE(review): evaluate_xgb is a project helper, presumably plotting the
# train/eval curves — confirm in analysis_toolkit.
evaluate_xgb(evals_result)

In [None]:
# Inspect the tuned run's history through the helper's window arguments.
# NOTE(review): (45, 160) is presumably a window of boosting rounds and
# 'train' the series to show — confirm against evaluate_xgb.
tuned_zoom_window = (45, 160)
evaluate_xgb(evals_result, tuned_zoom_window, 'train')

### Bulk model section

In [None]:
# Trip-record columns passed to drivetime_data_generator in the bulk section,
# grouped here by theme: locations, timestamps, trip facts, payment.
columns_to_keep = [
    'PULocationID', 'DOLocationID',
    'tpep_pickup_datetime', 'tpep_dropoff_datetime',
    'passenger_count', 'trip_distance',
    'payment_type', 'fare_amount', 'extra', 'tip_amount',
]

# Columns passed to drivetime_data_generator as `cat_columns`
# (presumably the ones treated as categorical — confirm in the generator).
cat_columns = ['payment_type']

In [None]:
from tqdm import tqdm

# Fit the drivetime model incrementally over every month of 2019.
# Each month is loaded in full, cut into shuffled batches of 10,000 rows by
# drivetime_data_generator, and every batch is passed to
# generate_drivetime_model with incremental_learning=True.
# NOTE(review): the previous model is not passed back into the helper, so the
# incremental state is presumably kept inside generate_drivetime_model — confirm.
# NOTE(review): shuffle=True with no visible seed, so runs may not be
# reproducible — confirm in drivetime_data_generator.

model = None      # re-bound on every batch; None only if no batch is processed
train_loss = []   # best (minimum) training RMSE of each batch
eval_loss = []    # best (minimum) evaluation RMSE of each batch

for month in ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']:
    fetching_table = ingest_data('2019', month).compute()
    gen = drivetime_data_generator(columns_to_keep,
                                   cat_columns,
                                   fetching_table,
                                   zone_lookup,
                                   batch_size = 10000,
                                   shuffle = True
                )
    # One progress bar per month; a single range replaces the redundant
    # zip(tqdm(range(n)), range(n)) and gen[ind] replaces the dunder call.
    for ind in tqdm(range(len(gen))):
        batch = gen[ind]
        model, evals_result = generate_drivetime_model(batch = batch,
                                                       grid_search = False,
                                                       early_stopping_rounds = 100,
                                                       params = params,
                                                       incremental_learning = True)

        train_loss.append(min(evals_result['train']['rmse']))
        eval_loss.append(min(evals_result['eval']['rmse']))

# Per-batch best losses in the same {'train'/'eval': {'rmse': [...]}} layout as
# a single evals_result. The original stored eval_loss under 'train' and
# train_loss under 'eval', which swapped the two curves downstream.
evals_result_agg = {'train': {'rmse': train_loss}, 'eval': {'rmse': eval_loss}}



In [None]:
# Additional metric - MAPE of the bulk model on batch 0 of `gen`.
# `gen` is whatever the training loop left behind, i.e. the generator of the
# last month processed ('12').
# NOTE(review): that batch was also used for fitting, so these figures are
# presumably not a clean hold-out estimate — confirm.
(X_train, y_train), (X_test, y_test) = gen[0]

bulk_train_predictions = model.predict(xgb.DMatrix(X_train, y_train))
bulk_train_mape = mape(bulk_train_predictions, y_train)
print(f'Training set. Mean absolute percentage error: {bulk_train_mape}')

bulk_test_predictions = model.predict(xgb.DMatrix(X_test, y_test))
bulk_test_mape = mape(bulk_test_predictions, y_test)
print(f'Validation set. Mean absolute percentage error: {bulk_test_mape}')

In [None]:
# Inspect the per-batch best RMSE values collected in evals_result_agg by the
# bulk training loop (one point per batch, not per boosting round).
# NOTE(review): evaluate_xgb is a project helper, presumably plotting the
# train/eval curves — confirm in analysis_toolkit.
evaluate_xgb(evals_result_agg)