In [19]:
import pickle as pkl
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from IPython.display import clear_output

%run utils/utils_metrics.py

import tensorflow as tf
from tensorflow import keras

### 1. Load data

In [2]:
# Load the scaler used to undo the normalization of `duration`, and the
# normalized trip dataset.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# that come from a trusted source.
# `with` guarantees the file handles are closed once the objects are loaded.
with open('../features_extracted/scalers/out_scaler.scl', 'rb') as scaler_file:
    output_scaler = pkl.load(scaler_file)
with open('../features_extracted/taxi_trip_data_normalized_10M.dat', 'rb') as dataset_file:
    dataset = pkl.load(dataset_file)
dataset.head()

Unnamed: 0,pu_lon,pu_lat,do_lon,do_lat,vec_dist,grid_trip_dist,avg_hour_sin,avg_hour_cos,week_day_sin,week_day_cos,duration
0,-0.618153,0.43822,1.350339,3.926724,2.505878,1.943777,-0.866025,0.5,0.781831,0.62349,0.148974
1,-0.571665,-0.016438,-0.551556,-0.526279,-0.443558,-0.434851,0.5,-0.8660254,0.433884,-0.900969,0.822718
2,0.022083,0.039781,-0.233493,0.183234,-0.648665,-0.619845,-0.5,-0.8660254,-0.781831,0.62349,-0.503036
3,-0.491446,-0.228516,-0.339226,-0.690281,-0.46787,-0.452716,-0.5,0.8660254,0.433884,-0.900969,-0.750465
4,-0.537286,-0.348061,-0.057487,-0.095531,-0.448858,-0.441419,-1.0,-1.83697e-16,-0.974928,-0.222521,-0.146938


In [46]:
# Largest value of the (still normalized) `duration` column.
dataset['duration'].max()

10.71153634879715

In [47]:
# Smallest value of the (still normalized) `duration` column.
dataset['duration'].min()

-1.3205560167264712

In [3]:
# Remove trips whose de-normalized duration is exactly zero.
# The scaler receives a 2-D (n_samples, 1) array, which is the input shape
# scikit-learn transformers document; a bare 1-D Series is rejected by
# recent scikit-learn versions.
denorm_duration = output_scaler.inverse_transform(dataset[['duration']].values).ravel()

# Filter with a boolean mask instead of adding and dropping a helper column
# in place. Rows with a NaN duration are kept, exactly as the `== 0` test
# never matched them. `reset_index(drop=True)` renumbers the rows without
# materialising the old index as a column.
dataset = dataset.loc[denorm_duration != 0].reset_index(drop=True)
dataset.shape

(9999848, 11)

### 2. Splitting data

In [5]:
# Every column except the last one is a feature; the last column is the
# regression target and is kept as a one-column DataFrame.
feature_columns = dataset.columns[:-1]
target_columns = dataset.columns[[-1]]

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset[feature_columns],
    dataset[target_columns],
    test_size=0.3,
    random_state=42)
train_size, test_size = len(y_train), len(y_test)

# Test targets with the normalization undone, for metrics on the real scale.
y_test_denorm = output_scaler.inverse_transform(y_test)

### 4. Build model

In [52]:
def build_model(input_dim=None, learning_rate=0.01):
    """Build and compile the fully connected regression network.

    Architecture: Dense(100) -> Dense(50) -> Dense(25), all with ReLU,
    followed by a single linear output unit. The model is compiled with
    mean squared error as loss and mean absolute error as extra metric.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, falls back to
        ``X_train.shape[1]`` from the notebook namespace, so calling
        ``build_model()`` behaves as before.
    learning_rate : float, optional
        Step size for the Adam optimizer (default 0.01).

    Returns
    -------
    keras.Sequential
        The compiled model.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]

    model = keras.Sequential([
        keras.layers.Dense(100, activation=tf.nn.relu,
                           input_shape=(input_dim,)),
        keras.layers.Dense(50, activation=tf.nn.relu),
        keras.layers.Dense(25, activation=tf.nn.relu),
        keras.layers.Dense(1)
    ])

    # tf.train.AdamOptimizer only exists in TensorFlow 1.x; the Keras
    # optimizer is reachable through tf.keras on both 1.x and 2.x.
    # epsilon=1e-8 keeps the default that tf.train.AdamOptimizer used.
    optimizer = keras.optimizers.Adam(learning_rate, epsilon=1e-8)

    model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mae'])
    return model

model = build_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 100)               1100      
_________________________________________________________________
dense_20 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_21 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_22 (Dense)             (None, 1)                 26        
Total params: 7,451
Trainable params: 7,451
Non-trainable params: 0
_________________________________________________________________


### 5. Training

In [53]:
EPOCHS = 10

# Options forwarded to `fit`: number of passes over the data, the share of
# the training set held out for validation, and per-epoch progress output.
# (A `callbacks=[PrintDot()]` option was tried here previously; PrintDot is
# not defined in this section of the notebook.)
fit_options = {
    'epochs': EPOCHS,
    'validation_split': 0.2,
    'verbose': 1,
}

# Keep the returned object so the training stats can be inspected later.
history = model.fit(X_train, y_train, **fit_options)

Train on 5599914 samples, validate on 1399979 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [43]:
# Predict on the held-out test set and undo the target normalization.
y_pred = model.predict(X_test)
y_pred_denorm = output_scaler.inverse_transform(y_pred)

# Explicit shape check: mismatched shapes would either fail late or broadcast
# silently and produce a meaningless metric. This costs nothing, unlike
# building a throw-away concatenated copy of both arrays.
assert y_pred_denorm.shape == y_test_denorm.shape, 'prediction/target shape mismatch'

# Mean absolute percentage error (in percent) on the de-normalized scale.
100 * np.mean(np.abs(y_pred_denorm - y_test_denorm) / y_test_denorm)

35.22210981494661

### Load trained models for computing metrics

In [117]:
# Reload the output scaler so this section can run on its own.
# NOTE(security): pickle.load can execute arbitrary code -- trusted files only.
# `with` closes the file handle as soon as the scaler is loaded.
with open('../features_extracted/scalers/out_scaler.scl', 'rb') as scaler_file:
    output_scaler = pkl.load(scaler_file)

In [149]:
# Timestamp id of the trained model to evaluate.
# Exactly one assignment should be active; the others are kept for reference.
# model_id = '2018-12-13 22_39_11.523054'      # densenet1
# model_id = '2018-12-13 22_51_58.230196'      # densenet2
# model_id = '2018-12-14 02_23_45.104542'      # densenet3
# model_id = '2018-12-14 11_54_29.011963'      # densenet4.1
model_id = '2018-12-14 18_09_28.892350'      # densenet4.2

# Pickled test targets (shared by all models) and this model's predictions.
y_test_name = 'y_test.dat'
y_pred_name = ''.join(['y_pred_', model_id, '.dat'])

# Architecture (.json) and weights (.h5) file names share the same stem.
model_stem = 'model_' + model_id
model_name = model_stem + '.json'
weights_name = model_stem + '.h5'

In [150]:
# Load the pickled test targets and the selected model's predictions.
# NOTE(security): pickle.load can execute arbitrary code -- trusted files only.
# `with` closes each file handle as soon as its object is loaded.
with open('../deep_models/' + y_test_name, 'rb') as y_test_file:
    y_test = pkl.load(y_test_file)
with open('../deep_models/' + y_pred_name, 'rb') as y_pred_file:
    y_pred = pkl.load(y_pred_file)

In [151]:
# Undo the target normalization for the ground truth and the predictions,
# in that order.
y_test_denorm, y_pred_denorm = (
    output_scaler.inverse_transform(scaled) for scaled in (y_test, y_pred)
)

In [152]:
# The model predicts a time, so negative de-normalized predictions are
# replaced by zero (the array is modified in place).
below_zero = y_pred_denorm < 0
y_pred_denorm[below_zero] = 0

# Re-apply the scaler so that y_pred matches the clamped y_pred_denorm.
y_pred = output_scaler.transform(y_pred_denorm)

In [133]:
# Show which saved model the metrics below belong to.
print('ID =', model_id)

# Predictions are reshaped to a single row before being scored.
y_pred_row = y_pred.reshape(1, -1)
y_pred_denorm_row = y_pred_denorm.reshape(1, -1)

# compute_metrics is presumably provided by utils/utils_metrics.py (%run above).
compute_metrics(y_true=y_test.values,
                y_pred=y_pred_row,
                y_true_denorm=y_test_denorm,
                y_pred_denorm=y_pred_denorm_row,
                indices=[0])

ID = 2018-12-13 22_39_11.523054


Unnamed: 0,Exp. Var.,R2,MSE,RMSE,nMSE,nRMSE,MSLE,RMSLE,MAPE
0,0.810154,0.810128,68090.369474,260.941314,0.190311,0.436247,0.104263,0.322897,25.916725


In [138]:
# Show which saved model the metrics below belong to.
print('ID =', model_id)

# Predictions are reshaped to a single row before being scored.
y_pred_row = y_pred.reshape(1, -1)
y_pred_denorm_row = y_pred_denorm.reshape(1, -1)

# compute_metrics is presumably provided by utils/utils_metrics.py (%run above).
compute_metrics(y_true=y_test.values,
                y_pred=y_pred_row,
                y_true_denorm=y_test_denorm,
                y_pred_denorm=y_pred_denorm_row,
                indices=[0])

ID = 2018-12-13 22_51_58.230196


Unnamed: 0,Exp. Var.,R2,MSE,RMSE,nMSE,nRMSE,MSLE,RMSLE,MAPE
0,0.81155,0.8114,67634.355624,260.06606,0.189037,0.434784,0.102288,0.319824,25.700501


In [143]:
# Show which saved model the metrics below belong to.
print('ID =', model_id)

# Predictions are reshaped to a single row before being scored.
y_pred_row = y_pred.reshape(1, -1)
y_pred_denorm_row = y_pred_denorm.reshape(1, -1)

# compute_metrics is presumably provided by utils/utils_metrics.py (%run above).
compute_metrics(y_true=y_test.values,
                y_pred=y_pred_row,
                y_true_denorm=y_test_denorm,
                y_pred_denorm=y_pred_denorm_row,
                indices=[0])

ID = 2018-12-14 02_23_45.104542


Unnamed: 0,Exp. Var.,R2,MSE,RMSE,nMSE,nRMSE,MSLE,RMSLE,MAPE
0,0.808138,0.806723,69311.553238,263.270874,0.193725,0.440142,0.111982,0.334637,28.616338


In [148]:
# Show which saved model the metrics below belong to.
print('ID =', model_id)

# Predictions are reshaped to a single row before being scored.
y_pred_row = y_pred.reshape(1, -1)
y_pred_denorm_row = y_pred_denorm.reshape(1, -1)

# compute_metrics is presumably provided by utils/utils_metrics.py (%run above).
compute_metrics(y_true=y_test.values,
                y_pred=y_pred_row,
                y_true_denorm=y_test_denorm,
                y_pred_denorm=y_pred_denorm_row,
                indices=[0])

ID = 2018-12-14 11_54_29.011963


Unnamed: 0,Exp. Var.,R2,MSE,RMSE,nMSE,nRMSE,MSLE,RMSLE,MAPE
0,0.807199,0.807113,69171.445522,263.004649,0.193333,0.439697,0.10625,0.32596,26.570978


In [153]:
# Show which saved model the metrics below belong to.
print('ID =', model_id)

# Predictions are reshaped to a single row before being scored.
y_pred_row = y_pred.reshape(1, -1)
y_pred_denorm_row = y_pred_denorm.reshape(1, -1)

# compute_metrics is presumably provided by utils/utils_metrics.py (%run above).
compute_metrics(y_true=y_test.values,
                y_pred=y_pred_row,
                y_true_denorm=y_test_denorm,
                y_pred_denorm=y_pred_denorm_row,
                indices=[0])

ID = 2018-12-14 18_09_28.892350


Unnamed: 0,Exp. Var.,R2,MSE,RMSE,nMSE,nRMSE,MSLE,RMSLE,MAPE
0,0.809009,0.808963,68508.101186,261.740523,0.191479,0.437583,0.105644,0.325028,26.714438
