In [26]:
import pickle as pkl
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from IPython.display import clear_output

import tensorflow as tf
from tensorflow import keras

### 1. Load data

In [6]:
n_chunks = 1  # number of pickled chunk files to read; files are numbered from 1

# NOTE(security): pickle can execute arbitrary code while loading, so only
# point this at chunk files that come from a trusted source.
chunks = []
for i in range(1, n_chunks + 1):
    # The context manager closes each file handle as soon as it is read;
    # a bare open() inside a comprehension leaves the handles open.
    with open('data/bin_chunks/ttd_chunk_{0}.pkl'.format(i), 'rb') as chunk_file:
        chunks.append(pkl.load(chunk_file))

# Stack all chunks row-wise into a single frame.
dataset = pd.concat(chunks)
print("Data shape:", dataset.shape)
dataset.head()

Data shape: (48605, 10)


Unnamed: 0,pu_t,do_t,trip_dist,pu_lon,pu_lat,do_lon,do_lat,duration,vec_dist,trip_ratio
0,2016-02-16 19:56:06,2016-02-16 20:10:46,8.53,-73.99575,40.764488,-73.926292,40.867458,880,7.983998,1.068387
1,2016-02-11 10:23:41,2016-02-11 10:45:04,1.95,-73.99411,40.751068,-73.993401,40.736019,1283,1.039087,1.876648
2,2016-02-21 14:21:57,2016-02-21 14:30:07,0.5,-73.97316,40.752728,-73.982178,40.756962,490,0.55613,0.89907
3,2016-02-11 22:51:59,2016-02-11 22:57:41,1.3,-73.99128,40.744808,-73.985909,40.731178,342,0.981839,1.324045
4,2016-02-13 18:32:08,2016-02-13 18:43:51,1.5,-73.992897,40.74128,-73.975967,40.748734,703,1.026608,1.461123


### 2. Preprocessing

In [10]:
# Pool pickup and drop-off coordinates so that a single scaler per axis
# (latitude / longitude) is fitted on every coordinate value in the dataset.
# Each array is shaped (n, 1) because the scalers are fitted on one feature.
lat_data = np.concatenate((dataset.pu_lat.values, dataset.do_lat.values)).reshape(-1, 1)
lon_data = np.concatenate((dataset.pu_lon.values, dataset.do_lon.values)).reshape(-1, 1)
dur_data = dataset.duration.values.astype(float).reshape(-1, 1)

# The scaler class is looked up by name in sklearn.preprocessing.
# NOTE(review): the scalers are fitted on the full dataset, i.e. before the
# train/test split performed further down.
scaler_type = 'StandardScaler'  # Other options: MinMaxScaler
scaler_cls = getattr(preprocessing, scaler_type)

# fit() returns the fitted estimator itself, so create-and-fit in one step.
lat_scaler = scaler_cls().fit(lat_data)
lon_scaler = scaler_cls().fit(lon_data)
out_scaler = scaler_cls().fit(dur_data)

In [11]:
# lat_scaler / lon_scaler were each fitted on a single (n, 1) column, so a
# two-column frame cannot be passed to transform() directly: current
# scikit-learn raises a feature-count ValueError. Flatten each coordinate
# pair to one column, scale it, then restore the original (n, 2) layout.
lat_cols = dataset[['pu_lat', 'do_lat']].values
lon_cols = dataset[['pu_lon', 'do_lon']].values

# Feature column order: pu_lat, do_lat, pu_lon, do_lon.
X = np.hstack((lat_scaler.transform(lat_cols.reshape(-1, 1)).reshape(lat_cols.shape),
               lon_scaler.transform(lon_cols.reshape(-1, 1)).reshape(lon_cols.shape)))

# Target: scaled trip duration, shape (n, 1).
y = out_scaler.transform(dur_data)

### 3. Splitting data

In [12]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

# Row counts of each partition.
train_size, test_size = len(y_train), len(y_test)

### 4. Build model

In [16]:
def build_model(input_dim=None):
    """Build and compile a small fully-connected regression network.

    Architecture: Dense(4, relu) -> Dense(4, relu) -> Dense(1), compiled with
    MSE loss, an RMSprop optimizer (learning rate 0.001) and MAE as a metric.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, it is taken from the
        notebook-level ``X_train`` (``X_train.shape[1]``), which matches the
        behaviour of calling ``build_model()`` with no arguments.

    Returns
    -------
    keras.Sequential
        The compiled model.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]

    model = keras.Sequential([
        keras.layers.Dense(4, activation=tf.nn.relu,
                           input_shape=(input_dim,)),
        keras.layers.Dense(4, activation=tf.nn.relu),
        keras.layers.Dense(1)
    ])

    # Use the Keras optimizer rather than tf.train.RMSPropOptimizer: the
    # tf.train optimizers are not part of the TF 2.x top-level API, while
    # keras.optimizers.RMSprop is available through tf.keras in both lines.
    optimizer = keras.optimizers.RMSprop(0.001)

    model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mae'])
    return model

model = build_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 4)                 20        
_________________________________________________________________
dense_7 (Dense)              (None, 4)                 20        
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 5         
Total params: 45
Trainable params: 45
Non-trainable params: 0
_________________________________________________________________


### 5. Training

In [43]:
EPOCHS = 6


class PrintDot(keras.callbacks.Callback):
    """Progress callback: after each epoch, replace the cell output with the
    epoch counter and that epoch's metrics.

    Defined in this cell so that the training call below does not depend on
    a definition left over in the kernel from an earlier session.
    """

    def on_epoch_end(self, epoch, logs=None):
        # wait=True defers clearing until new output arrives, avoiding flicker.
        clear_output(wait=True)
        # `epoch` is zero-based; the total comes from the params Keras sets
        # on the callback at the start of fit().
        total_epochs = (self.params or {}).get('epochs', EPOCHS)
        print("Epoch {0}/{1}".format(epoch + 1, total_epochs))
        print(logs or {})


# Store training stats; 20% of the training rows are held out for validation.
history = model.fit(X_train, y_train, epochs=EPOCHS,
                    validation_split=0.2, verbose=1, callbacks=[PrintDot()])

Epoch 6/6
{'val_loss': 0.3441006693395072, 'val_mean_absolute_error': 0.3954461666567198, 'loss': 0.3221985506611976, 'mean_absolute_error': 0.3839895018767238}
