# Using Deep Learning to Predict Traffic Flow

Here, we use multivariate time series to predict the how traffic will be.

Could be part of the talk in Budapest

# Import Libraries

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";

# The GPU id to use, usually either "0" or "1";
os.environ["CUDA_VISIBLE_DEVICES"]="0";  

In [2]:
from tensorflow.python.client import device_lib

device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 10360800414928123767, name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 3610686048121791641
 physical_device_desc: "device: XLA_GPU device", name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 495436814135965460
 physical_device_desc: "device: XLA_CPU device", name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 22664403354
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 16901272306500359134
 physical_device_desc: "device: 0, name: Tesla P40, pci bus id: 099f:00:00.0, compute capability: 6.1"]

In [3]:
from tensorflow.python.keras import optimizers

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, Activation

In [4]:
import time
import sys
from configparser import ConfigParser
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm


sys.path.append('/home/mapdadmin/abraham/caltrans-data-exploration/')

In [5]:
from process_traffic_data import apply_custom_transformations
import data_processing.process_utils as utils
from omnisci_connector.omni_connect import OmnisciConnect
from training import train_utils


Exiting Main Thread
total time:  7.05718994140625e-05


In [6]:
config_path = '/home/mapdadmin/abraham/ini_files/config.ini'
print("read configuration file %s" %config_path)
config = ConfigParser()
config.read(config_path)
print("Configuration file read.")


read configuration file /home/mapdadmin/abraham/ini_files/config.ini
Configuration file read.


In [7]:
print("connect to omnisci")
OmnisciHandle = OmnisciConnect(config_path)
OmnisciHandle.start_connection()
OmnisciHandle.con

connect to omnisci


Connection(mapd://abraham:***@http://localhost:6273/abraham?protocol=http)

In [8]:
table_name = "caltrans_historic_2015_2019"

cols = "timestamp_, \
station, \
direction, \
freeway, \
occupancy, \
speed "

condition = "WHERE timestamp_ >= '2019-02-01 00:00' \
AND timestamp_ <  '2019-03-01 00:00' \
AND freeway = 101"

# condition = "WHERE timestamp_ >= '2019-01-01 00:00'"


query = "select " + cols + " from " + table_name + " " + condition

print(query)

select timestamp_, station, direction, freeway, occupancy, speed  from caltrans_historic_2015_2019 WHERE timestamp_ >= '2019-02-01 00:00' AND timestamp_ <  '2019-03-01 00:00' AND freeway = 101


In [9]:
df_Omnisci = OmnisciHandle.con.select_ipc(query)
print("Dataframe shape: ",df_Omnisci.shape)
print("summary of nan's")
print(df_Omnisci.isna().sum())

Dataframe shape:  (6267579, 6)
summary of nan's
timestamp_    0
station       0
direction     0
freeway       0
occupancy     0
speed         0
dtype: int64


In [10]:
df_Omnisci.head()

Unnamed: 0,timestamp_,station,direction,freeway,occupancy,speed
0,2019-02-12,6784,S,101,0.0005,65.900002
1,2019-02-12,6785,N,101,0.0046,70.400002
2,2019-02-12,6786,S,101,0.0233,72.199997
3,2019-02-12,6791,N,101,0.0204,72.400002
4,2019-02-12,6794,N,101,0.0168,68.0


In [11]:
df_Omnisci = df_Omnisci.sort_values('timestamp_')

In [12]:
# freeway 101
#Select highway 101N
cols = ['station','timestamp_','occupancy','speed']

df_predict = (df_Omnisci[cols].set_index(['station','timestamp_'])
                      .sort_values(['station','timestamp_']))

df_predict.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,occupancy,speed
station,timestamp_,Unnamed: 2_level_1,Unnamed: 3_level_1
6784,2019-02-01 00:00:00,0.0,69.0
6784,2019-02-01 00:00:00,0.0,69.0
6784,2019-02-01 00:05:00,0.0,69.0
6784,2019-02-01 00:10:00,0.0,69.0
6784,2019-02-01 00:15:00,0.0,69.0


# Prepare Data

In [13]:
def data_index(df, n_lag, n_steps):
    
    steps = n_lag + n_steps - 1
    
    return df.iloc[steps:].index

In [14]:
n_lag = 12
n_steps = 6
reframed, key, scaled, scaler1 = train_utils.format_model_data(df_predict, n_lag, n_steps)

reframed.head()

Unnamed: 0,var1(t-12),var2(t-12),var1(t-11),var2(t-11),var1(t-10),var2(t-10),var1(t-9),var2(t-9),var1(t-8),var2(t-8),...,var1(t+1),var2(t+1),var1(t+2),var2(t+2),var1(t+3),var2(t+3),var1(t+4),var2(t+4),var1(t+5),var2(t+5)
0,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088
1,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088
2,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088
3,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088
4,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088


In [15]:
index_out = []
for station, new_df in df_predict.groupby(level=0):
    print(station)
    index_ = list(data_index(new_df, n_lag, n_steps))
    
    index_out.append(index_)
    
the_index = [item for sublist in index_out for item in sublist]


6784
6785
6786
6791
6794
6795
6798
6823
6827
6853
6882
6890
6891
6893
6900
6903
6909
6922
6927
6933
6944
6952
6956
6965
6967
6979
6990
6998
7007
7011
7021
7030
7039
7044
7047
7048
7056
7067
7075
7088
7100
7109
7114
7115
7124
7133
7137
7156
7165
7166
7172
7174
7176
7178
7184
7204
7205
7209
7212
7219
7223
7224
7225
7244
7257
7258
7268
7281
7287
7297
7309
7311
7315
7324
7329
7331
7352
7362
7370
7375
7377
7382
7404
7405
7414
7425
7427
7429
7440
7441
7445
7452
7504
7511
7514
7517
7526
7527
7528
7537
7544
7547
7550
7552
7565
7566
7567
7568
7571
7574
7578
7581
7588
7601
7642
7643
7647
7652
7668
7670
7675
7679
7688
7695
7706
7719
7746
7748
7749
7759
7761
7762
7765
7780
7784
7983
8017
8061
8069
8120
8141
8142
8189
8192
8193
8194
8195
8223
8227
8230
8232
8235
8238
8246
8251
8253
8256
8258
8285
8291
8299
8300
8307
8318
8362
8363
8365
8370
8373
8376
8379
8382
8383
8385
8386
8387
8388
8389
8393
8400
8408
8415
8418
8433
8435
8436
8437
8442
8444
8445
8446
8447
8452
8453
8454
8475
8512
8542
8550
8579


In [16]:
the_index[:10]

[(6784, Timestamp('2019-02-01 01:15:00')),
 (6784, Timestamp('2019-02-01 01:20:00')),
 (6784, Timestamp('2019-02-01 01:25:00')),
 (6784, Timestamp('2019-02-01 01:30:00')),
 (6784, Timestamp('2019-02-01 01:35:00')),
 (6784, Timestamp('2019-02-01 01:40:00')),
 (6784, Timestamp('2019-02-01 01:45:00')),
 (6784, Timestamp('2019-02-01 01:50:00')),
 (6784, Timestamp('2019-02-01 01:55:00')),
 (6784, Timestamp('2019-02-01 02:00:00'))]

In [17]:
assert reframed.shape[0] == len(the_index)

In [18]:
print(key)

{'occupancy': 'var1', 'speed': 'var2'}


In [19]:
# TO-DO: FIX HOW THIS WORKS, VERY MANUAL AS OF NOW
#drop_cols = []
cols = list(reframed.columns)

drop_1 = [c for c in cols if '(t+' in c]
drop_2 = [c for c in cols if '(t)' in c]

drop_1.remove('var2(t+5)')

drop_cols = drop_1 + drop_2

print(drop_cols)

['var1(t+1)', 'var2(t+1)', 'var1(t+2)', 'var2(t+2)', 'var1(t+3)', 'var2(t+3)', 'var1(t+4)', 'var2(t+4)', 'var1(t+5)', 'var1(t)', 'var2(t)']


In [20]:
reframed.head(10)

Unnamed: 0,var1(t-12),var2(t-12),var1(t-11),var2(t-11),var1(t-10),var2(t-10),var1(t-9),var2(t-9),var1(t-8),var2(t-8),...,var1(t+1),var2(t+1),var1(t+2),var2(t+2),var1(t+3),var2(t+3),var1(t+4),var2(t+4),var1(t+5),var2(t+5)
0,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088
1,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088
2,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088
3,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088
4,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088
5,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088
6,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088
7,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088
8,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088
9,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,...,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.0,0.79088,0.001549,0.809748


In [21]:
reframed.drop(drop_cols, axis=1, inplace=True)

In [22]:
reframed.shape

(6254965, 25)

In [23]:
reframed.columns

Index(['var1(t-12)', 'var2(t-12)', 'var1(t-11)', 'var2(t-11)', 'var1(t-10)',
       'var2(t-10)', 'var1(t-9)', 'var2(t-9)', 'var1(t-8)', 'var2(t-8)',
       'var1(t-7)', 'var2(t-7)', 'var1(t-6)', 'var2(t-6)', 'var1(t-5)',
       'var2(t-5)', 'var1(t-4)', 'var2(t-4)', 'var1(t-3)', 'var2(t-3)',
       'var1(t-2)', 'var2(t-2)', 'var1(t-1)', 'var2(t-1)', 'var2(t+5)'],
      dtype='object')

# Predict traffic

In [24]:

#Data
test = reframed.values


# split into input and outputs
test_X, test_y = test[:, :-1], test[:, -1]

test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

print(test_X.shape, test_y.shape)

(6254965, 1, 24) (6254965,)


In [26]:
from tensorflow.python.keras.models import load_model

my_model = load_model('../models/traffic_190513_2300.h5')

W0513 23:44:52.640450 140670785935104 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7fedec79e160>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.
W0513 23:44:52.906638 140670785935104 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7fee93074b38>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


In [None]:
# make a prediction
yhat = my_model.predict(test_X)
print('yhat predicted')
test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))


# invert scaling for forecast
inv_yhat = np.concatenate((yhat, test_X[:, 1:]), axis=1)
inv_yhat = scaler1.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]

# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = np.concatenate((test_y, test_X[:, 1:]), axis=1)
inv_y = scaler1.inverse_transform(inv_y)
inv_y = inv_y[:,0]

import math
# calculate RMSE
rmse = math.sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)

In [None]:
from matplotlib.pyplot import figure
figure(num=None, figsize=(20, 6), dpi=80, facecolor='w', edgecolor='k')

plt.plot(inv_y)
plt.plot(inv_yhat)

plt.show()

In [None]:
from matplotlib.pyplot import figure

figure(num=None, figsize=(20, 6), dpi=80, facecolor='y', edgecolor='k')

week_num = 0

num_weeks = 1

plt.plot(inv_y[week_num*7*24:week_num*7*24 + num_weeks*(7*24)])
plt.plot(inv_yhat[week_num*7*24:week_num*7*24 + num_weeks*(7*24)])

In [None]:
figure(num=None, figsize=(20, 6), dpi=80, facecolor='y', edgecolor='k')

week_num = 0

num_weeks = 1

plt.plot(inv_y[400:500])
plt.plot(inv_yhat[400:500])

In [None]:
# save model to use later
save_t_model = True

if save_t_model:
    my_model.save('../models/190511_1539.h5')
    
del my_model