# Using Deep Learning to Predict Traffic Flow

Here, we use multivariate time series to predict the how traffic will be.

Could be part of the talk in Budapest

# Import Libraries

In [1]:
import os
from tensorflow.python.client import device_lib

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"]="0";  

In [2]:
from tensorflow.python.keras import optimizers
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, Activation
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 14288926065768353170, name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 7441235527448443654
 physical_device_desc: "device: XLA_GPU device", name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 15366897165287552057
 physical_device_desc: "device: XLA_CPU device", name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 22508008244
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 5684342040079447989
 physical_device_desc: "device: 0, name: Tesla P40, pci bus id: 099f:00:00.0, compute capability: 6.1"]

In [3]:
import time
import sys
from configparser import ConfigParser
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

sys.path.append('/home/mapdadmin/abraham/caltrans-data-exploration/')

In [4]:
from process_traffic_data import apply_custom_transformations
import data_processing.process_utils as utils
from training import train_utils
from omnisci_connector.omni_connect import OmnisciConnect


Exiting Main Thread
total time:  9.107589721679688e-05


In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error


In [6]:
config_path = '/home/mapdadmin/abraham/ini_files/config.ini'
print("read configuration file %s" %config_path)
config = ConfigParser()
config.read(config_path)
print("Configuration file read.")

print("connect to omnisci")
OmnisciHandle = OmnisciConnect(config_path)
OmnisciHandle.start_connection()
OmnisciHandle.con

read configuration file /home/mapdadmin/abraham/ini_files/config.ini
Configuration file read.
connect to omnisci


Connection(mapd://abraham:***@http://localhost:6273/abraham?protocol=http)

In [7]:
table_name = "caltrans_historic_2015_2019"

cols = "timestamp_, \
station, \
freeway, \
occupancy, \
speed "

condition = "WHERE timestamp_ >= '2019-01-01 00:00' \
AND timestamp_ <  '2019-02-01 00:00'"

# condition = "WHERE timestamp_ >= '2019-01-01 00:00'"


query = "select " + cols + " from " + table_name + " " + condition

print(query)

select timestamp_, station, freeway, occupancy, speed  from caltrans_historic_2015_2019 WHERE timestamp_ >= '2019-01-01 00:00' AND timestamp_ <  '2019-02-01 00:00' AND freeway = 101


In [8]:
df_Omnisci = OmnisciHandle.con.select_ipc(query)
print("Dataframe shape: ",df_Omnisci.shape)
print("summary of nan's")
print(df_Omnisci.isna().sum())

Dataframe shape:  (6107278, 5)
summary of nan's
timestamp_    0
station       0
freeway       0
occupancy     0
speed         0
dtype: int64


In [9]:
df_Omnisci.head()

Unnamed: 0,timestamp_,station,freeway,occupancy,speed
0,2019-01-24,6784,101,0.0,69.0
1,2019-01-24,6785,101,0.0077,71.400002
2,2019-01-24,6786,101,0.0242,71.800003
3,2019-01-24,6791,101,0.0178,72.400002
4,2019-01-24,6794,101,0.0204,69.0


# Prepare Model

## Including Weather

# Read in weather data from Omnisci

In [10]:
# read in traffic metadata from omnisci:

query_traffic_meta = "select * from caltrans_traffic_d04_metatable_weatherID"

df_traffic_metadata = OmnisciHandle.con.select_ipc(query_traffic_meta)
print("Dataframe shape: ",df_traffic_metadata.shape)


Dataframe shape:  (4333, 8)


In [11]:
# read in weather data from omnisci:

cols = "timestamp_, \
hourlydrybulbtemperature, \
hourlyprecipitation, \
hourlyrelativehumidity, \
hourlyvisibility, \
hourlywindspeed, \
weather_station_id"

query_weather = "select "+ cols + " from ncdc_weather_clean_190511"
# query_weather = "select * from ncdc_weather_clean_190511"

df_weather = OmnisciHandle.con.select_ipc(query_weather)
print("Dataframe shape: ",df_weather.shape)

Dataframe shape:  (71159, 7)


In [12]:
df_101Weather = df_Omnisci.set_index('timestamp_')

In [13]:
traffic_tojoin = df_traffic_metadata[['id','weather_station_id']].set_index('id')
df_101WeatherID_joined = df_101Weather.join(traffic_tojoin, on='station')

# Join weather and Traffic Data

In [14]:
# pick important weather things:

important_weather_columns = ['timestamp_','weather_station_id','hourlyprecipitation','hourlyvisibility','hourlywindspeed']

important_weather_data = df_weather[important_weather_columns]

In [15]:
# sort values to prepare for merge_asof function. does not work otherwise
df_101WeatherID_joined=df_101WeatherID_joined.sort_values(by=['timestamp_'])
important_weather_data = important_weather_data.sort_values(by=['timestamp_'])

In [16]:
join_key = ['timestamp_']
df_101_all = pd.merge_asof(left=df_101WeatherID_joined,
                      right=important_weather_data,
                      on=join_key,
                      by='weather_station_id',
                          direction='nearest')

In [17]:
data_cols = ['station','timestamp_','occupancy','speed','hourlyprecipitation','hourlyvisibility','hourlywindspeed']
df_traffic_weather = (df_101_all[data_cols].set_index(['station','timestamp_'])
                      .sort_values(['station','timestamp_']))

df_traffic_weather.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,occupancy,speed,hourlyprecipitation,hourlyvisibility,hourlywindspeed
station,timestamp_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6784,2019-01-01 00:00:00,0.0026,67.900002,0.0,10.0,10
6784,2019-01-01 01:00:00,0.0009,67.699997,0.0,10.0,8
6784,2019-01-01 02:00:00,0.0003,67.5,0.0,10.0,9
6784,2019-01-01 03:00:00,0.0002,69.400002,0.0,10.0,6
6784,2019-01-01 04:00:00,0.0001,70.099998,0.0,10.0,9


In [20]:
n_lag = 12
n_steps = 6
reframed, key, scaled, scaler1 = train_utils.format_model_data(df_train, n_lag, n_steps)

reframed.head()

Unnamed: 0,var1(t-12),var2(t-12),var3(t-12),var4(t-12),var5(t-12),var1(t-11),var2(t-11),var3(t-11),var4(t-11),var5(t-11),...,var1(t+4),var2(t+4),var3(t+4),var4(t+4),var5(t+4),var1(t+5),var2(t+5),var3(t+5),var4(t+5),var5(t+5)
0,0.006232,0.836039,0.0,0.04,0.277778,0.002157,0.832792,0.0,0.04,0.222222,...,0.097315,0.784091,0.0,0.04,0.166667,0.089406,0.839286,0.0,0.04,0.083333
1,0.002157,0.832792,0.0,0.04,0.222222,0.000719,0.829545,0.0,0.04,0.25,...,0.089406,0.839286,0.0,0.04,0.083333,0.076702,0.837662,0.0,0.04,0.0
2,0.000719,0.829545,0.0,0.04,0.25,0.000479,0.86039,0.0,0.04,0.166667,...,0.076702,0.837662,0.0,0.04,0.0,0.059923,0.840909,0.0,0.04,0.0
3,0.000479,0.86039,0.0,0.04,0.166667,0.00024,0.871753,0.0,0.04,0.25,...,0.059923,0.840909,0.0,0.04,0.0,0.049616,0.847403,0.0,0.04,0.0
4,0.00024,0.871753,0.0,0.04,0.25,0.000719,0.86526,0.0,0.04,0.25,...,0.049616,0.847403,0.0,0.04,0.0,0.028523,0.844156,0.0,0.04,0.0


In [21]:
key

{'occupancy': 'var1',
 'speed': 'var2',
 'hourlyprecipitation': 'var3',
 'hourlyvisibility': 'var4',
 'hourlywindspeed': 'var5'}

In [22]:
# TO-DO: FIX HOW THIS WORKS, VERY MANUAL AS OF NOW
#drop_cols = []
cols = list(reframed.columns)

drop_1 = [c for c in cols if '(t+' in c]
drop_2 = [c for c in cols if '(t)' in c]

drop_1.remove('var2(t+5)')

drop_cols = drop_1 + drop_2

print(drop_cols)

['var1(t+1)', 'var2(t+1)', 'var3(t+1)', 'var4(t+1)', 'var5(t+1)', 'var1(t+2)', 'var2(t+2)', 'var3(t+2)', 'var4(t+2)', 'var5(t+2)', 'var1(t+3)', 'var2(t+3)', 'var3(t+3)', 'var4(t+3)', 'var5(t+3)', 'var1(t+4)', 'var2(t+4)', 'var3(t+4)', 'var4(t+4)', 'var5(t+4)', 'var1(t+5)', 'var3(t+5)', 'var4(t+5)', 'var5(t+5)', 'var1(t)', 'var2(t)', 'var3(t)', 'var4(t)', 'var5(t)']


In [23]:
reframed.drop(drop_cols, axis=1, inplace=True)

In [24]:
# define split
train_ratio = 0.9
val_ratio = 0.1

train_val = int(reframed.shape[0] * train_ratio)

val_test = train_val + int(reframed.shape[0] * val_ratio)

print("Size of training set:", train_val)
print("Size of Validation set:", val_test-train_val)
print("Size of Testing set:", reframed.shape[0]-val_test)
#define number of steps in to the future

print(reframed.shape)

Size of training set: 5485197
Size of Validation set: 609466
Size of Testing set: 1
(6094664, 61)


In [25]:

#Data
values = reframed.values
train = values[:train_val, :]
val = values[train_val:val_test, :]
test = values[val_test:, :]


# split into input and outputs
train_X, train_y = train[:, :-1], train[:, -1]
val_X, val_y = val[:, :-1], val[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]

# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
val_X = val_X.reshape((val_X.shape[0], 1, val_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

(5485197, 1, 60) (5485197,) (1, 1, 60) (1,)


In [26]:
# The LSTM model
my_model = Sequential()

my_model.add(LSTM(input_shape=(train_X.shape[1], train_X.shape[2]), units=75, return_sequences=True))
my_model.add(Dropout(0.3))

my_model.add(LSTM(units=150, return_sequences=False))
my_model.add(Dropout(0.2))

my_model.add(Dense(units=1))
my_model.add(Activation('linear'))

opt = optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
my_model.compile(loss='mse', optimizer='adam')


W0513 19:34:37.307110 140533743929088 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.LSTM object at 0x7fd070343b00>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.
W0513 19:34:41.256218 140533743929088 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.LSTM object at 0x7fd0703be4a8>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


In [27]:
history = my_model.fit(train_X, train_y, epochs=30, batch_size=30000, validation_data=(val_X, val_y), verbose=2, shuffle=True)

#history = my_model.fit(train_X, train_y, epochs=50, batch_size=50000, validation_split=0.2, verbose=2, shuffle=True)




Train on 5485197 samples, validate on 609466 samples
Epoch 1/30
5485197/5485197 - 27s - loss: 0.0498 - val_loss: 0.0062
Epoch 2/30
5485197/5485197 - 23s - loss: 0.0119 - val_loss: 0.0054
Epoch 3/30
5485197/5485197 - 23s - loss: 0.0099 - val_loss: 0.0052
Epoch 4/30
5485197/5485197 - 22s - loss: 0.0091 - val_loss: 0.0052
Epoch 5/30
5485197/5485197 - 22s - loss: 0.0086 - val_loss: 0.0052
Epoch 6/30
5485197/5485197 - 21s - loss: 0.0084 - val_loss: 0.0052
Epoch 7/30
5485197/5485197 - 22s - loss: 0.0082 - val_loss: 0.0052
Epoch 8/30
5485197/5485197 - 22s - loss: 0.0080 - val_loss: 0.0051
Epoch 9/30
5485197/5485197 - 22s - loss: 0.0079 - val_loss: 0.0051
Epoch 10/30
5485197/5485197 - 21s - loss: 0.0078 - val_loss: 0.0051
Epoch 11/30
5485197/5485197 - 21s - loss: 0.0076 - val_loss: 0.0050
Epoch 12/30
5485197/5485197 - 21s - loss: 0.0075 - val_loss: 0.0050
Epoch 13/30
5485197/5485197 - 21s - loss: 0.0074 - val_loss: 0.0050
Epoch 14/30
5485197/5485197 - 22s - loss: 0.0074 - val_loss: 0.0050
Epoc

In [1]:
# plot history
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()

NameError: name 'plt' is not defined

In [None]:
# make a prediction
yhat = my_model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))


# invert scaling for forecast
inv_yhat = np.concatenate((yhat, test_X[:, 1:]), axis=1)
inv_yhat = scaler1.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]

# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = np.concatenate((test_y, test_X[:, 1:]), axis=1)
inv_y = scaler1.inverse_transform(inv_y)
inv_y = inv_y[:,0]

import math
# calculate RMSE
rmse = math.sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)

In [28]:
# save model to use later
save_t_model = True

if save_t_model:
    my_model.save('../models/190513_2130_TrafficAndWeather_final.h5')
    
del my_model

In [None]:
from matplotlib.pyplot import figure
figure(num=None, figsize=(20, 6), dpi=80, facecolor='w', edgecolor='k')

plt.plot(inv_y)
plt.plot(inv_yhat)
plt.show()

In [None]:
from matplotlib.pyplot import figure

figure(num=None, figsize=(20, 6), dpi=80, facecolor='y', edgecolor='k')

week_num = 0

num_weeks = 1

plt.plot(inv_y[week_num*7*24:week_num*7*24 + num_weeks*(7*24)])
plt.plot(inv_yhat[week_num*7*24:week_num*7*24 + num_weeks*(7*24)])

In [None]:
figure(num=None, figsize=(20, 6), dpi=80, facecolor='y', edgecolor='k')

s = slice(400,500)

plt.plot(inv_y[s])
plt.plot(inv_yhat[s])

# Backup stuff

In [None]:
# back up model
# design network
model = Sequential()
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')
# fit network
history = model.fit(train_X, train_y, epochs=10, batch_size=50000, validation_data=(val_X, val_y), verbose=2, shuffle=True)
