# Using Deep Learning to Predict Traffic Flow

Here, we use multivariate time series to predict short-term traffic.

# Import Libraries

In [2]:
import os
# Order CUDA devices by PCI bus id and expose a single GPU to this kernel.
# The GPU id to use is usually either "0" or "1".
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

In [7]:
import time
import sys
from configparser import ConfigParser
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error


sys.path.append('../')

ModuleNotFoundError: No module named 'statsmodels.api'

In [6]:
# from process_traffic_data import apply_custom_transformations
import src.data_processing.process_utils as utils
from src.omnisci_connector.omni_connect import OmnisciConnect
from src import train_utils


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


ModuleNotFoundError: No module named 'statsmodels.api'

In [20]:
from tensorflow.python.keras import optimizers
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, Activation

In [1]:
# Load the project configuration.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list instead of reporting success
# unconditionally.
config_path = '../config.ini'
print("read configuration file %s" %config_path)
config = ConfigParser()
loaded_files = config.read(config_path)
if loaded_files:
    print("Configuration file read.")
else:
    print("WARNING: configuration file %s was not found or could not be read." % config_path)


read configuration file ../config.ini


NameError: name 'ConfigParser' is not defined

# Configure and connect to OmniSci

In [5]:
import getpass
import os

import pymapd

# Connection settings are read from the environment so that credentials are not
# stored in the notebook. The non-secret settings fall back to the values this
# notebook was developed against; the password has no fallback and is prompted
# for when OMNISCI_PASSWORD is not set.
omnisci_password = os.environ.get("OMNISCI_PASSWORD")
if omnisci_password is None:
    omnisci_password = getpass.getpass("OmniSci password: ")

con = pymapd.connect(user=os.environ.get("OMNISCI_USER", "abraham"),
                     password=omnisci_password,
                     host=os.environ.get("OMNISCI_HOST", "localhost"),
                     dbname=os.environ.get("OMNISCI_DBNAME", "abraham"),
                     port=int(os.environ.get("OMNISCI_PORT", "6273")),
                     protocol=os.environ.get("OMNISCI_PROTOCOL", "http"))

# The connection repr masks the password, so it is safe to display.
print(con)

Connection(mapd://abraham:***@http://localhost:6273/abraham?protocol=http)


## Test the OmniSci handle

In [6]:
# Listing the tables is a quick check that the connection handle works.
con.get_tables()

['caltrans_traffic_janfeb_notencoded_nokey',
 'darksky_weather_janfeb',
 'ncdc_weather_rawdata',
 'ncdc_weather_janfeb_intkey',
 'ncdc_weather_janfeb_strkey',
 'caltrans_traffic_janfeb_encoded_strkey',
 'ncdc_weather_janfeb_dictstrkey',
 'ncdc_weather_janfeb_sanfrancisco_metatable',
 'weather_traffic_janfeb_joined',
 'traffic_weather_janfeb_joined_correcttypes_2',
 'joined_traffic_weather_janfeb_correcttypes',
 'caltrans_historic_2015_2019',
 'caltrans_traffic_d04_metatable_weatherID',
 'ncdc_meta_clean',
 'ncdc_weather_clean_190511',
 'traffic_and_weather_190513',
 'predicted_traffic_weather',
 'predicted_traffic_weather_190516_0000',
 'test_2']

## Bring in DataFrame from OmniSci using pymapd

You can assemble the query from its parts as shown below, or write the full SQL statement directly as a single string.

In [7]:

# Assemble the SQL statement from its parts: table, column list and time filter.
table_name = "caltrans_historic_2015_2019"

# The trailing space is part of the column string.
cols = ", ".join([
    "timestamp_",
    "station",
    "direction",
    "freeway",
    "occupancy",
    "speed",
]) + " "

# Restrict the query to the first day of 2019.
# Alternative (everything from 2019 onwards):
# condition = "WHERE timestamp_ >= '2019-01-01 00:00'"
condition = ("WHERE timestamp_ >= '2019-01-01 00:00' "
             "AND timestamp_ <  '2019-01-02 00:00'")

query = "select {} from {} {}".format(cols, table_name, condition)

print(query)

select timestamp_, station, direction, freeway, occupancy, speed  from caltrans_historic_2015_2019 WHERE timestamp_ >= '2019-01-01 00:00' AND timestamp_ <  '2019-01-02 00:00'


#### Send SQL query to OmniSci and get back a pandas dataframe

In [8]:
# Run the query on OmniSci; the result comes back as a pandas DataFrame.
df_Omnisci = con.select_ipc(query)

# Preview the first rows of the result.
df_Omnisci.head()

Unnamed: 0,timestamp_,station,direction,freeway,occupancy,speed
0,2019-01-01,6784,S,101,0.0026,67.900002
1,2019-01-01,6785,N,101,0.0142,71.900002
2,2019-01-01,6786,S,101,0.0373,70.699997
3,2019-01-01,6790,S,880,0.0271,68.0
4,2019-01-01,6791,N,101,0.0233,72.599998


# Traffic Section
## Data Preparation

In [9]:
# Column the model should predict (speed in this case). It must also appear in
# the list of kept columns below.
predict_col = 'speed'

# Keep only the necessary columns: the two index keys plus the features.
index_cols = ['station', 'timestamp_']
cols = index_cols + ['occupancy', 'speed']

# Index by (station, timestamp_) and order the rows by those same keys.
traffic_subset = df_Omnisci[cols]
df = traffic_subset.set_index(index_cols).sort_values(index_cols)

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,occupancy,speed
station,timestamp_,Unnamed: 2_level_1,Unnamed: 3_level_1
6784,2019-01-01 00:00:00,0.0026,67.900002
6784,2019-01-01 01:00:00,0.0009,67.699997
6784,2019-01-01 02:00:00,0.0003,67.5
6784,2019-01-01 03:00:00,0.0002,69.400002
6784,2019-01-01 04:00:00,0.0001,70.099998


## Prepare data for model

In [None]:
# Number of past timesteps fed to the model as input.
n_lag = 12

# Number of timesteps ahead to predict.
n_steps = 6

# NOTE(review): format_model_data lives in src/train_utils (not shown here). Judging by the
# names, it returns the lagged frame, a column-name lookup, the scaled values and the
# fitted scaler — confirm against the helper.
reframed, key, scaled, scaler1 = train_utils.format_model_data(df, n_lag, n_steps)

# NOTE(review): presumably drops the future-step columns other than the one for predict_col — confirm.
reframed = train_utils.remove_cols(reframed,key[predict_col],n_steps)

# reframed contains the data in the correct format for the model
reframed.head()


## Prepare Model

### Define splits

In [None]:
# Fractions of the reframed data used for training and for validation.
# NOTE(review): the remainder (0.2 here) is presumably the test set — confirm in train_utils.split_data.
training_ratio = 0.6
validation_ratio = 0.2
train_X, train_y, val_X, val_y, test_X, test_y = train_utils.split_data(reframed, training_ratio, validation_ratio)

## Create Model

In [None]:
# Traffic LSTM model. You can change it to however you'd like.
#
# The layers are passed to the Sequential constructor as a list because
# Sequential.add() returns None, so add() calls cannot be chained.
traffic_model = Sequential([
    # First recurrent layer returns the full sequence so it can feed the second LSTM.
    LSTM(input_shape=(train_X.shape[1], train_X.shape[2]), units=75, return_sequences=True),
    Dropout(0.3),
    # Second recurrent layer returns only its final output.
    LSTM(units=150, return_sequences=False),
    Dropout(0.2),
    # Single output unit: one predicted value per sample.
    Dense(units=1),
])

traffic_model.compile(loss='mse', optimizer='adam')


## Fit the model to the traffic data

In [None]:
# Train the model. Despite its name, traffic_prediction holds the object returned by fit(),
# whose .history dict provides the per-epoch 'loss' and 'val_loss' values, not predictions.
traffic_prediction = traffic_model.fit(train_X, train_y, epochs=30, batch_size=30000, validation_data=(val_X, val_y), verbose=2, shuffle=True)


In [None]:
# Plot the training history. 'val_loss' is computed on the validation split
# passed to fit(), so it is labelled as validation rather than test.
plt.plot(traffic_prediction.history['loss'], label='train')
plt.plot(traffic_prediction.history['val_loss'], label='validation')
plt.xlabel('epoch')
plt.ylabel('loss (mse)')
plt.title('Traffic model training history')
plt.legend()
plt.show()

## Calculate Accuracy

In [None]:
import math

import numpy as np

# make a prediction
yhat = traffic_model.predict(test_X)

# Flatten the model input to 2-D under a new name so that test_X itself is left
# untouched and this cell can be re-run. The reshape requires test_X.shape[1] == 1.
test_X_2d = test_X.reshape((test_X.shape[0], test_X.shape[2]))

# invert scaling for forecast
# NOTE(review): assumes scaler1 accepts a matrix of this width and that the predicted
# column is the first one it was fitted on — confirm against train_utils.format_model_data.
inv_yhat = np.concatenate((yhat, test_X_2d[:, 1:]), axis=1)
inv_yhat = scaler1.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]

# invert scaling for actual (again without overwriting test_y)
test_y_col = test_y.reshape((len(test_y), 1))
inv_y = np.concatenate((test_y_col, test_X_2d[:, 1:]), axis=1)
inv_y = scaler1.inverse_transform(inv_y)
inv_y = inv_y[:,0]

# calculate RMSE on the unscaled values
rmse = math.sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)

## Save Traffic Model

In [None]:
# save model to use later
# Set to False to skip writing the model file.
save_t_model = True

if save_t_model:
    traffic_model.save('../models/traffic_model.h5')
    # Drop the in-memory model after saving; cells run after this one can no longer use traffic_model.
    del traffic_model


# Weather Section
## Data Preparation

In [10]:
# Fetch the traffic-station metadata table from OmniSci.
query_traffic_meta = "select * from caltrans_traffic_d04_metatable_weatherID"
df_traffic_metadata = con.select_ipc(query_traffic_meta)

print("Dataframe shape: ", df_traffic_metadata.shape)

# Lookup from traffic station id (index) to the id of its weather station.
traffic_weather_key = df_traffic_metadata.set_index('id')[['weather_station_id']]


Dataframe shape:  (4333, 8)


In [11]:
# Attach each traffic station's weather station id, with rows ordered by time.
# The result is built in a single expression so that df is not overwritten:
# df keeps the (station, timestamp_) ordering it was given when it was created,
# which the traffic-only section relies on if it is (re-)run after this cell.
df_withWeatherID = (df.sort_values(by=['timestamp_'])
                      .join(traffic_weather_key, on='station'))

In [12]:
# Preview the traffic rows together with their joined weather_station_id.
df_withWeatherID.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,occupancy,speed,weather_station_id
station,timestamp_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6784,2019-01-01,0.0026,67.900002,135
16927,2019-01-01,0.0081,69.900002,23293
7257,2019-01-01,0.0174,69.300003,23234
9010,2019-01-01,0.0373,65.099998,93228
11228,2019-01-01,0.028,64.900002,23293


In [13]:
# Fetch the weather observations from OmniSci, keeping only the columns used
# for the join and as model features.
cols = ", ".join([
    "timestamp_",
    "hourlyprecipitation",
    "hourlyvisibility",
    "hourlywindspeed",
    "weather_station_id",
])

query_weather = "select {} from ncdc_weather_clean_190511".format(cols)

df_weather = con.select_ipc(query_weather)
print("Dataframe shape: ", df_weather.shape)

Dataframe shape:  (71159, 5)


In [14]:
# merge_asof requires both frames to be sorted by the join key, so each one is
# sorted by time first. The sorted frames get their own names so that
# df_withWeatherID and df_weather are not modified and the cell can be re-run:
# re-assigning df_withWeatherID after reset_index('station') made a second run
# fail because 'station' was no longer an index level.
join_key = ['timestamp_']

traffic_by_time = (df_withWeatherID.reset_index('station')
                                   .sort_values(by=join_key))
weather_by_time = df_weather.sort_values(by=join_key)

# For every traffic row, take the observation from the same weather station
# whose timestamp is nearest.
df_traffic_weather = pd.merge_asof(left=traffic_by_time,
                                   right=weather_by_time,
                                   on=join_key,
                                   by='weather_station_id',
                                   direction='nearest')

# Get rid of unnecessary columns and sort by station and time
data_cols = ['station','timestamp_','occupancy','speed','hourlyprecipitation', 'hourlyvisibility', 'hourlywindspeed']

df_traffic_weather = (df_traffic_weather[data_cols].set_index(['station','timestamp_'])
                      .sort_values(['station','timestamp_']))

df_traffic_weather.head()

Unnamed: 0,station,timestamp_,occupancy,speed,weather_station_id,hourlyprecipitation,hourlyvisibility,hourlywindspeed
0,6784,2019-01-01,0.0026,67.900002,135,0.0,10.0,10
1,11702,2019-01-01,0.0443,49.5,23272,0.0,0.0,0
2,7479,2019-01-01,0.0324,65.400002,23293,0.0,10.0,8
3,8674,2019-01-01,0.0191,70.900002,23293,0.0,10.0,8
4,16452,2019-01-01,0.0107,67.5,23213,0.0,10.0,8


## Prepare data for model

In [17]:
# Number of past timesteps fed to the model as input.
n_lag = 12

# Number of timesteps ahead to predict.
n_steps = 6

# Same preparation as for the traffic-only data, applied to the joined traffic + weather frame.
# The results carry a _w suffix to keep them apart from the traffic-only ones.
# NOTE(review): the meaning of the four return values is inferred from their names — confirm in src/train_utils.
reframed_w, key_w, scaled_w, scaler1_w = train_utils.format_model_data(df_traffic_weather, n_lag, n_steps)

# NOTE(review): presumably drops the future-step columns other than the one for predict_col — confirm.
reframed_w = train_utils.remove_cols(reframed_w,key_w[predict_col],n_steps)

# reframed_w contains the data in the correct format for the model
reframed_w.head()


Unnamed: 0,var1(t-12),var2(t-12),var3(t-12),var4(t-12),var5(t-12),var1(t-11),var2(t-11),var3(t-11),var4(t-11),var5(t-11),...,var2(t-2),var3(t-2),var4(t-2),var5(t-2),var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var2(t+5)
0,0.05144,0.788462,0.0,0.0,0.909091,0.016461,0.769231,0.0,0.0,0.727273,...,0.798077,0.0,0.0,0.0,0.779835,0.836539,0.0,0.0,0.0,0.807692
1,0.016461,0.769231,0.0,0.0,0.727273,0.004115,0.75,0.0,0.0,0.818182,...,0.836539,0.0,0.0,0.0,0.890946,0.076923,0.0,0.0,0.454545,0.798077
2,0.004115,0.75,0.0,0.0,0.818182,0.002058,0.932693,0.0,0.0,0.545455,...,0.076923,0.0,0.0,0.454545,0.950617,0.038461,0.0,0.0,0.636364,0.817308
3,0.002058,0.932693,0.0,0.0,0.545455,0.0,1.0,0.0,0.0,0.818182,...,0.038461,0.0,0.0,0.636364,1.0,0.0,0.0,0.0,1.0,0.855769
4,0.0,1.0,0.0,0.0,0.818182,0.004115,0.961538,0.0,0.0,0.818182,...,0.0,0.0,0.0,1.0,0.923868,0.019231,0.0,0.0,0.545455,0.836539


## Prepare Model

Unnamed: 0_level_0,Unnamed: 1_level_0,occupancy,speed,hourlyprecipitation,hourlyvisibility,hourlywindspeed
station,timestamp_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6784,2019-01-01 00:00:00,0.0026,67.900002,0.0,10.0,10
6784,2019-01-01 01:00:00,0.0009,67.699997,0.0,10.0,8
6784,2019-01-01 02:00:00,0.0003,67.5,0.0,10.0,9
6784,2019-01-01 03:00:00,0.0002,69.400002,0.0,10.0,6
6784,2019-01-01 04:00:00,0.0001,70.099998,0.0,10.0,9


### Define splits

In [18]:
# Fractions of the reframed data used for training and for validation.
# Note: this re-assigns train_X, train_y, val_X, val_y, test_X and test_y, replacing
# the splits created in the traffic-only section.
training_ratio = 0.6
validation_ratio = 0.2
train_X, train_y, val_X, val_y, test_X, test_y = train_utils.split_data(reframed_w, training_ratio, validation_ratio)

Size of training set: 10113
Size of Validation set: 3371
Size of Test set: 3372


In [None]:
# Traffic and weather LSTM model. You can change it to however you'd like.
#
# The layers are passed to the Sequential constructor as a list because
# Sequential.add() returns None, so add() calls cannot be chained.
traffic_weather_model = Sequential([
    # First recurrent layer returns the full sequence so it can feed the second LSTM.
    LSTM(input_shape=(train_X.shape[1], train_X.shape[2]), units=75, return_sequences=True),
    Dropout(0.3),
    # Second recurrent layer returns only its final output.
    LSTM(units=150, return_sequences=False),
    Dropout(0.2),
    # Single output unit: one predicted value per sample.
    Dense(units=1),
])

traffic_weather_model.compile(loss='mse', optimizer='adam')


W0603 13:16:08.824127 140001036715776 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.LSTM object at 0x7f53e6a9b438>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


## Fit the model to the traffic and weather data

In [None]:
# Train the model. Despite its name, traffic_weather_prediction holds the object returned by fit(),
# whose .history dict provides the per-epoch 'loss' and 'val_loss' values, not predictions.
traffic_weather_prediction = traffic_weather_model.fit(train_X, train_y, epochs=30, batch_size=30000, validation_data=(val_X, val_y), verbose=2, shuffle=True)


In [None]:
# Plot the training history. 'val_loss' is computed on the validation split
# passed to fit(), so it is labelled as validation rather than test.
plt.plot(traffic_weather_prediction.history['loss'], label='train')
plt.plot(traffic_weather_prediction.history['val_loss'], label='validation')
plt.xlabel('epoch')
plt.ylabel('loss (mse)')
plt.title('Traffic and weather model training history')
plt.legend()
plt.show()

## Calculate traffic and weather Accuracy

In [None]:
import math

import numpy as np

# make a prediction
# Predictions come from the trained model; traffic_weather_prediction is only
# the training history returned by fit() and has no predict().
yhat = traffic_weather_model.predict(test_X)

# Flatten the model input to 2-D under a new name so that test_X itself is left
# untouched and this cell can be re-run. The reshape requires test_X.shape[1] == 1.
test_X_2d = test_X.reshape((test_X.shape[0], test_X.shape[2]))

# invert scaling for forecast
# scaler1_w is the scaler returned for the traffic + weather data; scaler1
# belongs to the traffic-only data.
# NOTE(review): assumes scaler1_w accepts a matrix of this width and that the predicted
# column is the first one it was fitted on — confirm against train_utils.format_model_data.
inv_yhat = np.concatenate((yhat, test_X_2d[:, 1:]), axis=1)
inv_yhat = scaler1_w.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]

# invert scaling for actual (again without overwriting test_y)
test_y_col = test_y.reshape((len(test_y), 1))
inv_y = np.concatenate((test_y_col, test_X_2d[:, 1:]), axis=1)
inv_y = scaler1_w.inverse_transform(inv_y)
inv_y = inv_y[:,0]

# calculate RMSE on the unscaled values
rmse = math.sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)

## Save Traffic and Weather Model

In [None]:
# save model to use later
# Set to False to skip writing the model file.
save_w_model = True

if save_w_model:
    traffic_weather_model.save('../models/traffic_weather_model.h5')
    # Drop the in-memory model after saving; cells run after this one can no longer use traffic_weather_model.
    del traffic_weather_model
