# Using Deep Learning to Predict Traffic Flow

Here, we use multivariate time series to predict future traffic flow.

Could be part of the talk in Budapest

# Import Libraries

In [None]:
import os
from tensorflow.python.client import device_lib

# Enumerate GPUs in PCI bus order and make only device 0 visible to this kernel.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

In [None]:
from tensorflow.python.keras import optimizers
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, Activation
# Display the compute devices TensorFlow reports for this process.
device_lib.list_local_devices()

In [2]:
import math
import sys
import time
from configparser import ConfigParser

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from matplotlib.pyplot import figure

# Put the project checkout on the import path (presumably where the project-local
# modules imported in the next cell live -- confirm). The location can be
# overridden with the CALTRANS_PROJECT_ROOT environment variable; the default is
# the original hard-coded path. The membership check keeps a re-run of this cell
# from appending the same entry twice.
PROJECT_ROOT = os.environ.get(
    "CALTRANS_PROJECT_ROOT",
    '/home/mapdadmin/abraham/caltrans-data-exploration/',
)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
from process_traffic_data import apply_custom_transformations
import data_processing.process_utils as utils
from training import train_utils
from omnisci_connector.omni_connect import OmnisciConnect


Exiting Main Thread
total time:  8.535385131835938e-05


In [4]:
# Read the ini file and open the OmniSci connection described by it.
config_path = '/home/mapdadmin/abraham/ini_files/config.ini'
print("read configuration file %s" %config_path)
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so an empty result means the config was NOT loaded.
if not config.read(config_path):
    raise FileNotFoundError("could not read configuration file %s" % config_path)
print("Configuration file read.")

print("connect to omnisci")
# The connector receives the same ini path and opens the connection itself.
OmnisciHandle = OmnisciConnect(config_path)
OmnisciHandle.start_connection()
# Last expression: show the connection object as the cell output.
OmnisciHandle.con

read configuration file /home/mapdadmin/abraham/ini_files/config.ini
Configuration file read.
connect to omnisci


Connection(mapd://abraham:***@http://localhost:6273/abraham?protocol=http)

In [5]:
# Build the SQL for one month (February 2019) of traffic readings.
table_name = "caltrans_historic_2015_2019"

# Adjacent string literals are concatenated by the parser; separators are kept
# exactly as in the emitted query (note: no space after "direction,").
cols = (
    "timestamp_, "
    "station, "
    "direction,"
    "freeway, "
    "occupancy, "
    "speed, "
    "hour_of_day, "
    "day_of_week, "
    "longitude, "
    "latitude"
)

condition = (
    "WHERE timestamp_ >= '2019-02-01 00:00' "
    "AND timestamp_ <  '2019-03-01 00:00'"
)
# Wider alternative window: condition = "WHERE timestamp_ >= '2019-01-01 00:00'"

query = " ".join(["select", cols, "from", table_name, condition])

print(query)

select timestamp_, station, direction,freeway, occupancy, speed, hour_of_day, day_of_week, longitude, latitude from caltrans_historic_2015_2019 WHERE timestamp_ >= '2019-02-01 00:00' AND timestamp_ <  '2019-03-01 00:00'


In [6]:
# Run the traffic query and report the frame's size and per-column NaN counts.
df_Omnisci = OmnisciHandle.con.select_ipc(query)
print("Dataframe shape: ", df_Omnisci.shape)
print("summary of nan's", df_Omnisci.isna().sum(), sep="\n")

Dataframe shape:  (20318748, 10)
summary of nan's
timestamp_     0
station        0
direction      0
freeway        0
occupancy      0
speed          0
hour_of_day    0
day_of_week    0
longitude      0
latitude       0
dtype: int64


In [7]:
# Store the two text columns with plain `object` dtype
# (presumably they arrive in an encoded/categorical form -- confirm).
for text_column in ('direction', 'day_of_week'):
    df_Omnisci[text_column] = df_Omnisci[text_column].astype('object')

# Prepare Model

## Including Weather

# Read in weather data from Omnisci

In [8]:
# read in traffic metadata from omnisci:
# the `id` and `weather_station_id` columns of this table are used further down
# to attach a weather station to every traffic station.

query_traffic_meta = "select * from caltrans_traffic_d04_metatable_weatherID"

df_traffic_metadata = OmnisciHandle.con.select_ipc(query_traffic_meta)
print("Dataframe shape: ",df_traffic_metadata.shape)


Dataframe shape:  (4333, 8)


In [9]:
# Read the hourly weather observations from OmniSci.
weather_fields = [
    "timestamp_",
    "hourlydrybulbtemperature",
    "hourlyprecipitation",
    "hourlyrelativehumidity",
    "hourlyvisibility",
    "hourlywindspeed",
    "weather_station_id",
]
cols = ", ".join(weather_fields)

# Only the listed columns are fetched; use "select *" here to pull the whole table.
query_weather = "select %s from ncdc_weather_clean_190511" % cols

df_weather = OmnisciHandle.con.select_ipc(query_weather)
print("Dataframe shape: ", df_weather.shape)

Dataframe shape:  (71159, 7)


In [10]:
# Index the traffic readings by timestamp; set_index returns a new frame, so
# df_Omnisci itself is left untouched.
df_101Weather = df_Omnisci.set_index('timestamp_')

In [11]:
# Lookup frame: traffic station id (index) -> weather_station_id.
traffic_tojoin = df_traffic_metadata.set_index('id')[['weather_station_id']]
# Match each reading's `station` value against that index (default left join).
df_101WeatherID_joined = df_101Weather.join(other=traffic_tojoin, on='station')

# Join weather and Traffic Data

In [12]:
# Keep only the weather columns used downstream, plus the two merge keys
# (timestamp_ and weather_station_id).
important_weather_columns = [
    'timestamp_',
    'weather_station_id',
    'hourlyprecipitation',
    'hourlyvisibility',
    'hourlywindspeed',
]

important_weather_data = df_weather.loc[:, important_weather_columns]

In [13]:
# pd.merge_asof requires both inputs to be sorted on the `on` key, so order
# each frame by timestamp first.
df_101WeatherID_joined = df_101WeatherID_joined.sort_values('timestamp_')
important_weather_data = important_weather_data.sort_values('timestamp_')

In [14]:
# For every traffic reading, attach the weather row from the same weather
# station whose timestamp is closest (earlier or later).
join_key = ['timestamp_']
df_101_all = pd.merge_asof(
    df_101WeatherID_joined,
    important_weather_data,
    on=join_key,
    by='weather_station_id',
    direction='nearest',
)

In [15]:
# Preview the first rows of the merged traffic + weather frame.
df_101_all.head()

Unnamed: 0,station,timestamp_,direction,freeway,occupancy,speed,hour_of_day,day_of_week,longitude,latitude,weather_station_id,hourlyprecipitation,hourlyvisibility,hourlywindspeed
0,11680,2019-02-01,E,80,0.0397,71.099998,0,Friday,-122.361572,37.813046,23272,0.0,0.0,0
1,7246,2019-02-01,S,680,0.0016,65.099998,0,Friday,-122.134277,38.182316,93227,0.0,0.0,0
2,7245,2019-02-01,N,85,0.0057,65.699997,0,Friday,-121.878914,37.25478,23293,0.0,0.0,0
3,7244,2019-02-01,S,101,0.0072,68.199997,0,Friday,-122.530891,38.053749,135,0.0,0.0,0
4,7241,2019-02-01,S,87,0.0063,68.300003,0,Friday,-121.863663,37.276764,23293,0.0,0.0,0


In [16]:
# Show the column dtypes of the merged frame.
df_101_all.dtypes

station                         int16
timestamp_             datetime64[ns]
direction                      object
freeway                         int16
occupancy                     float32
speed                         float32
hour_of_day                     int16
day_of_week                    object
longitude                     float32
latitude                      float32
weather_station_id              int64
hourlyprecipitation           float64
hourlyvisibility              float64
hourlywindspeed                 int64
dtype: object

In [17]:
# Write the merged frame to the OmniSci table 'traffic_and_weather_190513'.
# NOTE(review): re-running this cell may append duplicate rows or fail if the
# table already exists -- confirm load_table semantics before re-executing.
OmnisciHandle.con.load_table(data=df_101_all,table_name='traffic_and_weather_190513')

In [None]:
# Model inputs: two traffic measurements and three weather measurements,
# ordered by station and time and indexed by (station, timestamp_).
data_cols = [
    'station', 'timestamp_',
    'occupancy', 'speed',
    'hourlyprecipitation', 'hourlyvisibility', 'hourlywindspeed',
]
df_traffic_weather = (
    df_101_all[data_cols]
    .sort_values(['station', 'timestamp_'])
    .set_index(['station', 'timestamp_'])
)




In [None]:
# Preview the (station, timestamp_)-indexed model input frame.
df_traffic_weather.head()

In [None]:

def scale_data(df, target_col=1):
    """Scale every column of ``df`` and fit a separate scaler for one target column.

    Args:
        df: DataFrame whose ``.values`` form a 2-D numeric array.
        target_col: positional index of the column that gets its own scaler.
            Defaults to 1, the column the original code hard-coded.

    Returns:
        A ``(key, scaled, scaler1)`` tuple:
        key: whatever ``train_utils.data_key(df)`` returns.
        scaled: ``df.values`` transformed by a scaler fitted on all columns
            with ``feature_range=(0, 1)``.
        scaler1: a scaler fitted on the target column alone, so values of that
            column can be inverse-transformed without the other features.

    NOTE(review): ``MinMaxScaler`` is not imported in any visible cell --
    presumably ``sklearn.preprocessing.MinMaxScaler``; confirm and add the import.
    """
    values = df.values
    key = train_utils.data_key(df)

    # normalize all features together
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(values)

    # dedicated scaler for the target column; only the fit is needed because the
    # transformed column is already part of `scaled`
    scaler1 = MinMaxScaler(feature_range=(0, 1))
    scaler1.fit(values[:, target_col].reshape(-1, 1))

    return key, scaled, scaler1

In [None]:
def format_model_data(df, n_lag, n_steps):
    """Scale each level-0 group of ``df`` separately and stack the network-ready frames.

    Args:
        df: frame grouped on its first index level; each group is scaled on its own.
        n_lag: forwarded to ``train_utils.prepare_data_for_network``.
        n_steps: forwarded to ``train_utils.prepare_data_for_network``.

    Returns:
        A ``(reframed, key, scaled, scaler1)`` tuple. ``reframed`` is the
        concatenation of every group's prepared frame with a fresh integer index.

    NOTE(review): ``key``, ``scaled`` and ``scaler1`` are overwritten on every
    iteration, so the returned values belong to the LAST group only. Using that
    ``scaler1`` to inverse-transform rows from other groups applies the wrong
    scaling -- confirm this is intended.
    """
    prepared_frames = []
    for _, station_df in df.groupby(level=0):
        key, scaled, scaler1 = scale_data(station_df)
        prepared_frames.append(
            train_utils.prepare_data_for_network(scaled, n_lag, n_steps)
        )

    stacked = pd.concat(prepared_frames, ignore_index=True)
    return stacked, key, scaled, scaler1

In [None]:
# Window sizes handed to format_model_data
# (presumably: past steps used as input / future steps generated -- confirm).
n_lag, n_steps = 12, 6
reframed, key, scaled, scaler1 = format_model_data(
    df_traffic_weather, n_lag=n_lag, n_steps=n_steps
)

reframed.head()

In [None]:
# Inspect the key returned by format_model_data (it comes from the last group processed).
key

In [None]:
# Last rows of the unscaled input frame, for comparison with the reframed data.
df_traffic_weather.tail()

In [None]:
# Last rows of the scaled, network-ready frame.
reframed.tail()

In [None]:
# Choose the columns to discard: everything at time t or later, except the one
# forecast target. The remaining columns are the lagged inputs plus the target.
cols = list(reframed.columns)

# Target = var2 at the furthest forecast step. Deriving it from n_steps keeps the
# cell valid when the horizon changes (n_steps = 6 -> 'var2(t+5)').
# NOTE(review): assumes prepare_data_for_network names future columns
# 'var<j>(t+<i>)' with i running up to n_steps - 1 -- confirm.
target_col = 'var2(t+%d)' % (n_steps - 1)
if target_col not in cols:
    raise ValueError("target column %r not found in reframed" % target_col)

drop_1 = [c for c in cols if '(t+' in c and c != target_col]
drop_2 = [c for c in cols if '(t)' in c]

drop_cols = drop_1 + drop_2

print(drop_cols)

In [None]:
# Rebind instead of dropping in place so the cell can be re-run safely: columns
# that were already removed are ignored rather than raising KeyError.
reframed = reframed.drop(columns=drop_cols, errors='ignore')

In [None]:
# define split: 80% train, 10% validation, the remaining ~10% held out for testing.
# (With 0.8 + 0.2 the two parts consumed every row and the test set was empty.)
train_ratio = 0.8
val_ratio = 0.1

n_rows = reframed.shape[0]

# row index where training ends / validation starts
train_val = int(n_rows * train_ratio)
# row index where validation ends / testing starts
val_test = train_val + int(n_rows * val_ratio)

assert val_test < n_rows, "train_ratio + val_ratio leaves no rows for the test set"

print("Size of training set:", train_val)
print("Size of Validation set:", val_test-train_val)
print("Size of Testing set:", n_rows-val_test)

print(reframed.shape)

In [None]:

def _to_lstm_arrays(block):
    """Split rows into (X, y): the last column is the target, the rest are inputs.

    X is returned as 3-D [samples, 1, features]; y stays 1-D.
    """
    inputs, target = block[:, :-1], block[:, -1]
    return inputs[:, None, :], target


# Consecutive row ranges: train, then validation, then test.
values = reframed.values
train = values[:train_val, :]
val = values[train_val:val_test, :]
test = values[val_test:, :]

train_X, train_y = _to_lstm_arrays(train)
val_X, val_y = _to_lstm_arrays(val)
test_X, test_y = _to_lstm_arrays(test)

print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

In [None]:
# The LSTM model: two stacked LSTM layers with dropout, then one linear output unit.
my_model = Sequential()

# Each sample is [timesteps, features]; return_sequences=True hands the full
# sequence to the second LSTM layer.
my_model.add(LSTM(input_shape=(train_X.shape[1], train_X.shape[2]), units=75, return_sequences=True))
my_model.add(Dropout(0.3))

my_model.add(LSTM(units=150, return_sequences=False))
my_model.add(Dropout(0.2))

my_model.add(Dense(units=1))
my_model.add(Activation('linear'))

# Compile with the configured optimizer object. Passing the string 'adam' builds
# a default Adam and silently ignores the settings chosen here (lr=0.01, ...).
opt = optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
my_model.compile(loss='mse', optimizer=opt)


In [None]:
# Train against the explicit validation set; `history` keeps the per-epoch losses.
history = my_model.fit(
    train_X,
    train_y,
    epochs=50,
    batch_size=30000,
    validation_data=(val_X, val_y),
    verbose=2,
    shuffle=True,
)
# Alternative tried earlier: validation_split=0.2 with batch_size=50000 instead
# of passing validation_data.




In [None]:
# plot history: training loss vs. validation loss per epoch.
# 'val_loss' is computed on the validation set passed to fit(), not on the test
# set, so it is labelled accordingly.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='validation')
ax.set(xlabel='epoch', ylabel='loss', title='Training history')
ax.legend()
plt.show()

In [None]:
# make a prediction on the held-out rows
if len(test_y) == 0:
    raise ValueError("test set is empty; adjust the train/validation split")

yhat = my_model.predict(test_X)

# invert scaling for forecast and actual values.
# scaler1 was fitted on the target column alone, so each vector is passed as a
# single column; there is no need to pad it with the input features. test_X and
# test_y are left untouched so the cell can be re-run.
inv_yhat = scaler1.inverse_transform(yhat.reshape(-1, 1))[:, 0]
inv_y = scaler1.inverse_transform(test_y.reshape(-1, 1))[:, 0]

# calculate RMSE in the target's original units
rmse = math.sqrt(np.mean((inv_y - inv_yhat) ** 2))
print('Test RMSE: %.3f' % rmse)

In [None]:
# Optionally persist the trained network, then drop the in-memory reference.
save_t_model = True
model_path = '../models/190512_1030_TrafficAndWeather_final.h5'

if save_t_model:
    my_model.save(model_path)

# After this line `my_model` no longer exists in the kernel; cells that use it
# must be re-run from the model definition onward.
del my_model

In [None]:
# Whole test range: actual vs. predicted target, both in original units.
plt.figure(num=None, figsize=(20, 6), dpi=80, facecolor='w', edgecolor='k')

plt.plot(inv_y, label='actual')
plt.plot(inv_yhat, label='predicted')
plt.xlabel('test sample')
# the target is column 1 of the model inputs, i.e. 'speed' with the column order used above
plt.ylabel('speed')
plt.legend()
plt.show()

In [None]:
# Zoom in on a window of `num_weeks` weeks starting at week `week_num`.
plt.figure(num=None, figsize=(20, 6), dpi=80, facecolor='y', edgecolor='k')

week_num = 0

num_weeks = 1

# NOTE(review): 7*24 samples per week assumes one row per hour -- confirm the
# sampling interval of the data.
samples_per_week = 7 * 24
week_window = slice(week_num * samples_per_week,
                    (week_num + num_weeks) * samples_per_week)

plt.plot(inv_y[week_window], label='actual')
plt.plot(inv_yhat[week_window], label='predicted')
plt.xlabel('sample within window')
plt.ylabel('speed')
plt.legend()
plt.show()

In [None]:
# Close-up of test samples 400-499: actual vs. predicted.
# plt.figure is used directly so the cell does not depend on a `figure` name
# imported by an earlier cell.
plt.figure(num=None, figsize=(20, 6), dpi=80, facecolor='y', edgecolor='k')

s = slice(400,500)

plt.plot(inv_y[s], label='actual')
plt.plot(inv_yhat[s], label='predicted')
plt.xlabel('sample within slice')
plt.ylabel('speed')
plt.legend()
plt.show()

# Backup stuff

In [None]:
# Back-up model: a single 50-unit LSTM layer followed by one output unit,
# trained on mean absolute error.
model = Sequential([
    LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])),
    Dense(1),
])
model.compile(loss='mae', optimizer='adam')

# Fit the network. Note that this rebinds `history`, replacing the training
# history of the main model.
history = model.fit(
    train_X,
    train_y,
    epochs=10,
    batch_size=50000,
    validation_data=(val_X, val_y),
    verbose=2,
    shuffle=True,
)
