In [1]:
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, SimpleRNN, GRU
from keras.models import Sequential
import plotly
from plotly.graph_objs import Scatter, Layout
import plotly.graph_objs as go

Using Theano backend.
Using gpu device 0: GeForce GTX 960 (CNMeM is disabled, cuDNN 4007)


In [2]:
# Seed NumPy's global random generator so that NumPy-based randomness
# (e.g. the np.random.shuffle of the training windows further down) is
# repeatable between runs.
np.random.seed(1234)

# Load the raw volume table. header=None makes pandas label the columns
# 0, 1, 2, ... instead of consuming the first row as a header.
# NOTE(review): presumably one column per location and one row per
# 15-minute interval -- confirm against the data source.
volume_data = pd.read_csv('../data/volume_data.csv', header=None)

In [3]:
# Table of known locations; the code below only reads its 'HF' column.
all_hfs = pd.read_csv('../data/hf_list.csv')

# HF numbers of the locations that are modelled together. Their order fixes
# the order of the feature/target columns used by the rest of the notebook.
used_hfs = [16911, 16912, 16913, 278, 10528, 16515, 14479, 16551, 6297, 16537]

def find_index_hf(hf_no):
    """Return the row label in ``all_hfs`` of the first row whose 'HF' is ``hf_no``.

    Parameters
    ----------
    hf_no
        HF number to look up in the 'HF' column of the module-level
        ``all_hfs`` table.

    Returns
    -------
    The index label of the first matching row.

    Raises
    ------
    IndexError
        If no row carries this HF number. The exception type is the one the
        lookup has always raised; the message now names the missing HF.
    """
    is_match = (all_hfs['HF'] == hf_no).values
    matches = all_hfs.index[is_match].tolist()
    if not matches:
        raise IndexError('HF number %r not found in all_hfs' % (hf_no,))
    return matches[0]

# Resolve every selected HF number to its row label in all_hfs; these labels
# are used later to pick the matching columns out of volume_data.
hfs = list(map(find_index_hf, used_hfs))

In [4]:
# Per-location statistics. train_test_traffic_data() rebinds and refills
# these on every call; means/stds are read back when predictions are
# converted to the original scale.
maxs, mins, means, stds = [], [], [], []

In [5]:
def _aggregate_volume(road_vol, factor):
    """Sum each run of `factor` consecutive samples of a 1-D array into one sample.

    A trailing remainder shorter than `factor` is dropped so that the strided
    slices always have equal length.
    """
    if factor == 1:
        return road_vol
    usable = (len(road_vol) // factor) * factor
    trimmed = road_vol[:usable]
    aggregated = trimmed[0::factor]
    for offset in range(1, factor):
        aggregated = aggregated + trimmed[offset::factor]
    return aggregated


def _sliding_windows(series, window_length, n_windows):
    """Return an (n_windows, window_length) array whose row i is series[i:i + window_length]."""
    return np.column_stack(
        [series[offset:offset + n_windows] for offset in range(window_length)])


def train_test_traffic_data(sequence_length=50, horizon=15, test_size=5280):
    """Build standardised sliding-window train/test arrays for the locations in ``hfs``.

    For every column label in ``hfs`` the matching column of ``volume_data``
    is cleaned (zeros are treated as missing and filled with pandas' default
    interpolation), summed into `horizon`-minute samples, standardised with
    its own mean and standard deviation, and cut into overlapping windows of
    ``sequence_length`` samples. The first ``sequence_length - 1`` samples of
    a window are the model input, the last one is the target.

    Side effects: the module-level lists ``maxs``, ``mins``, ``means`` and
    ``stds`` are rebound to fresh lists holding one statistic per location,
    in the order of ``hfs``.

    Parameters
    ----------
    sequence_length : int
        Window length, input steps plus the single target step (>= 2).
    horizon : int
        Sample width in minutes; must be a positive multiple of 15.
    test_size : int
        Number of trailing windows held out as the test set.

    Returns
    -------
    list
        ``[X_train, y_train, X_test, y_test]`` with shapes
        ``(n, sequence_length - 1, len(hfs))`` for X and ``(n, len(hfs))``
        for y. Only the training windows are shuffled (in place, using
        NumPy's global random state).

    Raises
    ------
    ValueError
        If ``horizon`` is not a positive multiple of 15, if
        ``sequence_length`` is smaller than 2, or if the data does not yield
        more than ``test_size`` windows.
    """
    global maxs, mins, means, stds

    if horizon <= 0 or horizon % 15 != 0:
        raise ValueError(
            'horizon must be a positive multiple of 15, got %r' % (horizon,))
    if sequence_length < 2:
        raise ValueError(
            'sequence_length must be at least 2, got %r' % (sequence_length,))
    # Number of consecutive 15-minute samples merged into one sample.
    factor = int(horizon // 15)

    maxs, mins, means, stds = [], [], [], []

    windows_per_road = []
    for j in hfs:
        # Zero readings are treated as missing values and interpolated.
        road_vol = volume_data[j].replace(0, np.nan).interpolate().values
        road_vol = _aggregate_volume(road_vol, factor)

        # NOTE: the statistics cover the whole series, i.e. they also see the
        # part that is later held out as the test set.
        mean_t = road_vol.mean()
        std_t = np.std(road_vol)
        mins.append(road_vol.min())
        maxs.append(road_vol.max())
        means.append(mean_t)
        stds.append(std_t)

        road_vol = (road_vol - mean_t) / std_t

        # Same window count as before: starts 0 .. len - sequence_length - 1
        # (the very last possible window is not used).
        n_windows = len(road_vol) - sequence_length
        if not 0 <= test_size < n_windows:
            raise ValueError(
                'test_size=%r does not leave any training windows '
                '(only %d windows available)' % (test_size, n_windows))
        windows_per_road.append(
            _sliding_windows(road_vol, sequence_length, n_windows))

    # Shape: (n_windows, sequence_length, n_locations)
    result = np.dstack(windows_per_road)

    row = result.shape[0] - test_size
    train = result[:row, :, :]
    # Shuffles along the first axis only; the test windows stay in time order.
    np.random.shuffle(train)
    X_train = train[:, :-1, :]
    y_train = train[:, -1, :]
    X_test = result[row:, :-1, :]
    y_test = result[row:, -1, :]

    return [X_train, y_train, X_test, y_test]

In [6]:
def lstm_model():
    """Build and compile a two-layer stacked LSTM regressor.

    Layer stack, in order:
      * LSTM with 200 units that takes ``len(hfs)`` input features and
        returns the full sequence,
      * Dropout(0.2),
      * LSTM with 200 units that returns only its last output,
      * Dropout(0.2),
      * Dense with ``len(hfs)`` units followed by a linear activation
        (regression output, one value per location).

    The model is compiled with MSE loss and the Adam optimizer, and the time
    spent in ``compile`` is printed.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    n_series = len(hfs)
    hidden_units = 200

    network = Sequential()

    # First recurrent layer: must return sequences so the next recurrent
    # layer receives one vector per time step.
    network.add(LSTM(input_dim=n_series,
                     output_dim=hidden_units,
                     return_sequences=True))
    network.add(Dropout(0.2))

    # Second recurrent layer: only its final output is passed on.
    network.add(LSTM(hidden_units, return_sequences=False))
    network.add(Dropout(0.2))

    # Linear read-out, one output per modelled location.
    network.add(Dense(output_dim=n_series))
    network.add(Activation("linear"))

    compile_started = time.time()
    network.compile(loss="mse", optimizer="adam")
    print("Compilation Time : ", time.time() - compile_started)
    return network

In [7]:
def simple_rnn_model():
    """Build and compile a two-layer stacked SimpleRNN regressor.

    Layer stack, in order:
      * SimpleRNN with 200 units that takes ``len(hfs)`` input features and
        returns the full sequence,
      * Dropout(0.1),
      * SimpleRNN with 200 units that returns only its last output,
      * Dropout(0.2),
      * Dense with ``len(hfs)`` units followed by a linear activation
        (regression output, one value per location).

    The model is compiled with MSE loss and the Adam optimizer, and the time
    spent in ``compile`` is printed.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    n_series = len(hfs)
    hidden_units = 200

    network = Sequential()

    # First recurrent layer: must return sequences so the next recurrent
    # layer receives one vector per time step. Note the lighter 10% dropout.
    network.add(SimpleRNN(input_dim=n_series,
                          output_dim=hidden_units,
                          return_sequences=True))
    network.add(Dropout(0.1))

    # Second recurrent layer: only its final output is passed on.
    network.add(SimpleRNN(hidden_units, return_sequences=False))
    network.add(Dropout(0.2))

    # Linear read-out, one output per modelled location.
    network.add(Dense(output_dim=n_series))
    network.add(Activation("linear"))

    compile_started = time.time()
    network.compile(loss="mse", optimizer="adam")
    print("Compilation Time : ", time.time() - compile_started)
    return network

In [8]:
def gru_model():
    """Build and compile a two-layer stacked GRU regressor.

    Layer stack, in order:
      * GRU with 200 units that takes ``len(hfs)`` input features and
        returns the full sequence,
      * Dropout(0.2),
      * GRU with 200 units that returns only its last output,
      * Dropout(0.2),
      * Dense with ``len(hfs)`` units followed by a linear activation
        (regression output, one value per location).

    The model is compiled with MSE loss and the Adam optimizer, and the time
    spent in ``compile`` is printed.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    n_series = len(hfs)
    hidden_units = 200

    network = Sequential()

    # First recurrent layer: must return sequences so the next recurrent
    # layer receives one vector per time step.
    network.add(GRU(input_dim=n_series,
                    output_dim=hidden_units,
                    return_sequences=True))
    network.add(Dropout(0.2))

    # Second recurrent layer: only its final output is passed on.
    network.add(GRU(hidden_units, return_sequences=False))
    network.add(Dropout(0.2))

    # Linear read-out, one output per modelled location.
    network.add(Dense(output_dim=n_series))
    network.add(Activation("linear"))

    compile_started = time.time()
    network.compile(loss="mse", optimizer="adam")
    print("Compilation Time : ", time.time() - compile_started)
    return network

In [9]:
def run_network(data, mdl_type, epochs=20, batch_size=512, validation_split=0.05):
    """Build, train and evaluate one recurrent model on prepared data.

    Parameters
    ----------
    data : sequence of four items
        ``(X_train, y_train, X_test, y_test)`` as returned by
        ``train_test_traffic_data``.
    mdl_type : str
        ``'rnn'`` (``simple_rnn_model``), ``'gru'`` (``gru_model``) or
        ``'lstm'`` (``lstm_model``).
    epochs : int
        Number of training epochs, forwarded to ``fit`` as ``nb_epoch``.
    batch_size : int
        Mini-batch size forwarded to ``fit``.
    validation_split : float
        Fraction of the training data that ``fit`` holds out for validation.

    Returns
    -------
    tuple
        ``(model, y_test, predictions)``. If training or prediction is
        interrupted with Ctrl-C / kernel interrupt, ``predictions`` is the
        placeholder ``0``.

    Raises
    ------
    ValueError
        If ``mdl_type`` is not one of the supported names. (Previously the
        function returned ``None``, which made the caller's three-way
        unpacking fail with an unrelated TypeError.)
    """
    global_start_time = time.time()

    # Fail fast, before any "Compiling..." message is printed.
    if mdl_type not in ('rnn', 'gru', 'lstm'):
        raise ValueError(
            "unknown mdl_type %r; expected 'rnn', 'gru' or 'lstm'" % (mdl_type,))

    X_train, y_train, X_test, y_test = data

    print('\nData Loaded. Compiling...\n')

    if mdl_type == 'rnn':
        mdl = simple_rnn_model()
    elif mdl_type == 'gru':
        mdl = gru_model()
    else:
        mdl = lstm_model()

    try:
        mdl.fit(X_train, y_train, batch_size=batch_size,
                nb_epoch=epochs, validation_split=validation_split)
        predicted_traffic = mdl.predict(X_test)
    except KeyboardInterrupt:
        # A manual interrupt still hands back the (partially trained) model.
        print('Training duration (s) : ', time.time() - global_start_time)
        return mdl, y_test, 0

    # The reported duration covers model construction, fitting and prediction.
    print('Training duration (s) : ', time.time() - global_start_time)

    return mdl, y_test, predicted_traffic

In [10]:
def plot_predictions(y_test, predicted, horizon, hf_index=2):
    """Plot actual vs. predicted volume for one location with plotly (offline).

    Both series are converted back to the original scale with the
    module-level ``stds`` / ``means`` filled by ``train_test_traffic_data``.

    Parameters
    ----------
    y_test : array, shape (n_samples, n_locations)
        Standardised ground-truth targets.
    predicted : array, shape (n_samples, n_locations)
        Standardised model predictions.
    horizon : int
        Sample width in minutes; sets the spacing of the time axis.
    hf_index : int
        Column (position in ``used_hfs``) to plot. The default 2 is
        HF 16913.

    Notes
    -----
    The plotted span is ``30 * 96 * 15`` minutes -- presumably 30 days of
    96 fifteen-minute intervals -- i.e. ``(30 * 96 * 15) // horizon``
    samples. The series are cut to exactly that many points so that they
    match the time axis for every horizon (earlier they were always cut to
    2880 points, which only matched for ``horizon=15``).
    """
    # Integer sample count; a float here is rejected/deprecated by pandas.
    n_points = int((30 * 96 * 15) // horizon)

    actual = (np.transpose(y_test)[hf_index] * stds[hf_index]) + means[hf_index]
    predictions = (np.transpose(predicted)[hf_index] * stds[hf_index]) + means[hf_index]

    # Never ask for more points than are available.
    n_points = min(n_points, len(actual), len(predictions))
    actual = actual[:n_points]
    predictions = predictions[:n_points]

    t = pd.date_range('6/1/2012', freq=(str(horizon)+'Min'), periods=n_points)
    trace0 = go.Scatter(
        x = t,
        y = actual,
        name = 'Actual')
    trace1 = go.Scatter(
        x = t,
        y = predictions,
        name = 'Predicted')
    data = [trace0, trace1]

    layout = dict(xaxis = dict(title = 'Time'),
                  yaxis = dict(title = 'Volume'),
                  width = 600, height = 450)

    fig = dict(data=data, layout=layout)
    plotly.offline.plot(fig)

In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100`` element-wise.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of equal (or broadcastable) shape.
        Plain Python sequences are accepted; both inputs are converted to
        float arrays first.

    Returns
    -------
    numpy.float64
        The error in percent. Entries where ``y_true`` is zero make the
        result ``inf`` or ``nan``, because the relative error is undefined
        there.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def print_scores(y_test, predicted, n_samples=2880):
    """Print MAE, RMSE and MAPE per location, on the original volume scale.

    For every entry of ``used_hfs`` the matching column of ``y_test`` and
    ``predicted`` is cut to its first ``n_samples`` rows, converted back to
    the original scale with the module-level ``stds`` / ``means`` and scored.

    Parameters
    ----------
    y_test : array, shape (n_test, n_locations)
        Standardised ground-truth targets.
    predicted : array, shape (n_test, n_locations)
        Standardised model predictions.
    n_samples : int
        Number of leading test samples to score (default 2880, the value
        that used to be hard-coded).
    """
    y_test_tp = np.transpose(y_test)
    pred_tp = np.transpose(predicted)
    for i, hf_no in enumerate(used_hfs):
        print("Score for location -- ", hf_no)

        # Undo the standardisation applied in train_test_traffic_data.
        actual = (y_test_tp[i][:n_samples] * stds[i]) + means[i]
        predictions = (pred_tp[i][:n_samples] * stds[i]) + means[i]

        mae = mean_absolute_error(actual, predictions)
        mse = mean_squared_error(actual, predictions)
        mape = mean_absolute_percentage_error(actual, predictions)

        # np.float64 keeps the printed number format of the earlier
        # implementation, which printed the mean of a one-element array.
        print("MAE=", np.float64(mae))
        print("RMSE=", math.sqrt(mse))
        print("MAPE=", np.float64(mape))

In [12]:
# 50-sample windows (49 input steps + 1 target) at the 15-minute horizon,
# i.e. without any aggregation of the raw series.
data = train_test_traffic_data(sequence_length=50, horizon=15)

In [13]:
# LSTM at the 15-minute horizon (the model is compiled with the Adam optimizer).
model, y_test, predicted = run_network(data, mdl_type='lstm')
plot_predictions(y_test, predicted, horizon=15)


Data Loaded. Compiling...

Compilation Time :  0.14513826370239258
Train on 180346 samples, validate on 9492 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training duration (s) :  2634.8064951896667


In [None]:
# NOTE(review): this cell carries no execution count and repeats the call made
# in the following cell -- it looks like a leftover duplicate.
print_scores(y_test, predicted)

In [14]:
# Per-location error metrics for the model trained most recently
# (`y_test` / `predicted` as left behind by the last run_network call).
print_scores(y_test, predicted)

Score for location --  16911
MAE= 18.4738563386
RMSE= 26.896590971019677
MAPE= 13.4422331881
Score for location --  16912
MAE= 33.7164163861
RMSE= 46.29273452175229
MAPE= 11.7847674848
Score for location --  16913
MAE= 16.8919732008
RMSE= 23.330038599494074
MAPE= 11.13971222
Score for location --  278
MAE= 21.604247449
RMSE= 31.12366563640115
MAPE= 9.15774795384
Score for location --  10528
MAE= 48.6387054214
RMSE= 73.21846192801802
MAPE= 8.39587659716
Score for location --  16515
MAE= 13.0034327644
RMSE= 18.13298466212703
MAPE= 9.09016768457
Score for location --  14479
MAE= 9.7952562152
RMSE= 13.90688742427811
MAPE= 12.9090974555
Score for location --  16551
MAE= 9.44790041014
RMSE= 14.385696775710072
MAPE= 15.0449544707
Score for location --  6297
MAE= 21.2015338858
RMSE= 29.19259353002577
MAPE= 16.6695313828
Score for location --  16537
MAE= 19.2027928853
RMSE= 27.58364931949823
MAPE= 15.8594734262
Score for location --  16538
MAE= 10.3288225063
RMSE= 14.895839977583199
MAPE= 21.60

In [15]:
# Write a diagram of whatever network is currently bound to `model` to a PNG
# file under the LaTeX thesis figures directory.
from keras.utils.visualize_util import plot
plot(model, to_file='../latex-thesis/Figures/lstm_multi_variate.png')

In [16]:
# SimpleRNN at the 15-minute horizon: train, plot one location, print scores.
model, y_test, predicted = run_network(data, mdl_type='rnn')
plot_predictions(y_test, predicted, horizon=15)
print_scores(y_test, predicted)


Data Loaded. Compiling...

Compilation Time :  0.09337639808654785
Train on 180346 samples, validate on 9492 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training duration (s) :  605.0378684997559
Score for location --  16911
MAE= 21.4180903053
RMSE= 30.969327292836034
MAPE= 21.0719342879
Score for location --  16912
MAE= 36.7263651369
RMSE= 50.26987355569718
MAPE= 13.2481267114
Score for location --  16913
MAE= 17.6024973365
RMSE= 24.082346822842666
MAPE= 11.710264715
Score for location --  278
MAE= 23.9281443956
RMSE= 34.337499167262514
MAPE= 10.5690278426
Score for location --  10528
MAE= 54.2409738529
RMSE= 82.06406412624041
MAPE= 8.56747719451
Score for location --  16515
MAE= 13.9715130791
RMSE= 19.606265829725707
MAPE= 9.89884457617
Score for location --  14479
MAE= 10.3848932914
RMSE= 1

In [17]:
# GRU at the 15-minute horizon: train, plot one location, print scores.
model, y_test, predicted = run_network(data, mdl_type='gru')
plot_predictions(y_test, predicted, horizon=15)
print_scores(y_test, predicted)


Data Loaded. Compiling...

Compilation Time :  0.12986278533935547
Train on 180346 samples, validate on 9492 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training duration (s) :  2145.4050517082214
Score for location --  16911
MAE= 19.614308729
RMSE= 27.318034313273593
MAPE= 15.4321001333
Score for location --  16912
MAE= 34.5636534159
RMSE= 46.71282794443207
MAPE= 11.9897595904
Score for location --  16913
MAE= 16.8822673671
RMSE= 23.238239629039057
MAPE= 10.6814780894
Score for location --  278
MAE= 22.1302403664
RMSE= 32.094550354011545
MAPE= 9.64349021489
Score for location --  10528
MAE= 48.969545147
RMSE= 74.29664903798668
MAPE= 7.88725916246
Score for location --  16515
MAE= 12.9478286805
RMSE= 18.235937190701137
MAPE= 9.09353587302
Score for location --  14479
MAE= 10.2359967053
RMSE= 1

In [18]:
# Rebuild the windows at the 30-minute horizon (pairs of 15-minute samples
# are summed); this also refreshes the global means/stds.
data = train_test_traffic_data(sequence_length=50, horizon=30)

In [19]:
# LSTM at the 30-minute horizon: train, plot one location, print scores.
model, y_test, predicted = run_network(data, mdl_type='lstm')
plot_predictions(y_test, predicted, horizon=30)
print_scores(y_test, predicted)


Data Loaded. Compiling...

Compilation Time :  0.8224425315856934
Train on 87641 samples, validate on 4613 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training duration (s) :  1307.692845582962
Score for location --  16911
MAE= 34.8373849803
RMSE= 49.38352727773384
MAPE= 11.2766515383
Score for location --  16912
MAE= 59.9773072475
RMSE= 80.6007543616347
MAPE= 10.6357308331
Score for location --  16913
MAE= 27.2398337555
RMSE= 37.34380304839516
MAPE= 8.38121869548
Score for location --  278
MAE= 42.0839828808
RMSE= 60.697106787406504
MAPE= 8.48570793633
Score for location --  10528
MAE= 98.3736110205
RMSE= 153.2230878186524
MAPE= 7.66062084482
Score for location --  16515
MAE= 25.4705983799
RMSE= 35.05898506956178
MAPE= 9.17275259992
Score for location --  14479
MAE= 16.5823838004
RMSE= 23.050

In [20]:
# SimpleRNN at the 30-minute horizon: train, plot one location, print scores.
model, y_test, predicted = run_network(data, mdl_type='rnn')
plot_predictions(y_test, predicted, horizon=30)
print_scores(y_test, predicted)


Data Loaded. Compiling...

Compilation Time :  0.09027552604675293
Train on 87641 samples, validate on 4613 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training duration (s) :  302.7214603424072
Score for location --  16911
MAE= 39.6094768434
RMSE= 54.317785896624116
MAPE= 16.8055110254
Score for location --  16912
MAE= 62.3687443437
RMSE= 84.77454771357252
MAPE= 10.6934416049
Score for location --  16913
MAE= 28.9846614766
RMSE= 38.99115601252782
MAPE= 9.92909104434
Score for location --  278
MAE= 45.8551680703
RMSE= 65.37353035795506
MAPE= 9.51849659732
Score for location --  10528
MAE= 119.21696685
RMSE= 171.45068846203506
MAPE= 13.1365469767
Score for location --  16515
MAE= 33.6861738244
RMSE= 42.68929018413291
MAPE= 18.4898529629
Score for location --  14479
MAE= 17.3965213833
RMSE= 24.2

In [21]:
# GRU at the 30-minute horizon: train, plot one location, print scores.
model, y_test, predicted = run_network(data, mdl_type='gru')
plot_predictions(y_test, predicted, horizon=30)
print_scores(y_test, predicted)


Data Loaded. Compiling...

Compilation Time :  0.13332629203796387
Train on 87641 samples, validate on 4613 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training duration (s) :  1053.412100315094
Score for location --  16911
MAE= 35.2786439671
RMSE= 49.71022859430127
MAPE= 12.248723862
Score for location --  16912
MAE= 61.4741745541
RMSE= 83.15482327732047
MAPE= 10.1214823629
Score for location --  16913
MAE= 27.9662211092
RMSE= 38.73012041186704
MAPE= 8.50860863477
Score for location --  278
MAE= 44.4400605499
RMSE= 63.036738360353894
MAPE= 9.24274389467
Score for location --  10528
MAE= 101.18523346
RMSE= 156.61170114273975
MAPE= 7.82700847062
Score for location --  16515
MAE= 25.1302749325
RMSE= 34.750211844386634
MAPE= 8.50982177143
Score for location --  14479
MAE= 17.4504469073
RMSE= 24.0

In [22]:
# Rebuild the windows at the 45-minute horizon (triples of 15-minute samples
# are summed); this also refreshes the global means/stds.
data = train_test_traffic_data(sequence_length=50, horizon=45)

In [23]:
# LSTM at the 45-minute horizon: train, plot one location, print scores.
model, y_test, predicted = run_network(data, mdl_type='lstm')
plot_predictions(y_test, predicted, horizon=45)
print_scores(y_test, predicted)


Data Loaded. Compiling...

Compilation Time :  0.14387798309326172
Train on 56739 samples, validate on 2987 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training duration (s) :  854.2795176506042
Score for location --  16911
MAE= 49.5279348404
RMSE= 74.41908977537739
MAPE= 10.7647160186
Score for location --  16912
MAE= 80.7734128202
RMSE= 114.65745225307414
MAPE= 8.26565549167
Score for location --  16913
MAE= 38.1637087941
RMSE= 51.23748954289149
MAPE= 7.30430503369
Score for location --  278
MAE= 63.1893121194
RMSE= 89.36024566913335
MAPE= 8.94884401213
Score for location --  10528
MAE= 146.922944347
RMSE= 217.88657908002568
MAPE= 7.47544381659
Score for location --  16515
MAE= 36.9814829836
RMSE= 51.51672591007299
MAPE= 8.74048442971
Score for location --  14479
MAE= 23.0587044193
RMSE= 32.

In [24]:
# SimpleRNN at the 45-minute horizon: train, plot one location, print scores.
model, y_test, predicted = run_network(data, 'rnn')
# `data` was built with horizon=45, so the plot's time axis must use 45-minute
# steps as well (the call previously passed 30).
plot_predictions(y_test, predicted, 45)
print_scores(y_test, predicted)


Data Loaded. Compiling...

Compilation Time :  0.09089922904968262
Train on 56739 samples, validate on 2987 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training duration (s) :  197.16770243644714
Score for location --  16911
MAE= 55.5314462354
RMSE= 85.75922007737587
MAPE= 15.9779847905
Score for location --  16912
MAE= 89.4909429107
RMSE= 129.12866013338873
MAPE= 10.262761006
Score for location --  16913
MAE= 42.9584989163
RMSE= 55.82524194539991
MAPE= 10.1727936595
Score for location --  278
MAE= 66.9147374171
RMSE= 94.75954998247651
MAPE= 9.616866913
Score for location --  10528
MAE= 153.49957044
RMSE= 225.81726288141715
MAPE= 8.58687722199
Score for location --  16515
MAE= 37.5518795136
RMSE= 54.22020724687148
MAPE= 9.5046305685
Score for location --  14479
MAE= 26.0305981627
RMSE= 34.6341

In [25]:
# GRU at the 45-minute horizon: train, plot one location, print scores.
model, y_test, predicted = run_network(data, 'gru')
# `data` was built with horizon=45, so the plot's time axis must use 45-minute
# steps as well (the call previously passed 30).
plot_predictions(y_test, predicted, 45)
print_scores(y_test, predicted)


Data Loaded. Compiling...

Compilation Time :  0.13007116317749023
Train on 56739 samples, validate on 2987 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training duration (s) :  694.7314169406891
Score for location --  16911
MAE= 49.9721079909
RMSE= 74.08778009527046
MAPE= 11.6677845621
Score for location --  16912
MAE= 82.7051381751
RMSE= 114.95718825454209
MAPE= 8.90417144177
Score for location --  16913
MAE= 38.3948360356
RMSE= 51.475846421854214
MAPE= 7.59042294297
Score for location --  278
MAE= 63.5760660113
RMSE= 89.54101008479809
MAPE= 9.12118263307
Score for location --  10528
MAE= 141.701210049
RMSE= 210.62165199765337
MAPE= 7.17659161473
Score for location --  16515
MAE= 35.0849083579
RMSE= 48.27939598891731
MAPE= 8.80365246738
Score for location --  14479
MAE= 23.2860862718
RMSE= 32