In [1]:
from pandas import DataFrame
from pandas import Series
from pandas import concat
from pandas import read_csv
from pandas import datetime
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from math import sqrt
import matplotlib
# be able to save images on server
matplotlib.use('Agg')
from matplotlib import pyplot
import numpy

  from pandas import datetime


In [2]:
def get_mape(y_true, y_pred): 
    """
    Compute mean absolute percentage error (MAPE)
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
# frame a sequence as a supervised learning problem
def timeseries_to_supervised(data, lag=1):
    df = DataFrame(data)
    columns = [df.shift(i) for i in range(1, lag+1)]
    columns.append(df)
    df = concat(columns, axis=1)
    df = df.drop(0)
    return df

In [4]:
# create a differenced series
def difference(dataset, interval=1):
    diff = list()
    for i in range(interval, len(dataset)):
        value = dataset[i] - dataset[i - interval]
        diff.append(value)
    return Series(diff)

In [37]:
# invert differenced value
def inverse_difference(history, yhat, interval=1):
    return yhat + history[-interval]

In [5]:
# scale train and test data to [-1, 1]
def scale(train, test):
    # fit scaler
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = scaler.fit(train)
    # transform train
    train = train.reshape(train.shape[0], train.shape[1])
    train_scaled = scaler.transform(train)
    # transform test
    test = test.reshape(test.shape[0], test.shape[1])
    test_scaled = scaler.transform(test)
    return scaler, train_scaled, test_scaled

In [6]:
# inverse scaling for a forecasted value
def invert_scale(scaler, X, yhat):
    new_row = [x for x in X] + [yhat]
    array = numpy.array(new_row)
    array = array.reshape(1, len(array))
    inverted = scaler.inverse_transform(array)
    return inverted[0, -1]

In [7]:
# evaluate the model on a dataset, returns RMSE in transformed units
def evaluate(model, raw_data, scaled_dataset, scaler, offset, batch_size):
    # separate
    X, y = scaled_dataset[:,0:-1], scaled_dataset[:,-1]
    # reshape
    reshaped = X.reshape(len(X), 1, 1)
    # forecast dataset
    output = model.predict(reshaped, batch_size=batch_size)
    # invert data transforms on forecast
    predictions = list()
    for i in range(len(output)):
        yhat = output[i,0]
        # invert scaling
        yhat = invert_scale(scaler, X[i], yhat)
        # invert differencing
        yhat = yhat + raw_data[i]
        # store forecast
        predictions.append(yhat)
    # report performance
    rmse = sqrt(mean_squared_error(raw_data[1:], predictions))
    return rmse

In [32]:
# fit an LSTM network to training data
def fit_lstm(train, batch_size, nb_epoch, neurons):
    X, y = train[:, 0:-1], train[:, -1]
    X = X.reshape(X.shape[0], 1, X.shape[1])
    model = Sequential()
    model.add(LSTM(neurons, batch_input_shape=(batch_size, X.shape[1], X.shape[2]), stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    for i in range(nb_epoch):
        model.fit(X, y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)
        model.reset_states()
    return model

In [52]:
# run diagnostic experiments
def run(series, epochs):
    # transform data to be stationary
    raw_values = series['Close'].values
    diff_values = difference(raw_values, 1)
    # transform data to be supervised learning
    supervised = timeseries_to_supervised(diff_values, 1)
    supervised_values = supervised.values
    # split data into train and test-sets
    test_size = 0.2                                                # proportion of dataset to be used as test set
    cv_size = 0.2                                                   # proportion of dataset to be used as cross-validation set
    num_cv = int(cv_size*len(series))
    num_test = int(test_size*len(series))
    num_train = len(series) - num_cv - num_test
    print("num_train = " + str(num_train))
    print("num_cv = " + str(num_cv))
    print("num_test = " + str(num_test))
    
    #train, test = supervised_values[:num_train], supervised_values[-12:]
    train = supervised_values[:num_train]
    cv = supervised_values[num_train:num_train+num_cv]
    train_cv = supervised_values[:num_train+num_cv]
    test = supervised_values[num_train+num_cv:]
    
    # transform the scale of the data
    scaler, train_scaled, test_scaled = scale(train, test)
#     # fit and evaluate model
#     train_trimmed = train_scaled[2:, :]
#     # config
#     repeats = 5
#     n_batch = batch
#     n_epochs = epochs
#     n_neurons = neurons
#     # run diagnostic tests
#     for i in range(repeats):
#         history = fit_lstm(train_trimmed, test_scaled, raw_values, scaler, n_batch, n_epochs, n_neurons)
#         pyplot.plot(history['train'], color='blue')
#         pyplot.plot(history['test'], color='orange')
#         print('%d) TrainRMSE=%f, TestRMSE=%f' % (i, history['train'].iloc[-1], history['test'].iloc[-1]))
#     pyplot.savefig('epochs_diagnostic.png')

    # run experiment
    error_scores = list()
    for r in range(5): # repeat 5 times
        # fit the model
        batch_size = 2
        train_trimmed = train_scaled[2:, :]
        lstm_model = fit_lstm(train_trimmed, batch_size, epochs, 1)
        # forecast the entire training dataset to build up state for forecasting
        train_reshaped = train_trimmed[:, 0].reshape(len(train_trimmed), 1, 1)
        lstm_model.predict(train_reshaped, batch_size=batch_size)
        # forecast test dataset
        test_reshaped = test_scaled[:,0:-1]
        test_reshaped = test_reshaped.reshape(len(test_reshaped), 1, 1)
        output = lstm_model.predict(test_reshaped, batch_size=batch_size)
        predictions = list()
        for i in range(len(output)):
            yhat = output[i,0]
            X = test_scaled[i, 0:-1]
            # invert scaling
            yhat = invert_scale(scaler, X, yhat)
            # invert differencing
            yhat = inverse_difference(raw_values, yhat, len(test_scaled)+1-i)
            # store forecast
            predictions.append(yhat)
        # report performance
        # print(len(predictions))
        rmse = sqrt(mean_squared_error(raw_values[num_train+num_cv+2:], predictions))
        print('%d) Test RMSE: %.3f' % (r+1, rmse))
        error_scores.append(rmse)
    return error_scores

# Tuning the Number of Epochs

In [53]:
series = read_csv('../data/dataset.csv', header=0, parse_dates=[0], index_col=0, squeeze=True)
# experiment
results = DataFrame()
# vary training epochs
epochs = [100, 500, 1000]
for e in epochs:
    results[str(e)] = run(series, e)
# summarize results
print(results.describe())
# save boxplot
results.boxplot()
pyplot.savefig('boxplot_epochs.png')

num_train = 438
num_cv = 146
num_test = 146
1) Test RMSE: 1703.747
2) Test RMSE: 1702.548
3) Test RMSE: 1702.533
4) Test RMSE: 1705.193
5) Test RMSE: 1704.416
num_train = 438
num_cv = 146
num_test = 146
1) Test RMSE: 1719.411
2) Test RMSE: 1718.971
3) Test RMSE: 1718.890
4) Test RMSE: 1711.849
5) Test RMSE: 1719.525
num_train = 438
num_cv = 146
num_test = 146
1) Test RMSE: 1719.326
2) Test RMSE: 1720.551
3) Test RMSE: 1718.219
4) Test RMSE: 1702.538
5) Test RMSE: 1702.538
               100          500         1000
count     5.000000     5.000000     5.000000
mean   1703.687542  1717.729271  1712.634508
std       1.165236     3.298373     9.253619
min    1702.533243  1711.849267  1702.537997
25%    1702.548222  1718.889873  1702.537999
50%    1703.747220  1718.970958  1718.219465
75%    1704.416033  1719.410791  1719.326435
max    1705.192992  1719.525465  1720.550643


# Tuning the Batch Size

In [56]:
# run diagnostic experiments
def run(series, batch_size):
    # transform data to be stationary
    raw_values = series['Close'].values
    diff_values = difference(raw_values, 1)
    # transform data to be supervised learning
    supervised = timeseries_to_supervised(diff_values, 1)
    supervised_values = supervised.values
    # split data into train and test-sets
    test_size = 0.2                                                # proportion of dataset to be used as test set
    cv_size = 0.2                                                   # proportion of dataset to be used as cross-validation set
    num_cv = int(cv_size*len(series))
    num_test = int(test_size*len(series))
    num_train = len(series) - num_cv - num_test
    print("num_train = " + str(num_train))
    print("num_cv = " + str(num_cv))
    print("num_test = " + str(num_test))
    
    #train, test = supervised_values[:num_train], supervised_values[-12:]
    train = supervised_values[:num_train]
    cv = supervised_values[num_train:num_train+num_cv]
    train_cv = supervised_values[:num_train+num_cv]
    test = supervised_values[num_train+num_cv:]
    
    # transform the scale of the data
    scaler, train_scaled, test_scaled = scale(train, test)
    
    # run experiment
    error_scores = list()
    for r in range(5): # repeat 5 times
        # fit the model
        epochs = 100
        train_trimmed = train_scaled[2:, :]
        lstm_model = fit_lstm(train_trimmed, batch_size, epochs, 1)
        # forecast the entire training dataset to build up state for forecasting
        train_reshaped = train_trimmed[:, 0].reshape(len(train_trimmed), 1, 1)
        lstm_model.predict(train_reshaped, batch_size=batch_size)
        # forecast test dataset
        test_reshaped = test_scaled[:,0:-1]
        test_reshaped = test_reshaped.reshape(len(test_reshaped), 1, 1)
        output = lstm_model.predict(test_reshaped, batch_size=batch_size)
        predictions = list()
        for i in range(len(output)):
            yhat = output[i,0]
            X = test_scaled[i, 0:-1]
            # invert scaling
            yhat = invert_scale(scaler, X, yhat)
            # invert differencing
            yhat = inverse_difference(raw_values, yhat, len(test_scaled)+1-i)
            # store forecast
            predictions.append(yhat)
        # report performance
        # print(len(predictions))
        rmse = sqrt(mean_squared_error(raw_values[num_train+num_cv+2:], predictions))
        print('%d) Test RMSE: %.3f' % (r+1, rmse))
        # Calculate MAPE
        mape = get_mape(raw_values[num_train+num_cv+2:], predictions)
        print('%d) Test MAPE: %.3f' % mape)
        error_scores.append(rmse)
    return error_scores

In [57]:
# load dataset
series = read_csv('../data/dataset.csv', header=0, parse_dates=[0], index_col=0, squeeze=True)
# experiment
results = DataFrame()
# vary training batches
batches = [1, 2, 4]
for b in batches:
    results[str(b)] = run(series, b)
# summarize results
print(results.describe())
# save boxplot
results.boxplot()
pyplot.savefig('boxplot_batches.png')

num_train = 438
num_cv = 146
num_test = 146
1) Test RMSE: 1701.655
2) Test RMSE: 1704.484
3) Test RMSE: 1709.741
4) Test RMSE: 1701.651
5) Test RMSE: 1701.653
num_train = 438
num_cv = 146
num_test = 146
1) Test RMSE: 1702.543
2) Test RMSE: 1704.679
3) Test RMSE: 1707.922
4) Test RMSE: 1702.548
5) Test RMSE: 1709.524
num_train = 438
num_cv = 146
num_test = 146
1) Test RMSE: 1700.624
2) Test RMSE: 1709.342
3) Test RMSE: 1703.242
4) Test RMSE: 1706.924
5) Test RMSE: 1710.377
                 1            2            4
count     5.000000     5.000000     5.000000
mean   1703.836946  1705.443145  1706.101425
std       3.521020     3.169207     4.112080
min    1701.650832  1702.543104  1700.623546
25%    1701.653166  1702.547774  1703.241787
50%    1701.655222  1704.678801  1706.923559
75%    1704.484015  1707.922197  1709.341556
max    1709.741496  1709.523851  1710.376676


# Tuning the Number of Neurons

In [58]:
# run diagnostic experiments
def run(series, neurons):
    # transform data to be stationary
    raw_values = series['Close'].values
    diff_values = difference(raw_values, 1)
    # transform data to be supervised learning
    supervised = timeseries_to_supervised(diff_values, 1)
    supervised_values = supervised.values
    # split data into train and test-sets
    test_size = 0.2                                                # proportion of dataset to be used as test set
    cv_size = 0.2                                                   # proportion of dataset to be used as cross-validation set
    num_cv = int(cv_size*len(series))
    num_test = int(test_size*len(series))
    num_train = len(series) - num_cv - num_test
    print("num_train = " + str(num_train))
    print("num_cv = " + str(num_cv))
    print("num_test = " + str(num_test))
    
    #train, test = supervised_values[:num_train], supervised_values[-12:]
    train = supervised_values[:num_train]
    cv = supervised_values[num_train:num_train+num_cv]
    train_cv = supervised_values[:num_train+num_cv]
    test = supervised_values[num_train+num_cv:]
    
    # transform the scale of the data
    scaler, train_scaled, test_scaled = scale(train, test)
    
    # run experiment
    error_scores = list()
    for r in range(5): # repeat 5 times
        # fit the model
        epochs = 100
        batch_size = 1
        train_trimmed = train_scaled[2:, :]
        lstm_model = fit_lstm(train_trimmed, batch_size, epochs, neurons)
        # forecast the entire training dataset to build up state for forecasting
        train_reshaped = train_trimmed[:, 0].reshape(len(train_trimmed), 1, 1)
        lstm_model.predict(train_reshaped, batch_size=batch_size)
        # forecast test dataset
        test_reshaped = test_scaled[:,0:-1]
        test_reshaped = test_reshaped.reshape(len(test_reshaped), 1, 1)
        output = lstm_model.predict(test_reshaped, batch_size=batch_size)
        predictions = list()
        for i in range(len(output)):
            yhat = output[i,0]
            X = test_scaled[i, 0:-1]
            # invert scaling
            yhat = invert_scale(scaler, X, yhat)
            # invert differencing
            yhat = inverse_difference(raw_values, yhat, len(test_scaled)+1-i)
            # store forecast
            predictions.append(yhat)
        # report performance
        # print(len(predictions))
        rmse = sqrt(mean_squared_error(raw_values[num_train+num_cv+2:], predictions))
        print('%d) Test RMSE: %.3f' % (r+1, rmse))
        error_scores.append(rmse)
    return error_scores

In [59]:
# load dataset
series = read_csv('../data/dataset.csv', header=0, parse_dates=[0], index_col=0, squeeze=True)
# experiment
results = DataFrame()
# vary neurons
neurons = [1, 2, 3, 4, 5]
for n in neurons:
    results[str(n)] = run(series, n)
# summarize results
print(results.describe())
# save boxplot
results.boxplot()
pyplot.savefig('boxplot_neurons.png')

num_train = 438
num_cv = 146
num_test = 146
1) Test RMSE: 1701.651
2) Test RMSE: 1705.397
3) Test RMSE: 1705.758
4) Test RMSE: 1701.654
5) Test RMSE: 1701.656
num_train = 438
num_cv = 146
num_test = 146
1) Test RMSE: 1720.423
2) Test RMSE: 1710.231
3) Test RMSE: 1701.706
4) Test RMSE: 1721.935
5) Test RMSE: 1714.259
num_train = 438
num_cv = 146
num_test = 146
1) Test RMSE: 1721.752
2) Test RMSE: 1718.646
3) Test RMSE: 1714.947
4) Test RMSE: 1721.116
5) Test RMSE: 1722.583
num_train = 438
num_cv = 146
num_test = 146
1) Test RMSE: 1721.643
2) Test RMSE: 1725.821
3) Test RMSE: 1722.500
4) Test RMSE: 1712.994
5) Test RMSE: 1718.563
num_train = 438
num_cv = 146
num_test = 146
1) Test RMSE: 1743.555
2) Test RMSE: 1727.691
3) Test RMSE: 1736.798
4) Test RMSE: 1721.762
5) Test RMSE: 1723.559
                 1            2            3            4            5
count     5.000000     5.000000     5.000000     5.000000     5.000000
mean   1703.223307  1713.710852  1719.808973  1720.304392  1730

AssertionError: 1

In [None]:
# transform data to be stationary
raw_values = series['Close'].values
diff_values = difference(raw_values, 1)
# transform data to be supervised learning
supervised = timeseries_to_supervised(diff_values, 1)
supervised_values = supervised.values
# split data into train and test-sets
test_size = 0.2                                                # proportion of dataset to be used as test set
cv_size = 0.2                                                   # proportion of dataset to be used as cross-validation set
num_cv = int(cv_size*len(series))
num_test = int(test_size*len(series))
num_train = len(series) - num_cv - num_test
print("num_train = " + str(num_train))
print("num_cv = " + str(num_cv))
print("num_test = " + str(num_test))

#train, test = supervised_values[:num_train], supervised_values[-12:]
train = supervised_values[:num_train]
cv = supervised_values[num_train:num_train+num_cv]
train_cv = supervised_values[:num_train+num_cv]
test = supervised_values[num_train+num_cv:]

# transform the scale of the data
scaler, train_scaled, test_scaled = scale(train, test)
train_trimmed = train_scaled[2:, :]
    
lstm_model = fit_lstm(train_trimmed, 1, 100, 1)
lstm_model.predict(train_reshaped, batch_size=batch_size)

test_reshaped = test_scaled[:,0:-1]
test_reshaped = test_reshaped.reshape(len(test_reshaped), 1, 1)
output = lstm_model.predict(test_reshaped, batch_size=batch_size)

print(output)