In [20]:
import numpy as np
import time
import tensorflow as tf
import pandas as pd
import pandas_datareader as web
from datetime import date
pd.options.mode.chained_assignment = None
import seaborn as sns
from matplotlib.pylab import rcParams
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *

%matplotlib inline

# Plot styling: seaborn "whitegrid" theme with the "muted" palette, 14x8 figures.
sns.set(palette='muted', style='whitegrid')
rcParams['figure.figsize'] = (14, 8)

# Seed both TensorFlow and NumPy so repeated runs start from the same RNG state.
tf.random.set_seed(1)
np.random.seed(1)

print('Tensorflow version:', tf.__version__)

Tensorflow version: 2.0.0


In [21]:
# Download daily S&P 500 (^GSPC) quotes from Yahoo, from the start of 2000 up to today.
today = date.today()
today_date = today.strftime("%Y-%m-%d")
first_date = '01-01-2000'
df = web.DataReader('^GSPC', data_source='yahoo', start=first_date, end=today_date)

# Keep only the closing price and expose the index as a regular `date` column
# on a fresh 0..n-1 index, with the columns ordered (date, close).
df = df[['Close']].rename(columns={'Close': 'close'})
df = df.assign(date=df.index).reset_index(drop=True)[['date', 'close']]
df.dtypes

date     datetime64[ns]
close           float64
dtype: object

In [22]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the close price, overwriting the `close` column in place.
# NOTE(review): the scaler is fitted on the whole series, i.e. before the
# train/test split performed further down, so test-period extremes influence
# the scaling of the training data.
scaler = MinMaxScaler()
df['close'] = scaler.fit_transform(df[['close']])
df

Unnamed: 0,date,close
0,2000-01-03,0.287380
1,2000-01-04,0.266787
2,2000-01-05,0.267779
3,2000-01-06,0.268274
4,2000-01-07,0.282305
...,...,...
5150,2020-06-23,0.905943
5151,2020-06-24,0.876064
5152,2020-06-25,0.888401
5153,2020-06-26,0.860829


In [23]:
# Interactive line chart of the daily close over the whole sample
# (`go` is plotly.graph_objects).
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df['date'],
        y=df['close'],
        mode='lines',
        name='S&P 500 - daily close price',
    )
)
fig.update_layout(title='S&P 500 time series', showlegend=True)
fig.show()

In [24]:
def create_sequences(X, time_steps = 50):
    ''' Slice a series into overlapping look-back windows and their next-step targets.

    Window i holds rows i .. i + time_steps - 1 of `X`; its target is row
    i + time_steps, i.e. the first row *after* the window.

    X          - pandas DataFrame / Series (or anything np.asarray accepts),
                 ordered in time along the first axis.
    time_steps - Length of each window (non-negative int).

    Returns (Xs, ys) as numpy arrays:
        Xs has shape (n_windows, time_steps) + X.shape[1:]
        ys has shape (n_windows,) + X.shape[1:]
    with n_windows = max(len(X) - time_steps, 0). When the input is too short
    for a single window, both arrays are empty but keep their trailing
    dimensions, so code reading `Xs.shape[1]` / `Xs.shape[2]` still works.

    Raises ValueError if time_steps is negative.
    '''
    if time_steps < 0:
        raise ValueError("time_steps must be non-negative")

    values = np.asarray(X)
    n_windows = max(len(values) - time_steps, 0)

    # Row i of `window_idx` lists the positions i, i+1, ..., i+time_steps-1,
    # so one fancy-indexing gather builds every window at once.
    window_idx = np.arange(n_windows)[:, None] + np.arange(time_steps)[None, :]
    Xs = values[window_idx]

    # Targets are the rows immediately following each window; copy so the
    # result does not alias the caller's data.
    ys = values[time_steps:time_steps + n_windows].copy()
    return Xs, ys

# Chronological 80/20 split: the earliest 80% of rows form the training set,
# the remaining rows are held out as the test set.
train_size = int(0.8 * len(df))
train = df.iloc[:train_size]
test = df.iloc[train_size:]

# Windowed inputs / next-step targets for each subset, plus the full series.
X_train, y_train = create_sequences(train[['close']])
X_test, y_test = create_sequences(test[['close']])
X_full, y_full = create_sequences(df[['close']])

In [26]:
# Window length and feature count, taken from axes 1 and 2 of the training inputs.
timesteps, num_features = X_train.shape[1], X_train.shape[2]
X_train.shape

(4074, 50, 1)

In [27]:
# Define the model: a causal 1D convolution feeding two stacked bidirectional
# LSTMs (each followed by dropout) and a single-unit dense output.
model = Sequential()
# Only the first layer declares the input shape. The time dimension is left as
# None (variable-length sequences); the channel count follows the data.
model.add(Conv1D(filters=32, kernel_size=5, strides=1, padding="causal", activation="relu",
                 input_shape=(None, num_features)))
# return_sequences=True keeps the per-timestep outputs for the next recurrent layer.
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1))

model.compile(loss="mse", optimizer="adam")
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, None, 32)          192       
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 256)         164864    
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 256)         0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               164352    
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 329,537
Trainable params: 329,537
Non-trainable params: 0
________________________________________________

In [28]:
start = time.time()

# Stop training once val_loss has gone 15 consecutive epochs without improving.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=15, mode='min')

# NOTE(review): the model is fitted on X_full / y_full, which also cover the
# test period, so later predictions on that period are in-sample. Confirm this
# is intended before reading the plots as out-of-sample performance.
training = model.fit(
    X_full, y_full,
    epochs=50,
    batch_size=32,
    validation_split=0.1,   # Keras holds out the last 10% of the samples for validation
    callbacks=[early_stop],
    shuffle=False,          # keep chronological order: order matters in time series
)

# The timer wraps model.fit, so this is training time (not compilation time).
print("> Training Time : {} seconds".format(round(time.time() - start, 3)))

Train on 4594 samples, validate on 511 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
> Compilation Time : 961.607 seconds


In [33]:
# Dates of the test rows, skipping the first `timesteps` rows of the test set.
test['date'][timesteps:]

4174   2016-08-05
4175   2016-08-08
4176   2016-08-09
4177   2016-08-10
4178   2016-08-11
          ...    
5150   2020-06-23
5151   2020-06-24
5152   2020-06-25
5153   2020-06-26
5154   2020-06-29
Name: date, Length: 981, dtype: datetime64[ns]

In [34]:
# One-step-ahead predictions for every window of the full series.
y_full_pred = model.predict(X_full)

# Overlay the actual close (back in price units) and the model's predictions;
# predictions start `timesteps` rows in, after the first complete window.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df['date'],
        y=pd.Series(scaler.inverse_transform(df[['close']]).ravel()),
        mode='lines',
        name='Close price test',
    )
)
fig.add_trace(
    go.Scatter(
        x=df['date'][timesteps:],
        y=pd.Series(scaler.inverse_transform(y_full_pred.reshape(-1, 1)).ravel()),
        mode='lines',
        name='Prediction',
    )
)
fig.update_layout(title='S&P 500 - checking model performance', showlegend=True)
fig.show()

In [35]:

def predict_n_ahead(n_future_preds):
    ''' Recursively forecast `n_future_preds` steps past the last observed close.

    Each step feeds the current window of scaled values to `model`, records the
    one-step prediction, then slides the window forward by dropping its oldest
    value and appending that prediction.

    Uses the notebook globals `model`, `scaler`, `X_test`, `y_test` and `df`,
    and assumes a single feature (the window is reshaped to (1, -1, 1)).

    n_future_preds - Number of future predictions to make (non-negative int).

    Returns (dates, values), two pd.Series of length n_future_preds + 1.
    Element 0 is the last observed date / close, so the forecast line joins the
    history; the remaining elements are the forecasts in original price units,
    dated on consecutive business days (Mon-Fri; exchange holidays are not
    accounted for).

    Raises ValueError if n_future_preds is negative.
    '''
    if n_future_preds < 0:
        raise ValueError("n_future_preds must be non-negative")

    last_observed = np.ravel(y_test[-1])          # most recent scaled close, shape (1,)
    preds_moving = [float(last_observed[0])]

    # X_test[-1] stops one step *before* y_test[-1]. Shift the latest
    # observation in so that the first prediction targets the first unseen
    # step rather than re-estimating the last known one.
    moving_test_window = np.concatenate(
        [np.ravel(X_test[-1])[1:], last_observed]
    ).reshape(1, -1, 1)

    for _ in range(n_future_preds):
        # The prediction is in scaled units, like the window, so it can be fed back as is.
        next_scaled = float(model.predict(moving_test_window)[0, 0])
        preds_moving.append(next_scaled)

        # Slide the window: drop the oldest value, append the new prediction.
        newest = np.full((1, 1, 1), next_scaled, dtype=moving_test_window.dtype)
        moving_test_window = np.concatenate([moving_test_window[:, 1:, :], newest], axis=1)

    # Date the forecasts on business days so none of them land on a weekend.
    last_date = df.date.iloc[-1]
    future_dates = pd.bdate_range(start=last_date + pd.offsets.BDay(1), periods=n_future_preds)
    dates_for_future_values = pd.Series([last_date] + list(future_dates))

    # Map everything back from the scaler's units to prices.
    preds_moving = scaler.inverse_transform(np.array(preds_moving).reshape(-1, 1))

    return dates_for_future_values, pd.Series(preds_moving.ravel())

In [42]:
# Number of future steps to forecast past the last observed close; every step
# is predicted from the model's own earlier predictions.
n_days = 3000
dates_for_future_values, future_values = predict_n_ahead(n_days) 

In [43]:
# Historical close (in price units, from row `timesteps` onward) followed by
# the forecast returned by predict_n_ahead.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df['date'][timesteps:],
        y=pd.Series(scaler.inverse_transform(df[['close']][timesteps:]).ravel()),
        mode='lines',
        name='Close price',
    )
)
fig.add_trace(
    go.Scatter(
        x=dates_for_future_values,
        y=future_values,
        mode='lines',
        name='Predicted future price',
    )
)
fig.update_layout(
    title='S&P 500 - forecasting prices {} days ahead'.format(n_days),
    showlegend=True,
)
fig.show()