# Time-Series with LSTM 

Description: This program uses an artificial recurrent neural network called Long Short Term Memory (LSTM) to predict the closing stock price of a corporation (Apple Inc.) using the past 60 day stock price.

In [22]:
#Import the libraries
!pip install --upgrade tensorflow

import math
import pandas_datareader as web
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import tensorflow_datasets as tfds
from tensorflow.keras import Sequential
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler
import math

print('TensorFlow version: {}'.format(tf.__version__))
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

Requirement already up-to-date: tensorflow in /usr/local/lib/python3.6/dist-packages (2.1.0)
TensorFlow version: 2.1.0


In [3]:
%tensorflow_version 2.x
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

TensorFlow is already loaded. Please restart the runtime to change versions.
Found GPU at: /device:GPU:0


# Get the data

In [4]:
#Get the stock quote
df = web.DataReader('AAPL', data_source='yahoo', start='2012-01-01', end='2020-02-28')
#Show teh data
df.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-01-03,58.92857,58.42857,58.485714,58.747143,75555200.0,50.994907
2012-01-04,59.240002,58.468571,58.57143,59.062859,65005500.0,51.26897
2012-01-05,59.792858,58.952858,59.278572,59.718571,67817400.0,51.838169
2012-01-06,60.392857,59.888573,59.967144,60.342857,79573200.0,52.380054
2012-01-09,61.107143,60.192856,60.785713,60.247143,98506100.0,52.29697


In [5]:
df.index

DatetimeIndex(['2012-01-03', '2012-01-04', '2012-01-05', '2012-01-06',
               '2012-01-09', '2012-01-10', '2012-01-11', '2012-01-12',
               '2012-01-13', '2012-01-17',
               ...
               '2020-02-14', '2020-02-18', '2020-02-19', '2020-02-20',
               '2020-02-21', '2020-02-24', '2020-02-25', '2020-02-26',
               '2020-02-27', '2020-02-28'],
              dtype='datetime64[ns]', name='Date', length=2052, freq=None)

In [6]:
px.line(df, x=df.index, y='Close')

## Training and Test Data

To be able to create good predictions we will slide a window of 60 values over our data and use these values to predict the next closing value.

For example: The first 60 closing values to predict the 61 values.

In [7]:
#Create a new dataframe with only the 'Close column
data = df.filter(['Close'])
#Convert the dataframe to a numpy array
dataset = data.values
#Get the number of rows to train the model on
training_data_len = math.ceil( len(dataset) * .8 )

training_data_len

1642

In [8]:
data

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2012-01-03,58.747143
2012-01-04,59.062859
2012-01-05,59.718571
2012-01-06,60.342857
2012-01-09,60.247143
...,...
2020-02-24,298.179993
2020-02-25,288.079987
2020-02-26,292.649994
2020-02-27,273.519989


In [0]:
window_size = 60

In [33]:
len(dataset)

2052

In [0]:
scaler = MinMaxScaler((0, 1))
dataset = scaler.fit_transform(dataset)

In [0]:
train_data = dataset[0:training_data_len]
test_data = dataset[training_data_len - 60:]

In [0]:
# test_data

In [98]:
print(train_data.shape)
print(test_data.shape)
print('Train & Test Data length {} and {}, combined length: {} - total length of original data: {}'.format(train_data.shape[0], test_data.shape[0],
                                                                                                   (train_data.shape[0] + test_data.shape[0]),
                                                                                                   len(dataset)))
print('The resulting difference must match the windowsize ({})'.format(window_size))

(1642, 1)
(470, 1)
Train & Test Data length 1642 and 470, combined length: 2112 - total length of original data: 2052
The resulting difference must match the windowsize (60)


In [99]:
# Split the data into X_train and y_train data sets
X_train = []
y_train = []

for i in range(window_size, len(train_data)):
  X_train.append(train_data[i-window_size:i])
  y_train.append(train_data[i])

X_train = np.array(X_train)
y_train = np.array(y_train)

print(X_train.shape)
print(y_train.shape)

(1582, 60, 1)
(1582, 1)


In [100]:
# Create the data sets X_test and y_test
X_test = []
y_test = []

for i in range(window_size, len(test_data)):
  X_test.append(test_data[i-window_size:i])
  y_test.append(test_data[i])

X_test = np.array(X_test)
y_test = np.array(y_test)

print('Type: {} and shape: {}'.format(type(X_test), X_test.shape))
print('Type: {} and shape: {}'.format(type(X_test), y_test.shape))

Type: <class 'numpy.ndarray'> and shape: (410, 60, 1)
Type: <class 'numpy.ndarray'> and shape: (410, 1)


In [101]:
print(len(x_train[1]))
x_train[1]

60


array([0.01205872, 0.01447467, 0.01677483, 0.01642217, 0.01721696,
       0.01685378, 0.01624321, 0.01541158, 0.01798544, 0.02030665,
       0.01959081, 0.01566949, 0.01941185, 0.01572739, 0.02954412,
       0.02847562, 0.02987047, 0.03288645, 0.03471289, 0.03456025,
       0.03399706, 0.03639722, 0.03865526, 0.04121335, 0.04534521,
       0.05402475, 0.05415633, 0.05898825, 0.06259901, 0.05639331,
       0.05878296, 0.0587356 , 0.06543606, 0.06448335, 0.06624662,
       0.06941526, 0.07117856, 0.07625785, 0.07995811, 0.08102658,
       0.0814003 , 0.07507357, 0.07354712, 0.07377347, 0.07972122,
       0.08139505, 0.08499001, 0.09346428, 0.10477032, 0.10265438,
       0.10265964, 0.11083389, 0.11339195, 0.11157078, 0.10990752,
       0.10817582, 0.11392883, 0.11787649, 0.11952923, 0.11544473])

We see that there are 60 values for every predicted `y_value`.

## Create a LSTM Network

In [0]:
def gpu_lstm(X_train, X_test, y_train, y_test, window_size):
  with tf.device('/device:GPU:0'):

    #Build the LSTM model
    model = Sequential()
    model.add(tf.keras.layers.LSTM(50, return_sequences=True, input_shape= (window_size, 1)))
    model.add(tf.keras.layers.LSTM(50, return_sequences= False))
    model.add(tf.keras.layers.Dense(25))
    model.add(tf.keras.layers.Dense(1))

    # Compile the model
    # model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Start the training process
    model.fit(x=X_train, y=y_train, batch_size=1, epochs=5)

    model.summary()

    # Evaluate the model performance with test data
    # test_loss = model.evaluate(x=X_test, y=y_test,verbose=0)

    return model

In [128]:
lstm_model = gpu_lstm(X_train, X_test, y_train, y_test, window_size=window_size)

Train on 1582 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_22 (LSTM)               (None, 60, 50)            10400     
_________________________________________________________________
lstm_23 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dense_20 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 26        
Total params: 31,901
Trainable params: 31,901
Non-trainable params: 0
_________________________________________________________________


In [0]:
#Get the models predicted price values 
predictions = lstm_model.predict(X_test)
predictions = scaler.inverse_transform(predictions)

In [0]:
preds_length = list(range(0, len(predictions)))

In [131]:
print(len(preds_length))
print(len(predictions))

410
410


In [138]:
# Plot 
fig = px.line()

# Only thing I figured is - I could do this 
fig.add_scatter(x=train.index, y=train['Close'], mode='lines', name='Train Data')
fig.add_scatter(x=valid.index, y=valid['Predictions'], mode='lines', name='Predictions')
fig.add_scatter(x=valid.index, y=valid['Close'], mode='lines', name='Real Values')

# Show plot 
fig.show();

We will create functions to be able to test different windowsizes.

In [0]:
def create_dataset(df, filter = 'Close', ratio = .8, window_size=60):

  #Create a new dataframe with only the 'Close column
  data = df.filter([filter])
  #Convert the dataframe to a numpy array
  dataset = data.values
  #Get the number of rows to train the model on
  training_data_len = math.ceil( len(dataset) * .8 )

  # Set window size
  window_size = window_size

  # Scale Data
  scaler = MinMaxScaler((0, 1))
  dataset = scaler.fit_transform(dataset)

  train_data = dataset[0:training_data_len]
  test_data = dataset[training_data_len - window_size:]

  print(train_data.shape)
  print(test_data.shape)
  print('Train & Test Data length {} and {}, combined length: {} - total length of original data: {}'.format(train_data.shape[0], test_data.shape[0],
                                                                                                    (train_data.shape[0] + test_data.shape[0]),
                                                                                                    len(dataset)))
  print('The resulting difference must match the windowsize ({})'.format(window_size))

  # Split the data into X_train and y_train data sets
  X_train = []
  y_train = []

  for i in range(window_size, len(train_data)):
    X_train.append(train_data[i-window_size:i])
    y_train.append(train_data[i])

  X_train = np.array(X_train)
  y_train = np.array(y_train)

  print('X_train: {}'.format(X_train.shape))
  print('y_train: {}'.format(y_train.shape))

  # Create the data sets X_test and y_test
  X_test = []
  y_test = []

  for i in range(window_size, len(test_data)):
    X_test.append(test_data[i-window_size:i])
    y_test.append(test_data[i])

  X_test = np.array(X_test)
  y_test = np.array(y_test)

  print('X_test: {}'.format(X_test.shape))
  print('y_test: {}'.format(y_test.shape))

  return X_train, X_test, y_train, y_test

In [146]:
X_train, X_test, y_train, y_test = create_dataset(df, window_size=30)

(1642, 1)
(440, 1)
Train & Test Data length 1642 and 440, combined length: 2082 - total length of original data: 2052
The resulting difference must match the windowsize (30)
X_train: (1612, 30, 1)
y_train: (1612, 1)
X_test: (410, 30, 1)
y_test: (410, 1)


In [147]:
lstm_model = gpu_lstm(X_train, X_test, y_train, y_test, window_size=30)

Train on 1612 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_26 (LSTM)               (None, 30, 50)            10400     
_________________________________________________________________
lstm_27 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dense_24 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_25 (Dense)             (None, 1)                 26        
Total params: 31,901
Trainable params: 31,901
Non-trainable params: 0
_________________________________________________________________


In [0]:
def plot_results(lstm_model, X_test, y_test, name, data = df.filter(['Close'])):
  #Get the models predicted price values 
  predictions = lstm_model.predict(X_test)
  predictions = scaler.inverse_transform(predictions)

  preds_length = list(range(0, len(predictions)))

  loss = lstm_model.evaluate(X_test, y_test)
  losses[name] = loss

  #Plot the data
  train = data[:training_data_len]
  valid = data[training_data_len:]
  valid['Predictions'] = predictions

  # Plot 
  fig = px.line()

  # Only thing I figured is - I could do this 
  fig.add_scatter(x=train.index, y=train['Close'], mode='lines', name='Train Data')
  fig.add_scatter(x=valid.index, y=valid['Predictions'], mode='lines', name='Predictions')
  fig.add_scatter(x=valid.index, y=valid['Close'], mode='lines', name='Real Values')

  # Show plot 
  fig.show();

In [151]:
plot_results(lstm_model, X_test)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [161]:
windows = [20, 40, 60, 80, 100]
losses = {}

for window in windows:
  X_train, X_test, y_train, y_test = create_dataset(df, window_size=window)
  lstm_model = gpu_lstm(X_train, X_test, y_train, y_test, window_size=window)
  plot_results(lstm_model, X_test, y_test, name=window)

(1642, 1)
(430, 1)
Train & Test Data length 1642 and 430, combined length: 2072 - total length of original data: 2052
The resulting difference must match the windowsize (20)
X_train: (1622, 20, 1)
y_train: (1622, 1)
X_test: (410, 20, 1)
y_test: (410, 1)
Train on 1622 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_40 (LSTM)               (None, 20, 50)            10400     
_________________________________________________________________
lstm_41 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dense_38 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_39 (Dense)             (None, 1)                 26        
Total params: 31,901
Trainable params: 31,901
Non-trainab



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(1642, 1)
(450, 1)
Train & Test Data length 1642 and 450, combined length: 2092 - total length of original data: 2052
The resulting difference must match the windowsize (40)
X_train: (1602, 40, 1)
y_train: (1602, 1)
X_test: (410, 40, 1)
y_test: (410, 1)
Train on 1602 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_42 (LSTM)               (None, 40, 50)            10400     
_________________________________________________________________
lstm_43 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dense_40 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_41 (Dense)             (None, 1)                 26        
Total params: 31,901
Trainable params: 31,901
Non-trainab



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(1642, 1)
(470, 1)
Train & Test Data length 1642 and 470, combined length: 2112 - total length of original data: 2052
The resulting difference must match the windowsize (60)
X_train: (1582, 60, 1)
y_train: (1582, 1)
X_test: (410, 60, 1)
y_test: (410, 1)
Train on 1582 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_44 (LSTM)               (None, 60, 50)            10400     
_________________________________________________________________
lstm_45 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dense_42 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_43 (Dense)             (None, 1)                 26        
Total params: 31,901
Trainable params: 31,901
Non-trainab



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(1642, 1)
(490, 1)
Train & Test Data length 1642 and 490, combined length: 2132 - total length of original data: 2052
The resulting difference must match the windowsize (80)
X_train: (1562, 80, 1)
y_train: (1562, 1)
X_test: (410, 80, 1)
y_test: (410, 1)
Train on 1562 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_46 (LSTM)               (None, 80, 50)            10400     
_________________________________________________________________
lstm_47 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dense_44 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_45 (Dense)             (None, 1)                 26        
Total params: 31,901
Trainable params: 31,901
Non-trainab



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(1642, 1)
(510, 1)
Train & Test Data length 1642 and 510, combined length: 2152 - total length of original data: 2052
The resulting difference must match the windowsize (100)
X_train: (1542, 100, 1)
y_train: (1542, 1)
X_test: (410, 100, 1)
y_test: (410, 1)
Train on 1542 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_48 (LSTM)               (None, 100, 50)           10400     
_________________________________________________________________
lstm_49 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dense_46 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_47 (Dense)             (None, 1)                 26        
Total params: 31,901
Trainable params: 31,901
Non-trai



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [162]:
losses

{20: 0.0002484219316279561,
 40: 0.0002701372257433832,
 60: 0.0002398072881056223,
 80: 0.00022773387921383467,
 100: 0.0006739147450411465}

In [170]:
pd.Series(losses).index

Int64Index([20, 40, 60, 80, 100], dtype='int64')

In [172]:
px.bar(x=pd.Series(losses).index, y=pd.Series(losses), labels={'x':'window size', 'y':'loss'})

As we can see, if the window size is too big the loss increases which makes sense because data which  is too old may give a wrong intention.
According to the plot a window size  between 20-80 is ok - as small window sizes should run faster, we will choose 20-30.