# Recurrent Neural Network - Long short-term memory (LSTM) 

The purpose of this study was to build an LSTM model to predict stock prices.

In [189]:
import pandas as pd # to aide in loading and manipulating our datasets.
import numpy as np # for scientific computation
import matplotlib.pyplot as plt # for plotting graphs
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM

## PART 1 - Data Handling

In [190]:
# Load the training data (2012-2018, per the file name) into a DataFrame.
dataset_train = pd.read_csv('dataset_2012_2018.csv')

In [191]:
# Drop every row that contains at least one missing value.
dataset_train = dataset_train.dropna()
# Reassigning the result is preferred over dataset_train.dropna(inplace=True):
# the outcome is the same, but the statement stays explicit and chainable.

In [192]:
# DataFrame.shape is a (rows, columns) tuple; shown here as a quick check of
# how much data is left after dropping the rows with missing values.
dataset_train.shape

(1759, 7)

In [193]:
# Preview the first five rows to see which columns the dataset contains.
dataset_train.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2012-01-03,25.370001,26.34,25.32,26.110001,23.034264,12754300
1,2012-01-04,25.91,26.58,25.91,26.459999,23.343037,12351500
2,2012-01-05,26.309999,26.370001,25.870001,26.110001,23.034264,8568600
3,2012-01-06,26.25,26.25,25.639999,25.690001,22.663742,8532100
4,2012-01-09,26.08,26.969999,25.93,26.879999,23.713554,26046600


In [194]:
# Keep only the column at position 1 ("Open" in the preview above) for modeling.
# NOTE: this rebinds dataset_train from a DataFrame to a NumPy array, so the
# cell cannot be re-run without reloading the CSV first.
dataset_train = dataset_train.iloc[:, 1:2].values 
# The 1:2 slice keeps the selection two-dimensional: df.iloc[:, 1:2] is a
# DataFrame whereas df.iloc[:, 1] would be a Series. .values then converts it
# to an ndarray of shape (rows, 1).
# "Open" is the starting price and "Close" the final price of a stock on a given trading day.

In [195]:
# Display the selected values (dataset_train is now a single-column NumPy array).
dataset_train 


array([[25.370001],
       [25.91    ],
       [26.309999],
       ...,
       [12.      ],
       [12.56    ],
       [13.01    ]])

#### Feature Scaling
 * We have to scale our data for optimal performance. 
 * Scikit MinMaxScaler will scale our dataset to numbers between zero and one.

In [196]:
# Rescale the training prices linearly into the range [0, 1].
# fit_transform learns the scaling from the training data and applies it in one
# step; the fitted normalizer is reused later in the notebook to scale the test
# inputs and to convert the predictions back to prices.
normalizer = MinMaxScaler(feature_range=(0,1))
train_set_scaled = normalizer.fit_transform(dataset_train) 

#### Creating Data with Timesteps
* LSTMs expect the data in a specific format: a 3D array.
* We start by creating windows of 45 timesteps and converting them into an array using NumPy.
* Next, we reshape the data into a 3D array of shape (number of samples, 45 timesteps, 1 feature per step).

In [197]:
# Build the supervised-learning samples from the scaled training series.
# Each sample is a window of `timesteps` consecutive scaled Open prices and its
# target is the scaled Open price of the day right after the window.
timesteps = 45  # look-back window length, in rows (trading days)
x_train = []
y_real = []

# Iterate up to the actual series length instead of a hard-coded row count, so
# the cell keeps working when the CSV (or the dropna result) changes size.
for i in range(timesteps, len(train_set_scaled)):
    # Window rows [i - timesteps, i), column 0:
    #   first iteration  -> rows 0..44
    #   second iteration -> rows 1..45
    #   and so on
    x_train.append(train_set_scaled[i - timesteps:i, 0])
    # The value right after the window is what the model has to predict.
    y_real.append(train_set_scaled[i, 0])

x_train, y_real = np.array(x_train), np.array(y_real)  # lists -> NumPy arrays
# Keras LSTM layers take 3D input (samples, timesteps, features): add a
# trailing feature axis of size 1 to the 2D (samples, timesteps) array.
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

## PART 2 - Data Analysis - Building the LSTM

In [198]:
# Keras building blocks needed for the LSTM network.
# NOTE(review): these four names were already imported from tensorflow.keras in
# the first code cell; the imports below rebind them to the standalone keras
# package. Consider keeping a single source for them, in the top import cell.

from keras.models import Sequential #Sequential for initializing the neural network
from keras.layers import Dense #Dense for adding a densely connected neural network layer
from keras.layers import LSTM #LSTM for adding the Long Short-Term Memory layer
from keras.layers import Dropout #Dropout for adding dropout layers that prevent overfitting

### Building the LSTM
* We add LSTM layers, each followed by a Dropout layer to prevent overfitting.
* The LSTM layers take the following arguments:
* 50 units, which is the dimensionality of the output space
* return_sequences=True, which makes the layer return the full output sequence instead of only the last output (needed when another LSTM layer is stacked on top; the last LSTM layer omits it because only the final output is used)
* input_shape (first layer only) as the shape of one training sample: (timesteps, features).

In [None]:
# Stacked LSTM regressor: three LSTM layers of 50 units, each followed by a
# Dropout layer with rate 0.3, and a final Dense layer with a single unit.
# Dropout usually makes the training error slightly worse but is used here to
# get better generalization on unseen data.
regressor = Sequential([
    # The first two LSTM layers return the full sequence so that the next LSTM
    # layer receives one vector per timestep.
    LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)),
    Dropout(0.3),
    LSTM(units=50, return_sequences=True),
    Dropout(0.3),
    # The last LSTM layer returns only its final output: the history is no
    # longer needed, just the result.
    LSTM(units=50),
    Dropout(0.3),
    # One linear output unit, because the target is a continuous value.
    Dense(units=1, activation='linear'),
])

# Train with the rmsprop optimizer and mean squared error as the loss.
# MSE averages the squared errors, so a single large error has a strong impact
# on it; mean absolute error is therefore tracked as an additional metric.
regressor.compile(
    optimizer='rmsprop',
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)
regressor.fit(x_train, y_real, epochs=70, batch_size=32)

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70

### Predicting Future Stock using the Test Set
### In order to predict future stock prices, we will need to do a couple of steps: 
* Step 1) Load the test set, get rid of null values and choose the Open column 
* Step 2) Merge the training set and the test set on the 0 axis.
* Step 3) Set the time step as 45 (as done previously) and reshape
* Step 4) Use MinMaxScaler to transform and reshape the dataset as done previously
* Step 5) After making the predictions we will have to use inverse_transform to get back the stock prices in normal readable format.

#### Step 1) Load the test set, get rid of null values and choose the Open column 

In [None]:
# Load the test data (2019, per the file name) that the predictions are compared against.
dataset_test = pd.read_csv('dataset_2019.csv')

In [None]:
# Drop every row that contains at least one missing value, as done for the training set.
dataset_test = dataset_test.dropna()

In [None]:
# (rows, columns) of the test set after dropping the rows with missing values.
dataset_test.shape

In [None]:
# Preview the first five rows of the test set.
dataset_test.head()

In [None]:
# Take the column at position 1 of the test set as a (rows, 1) NumPy array;
# these are the real prices the predictions are plotted against.
# assumes the test CSV has the same column layout as the training CSV, so that
# position 1 is "Open" — TODO confirm
real_stock_price = dataset_test.iloc[:, 1:2].values  
# The 1:2 slice keeps the selection two-dimensional: df.iloc[:, 1:2] is a
# DataFrame whereas df.iloc[:, 1] would be a Series.
# "Open" is the starting price and "Close" the final price of a stock on a given trading day.

In [None]:
# Display the test set (unlike dataset_train, it is still a DataFrame at this point).
dataset_test

In [None]:
# The "Open" column of the test set, as a Series.
dataset_test.Open

In [None]:
# Exploration: wrap the first row of the training array in a DataFrame.
# NOTE(review): dataset_train[0] is only the first row of the array, so this
# shows a single value rather than the whole column — looks like a leftover experiment.
pd.DataFrame({'Open':dataset_train[0]})

In [None]:
# Exploration: the training array viewed as a DataFrame with one unnamed column.
pd.DataFrame(dataset_train)

#### Step 2) Merge the training set and the test set on the 0 axis.

In [None]:
# Stack the training prices and the test "Open" prices vertically (axis=0), so
# that the look-back window of the first test days can reach into the training data.
# NOTE(review): this mixes a DataFrame built from an unnamed array with a Series
# named "Open" — verify that dataset_totality ends up with a single column and no NaN.
dataset_totality = pd.concat((pd.DataFrame(dataset_train), dataset_test['Open']), axis = 0)
 

#### Step 3) Set the time step as 45 (as done previously) and reshape

In [None]:
# Take the test rows plus the 45 rows immediately before them.
# The sliding windows built further down are 45 steps long, so the look-back
# must be 45 as well: with a shorter look-back the first window would already
# contain test rows, and the first prediction would no longer correspond to
# the first test day (shifting the predicted curve against the real one).
inputs = dataset_totality[len(dataset_totality) - len(dataset_test) - 45:].values
# Make sure the result is a single-column 2D array, the layout the scaler was fitted on.
inputs = inputs.reshape(-1, 1)

#### Step 4) Use MinMaxScaler to transform and reshape the dataset as done previously

In [None]:
# Scale the inputs with the normalizer that was fitted on the training set.
inputs = normalizer.transform(inputs) # use transform only here, not fit_transform: re-fitting would change the scale the model was trained on
# The fitted scaling is applied to the test inputs; nothing is learned from them.

In [None]:
# Check the dimensions of the scaled inputs: (number of input rows, 1).
inputs.shape

In [None]:
#### Build the 45-step sliding windows for the test inputs (Step 5, inverse_transform, is applied after the prediction further below)

In [None]:
# Build the sliding windows the model will predict from.
# Each sample holds 45 consecutive scaled prices taken from `inputs`; the model
# predicts the value that follows the window.
x_test = []
# Stop at the real length of `inputs` instead of a hard-coded row count, so
# every window is complete and no available day is skipped when the test set
# changes size.
for i in range(45, len(inputs)):
    x_test.append(inputs[i-45:i, 0])  # window rows [i-45, i), column 0

x_test = np.array(x_test)
# Keras LSTM layers take a 3D NumPy array (samples, timesteps, features): add a
# trailing feature axis of size 1 to the 2D (samples, timesteps) array.
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

#### Step 5) After making the predictions we will have to use inverse_transform to get back the stock prices in normal readable format. 

In [None]:
# Predict on the test windows and map the scaled outputs straight back to
# prices with the scaler that was fitted on the training data.
predicted_stock_price = normalizer.inverse_transform(regressor.predict(x_test))

## PART 3 - Evaluation of the Analysis - Plotting results


In [None]:
# Overlay the real and the predicted opening prices on a single chart.
for series, colour, caption in (
    (real_stock_price, 'purple', 'Petrobras Stock Price'),
    (predicted_stock_price, 'green', 'Predicted Petrobras Stock Price'),
):
    plt.plot(series, color=colour, label=caption)

# Title, axis labels and legend so the figure can be read on its own.
plt.title('Petrobras Stock Price Prediction')
plt.xlabel('Time')
plt.ylabel('Petrobras Stock Price')
plt.legend()
plt.show()
 

In [None]:
# NOTICE:
# If I increase the range in the loop ==>> I can predict shorter periods of time ==>> but with MORE accuracy   
# If I decrease the range in the loop ==>> I can predict longer periods of time ==>> but with LESS accuracy 