### Stanford Paper on LSTM Neural Networks for stock prices volatility prediction

http://cs230.stanford.edu/projects_fall_2019/reports/26254244.pdf

### Tutorial for building an LSTM neural network for time-series prediction

https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/

### Importing the required libraries

In [6]:
import pandas as pd
from pandas.plotting import autocorrelation_plot
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

%matplotlib inline

from tensorflow import keras
from tensorflow.keras import layers

# Datetime

import datetime

# Scikit-Learn

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score

# GARCH model

from arch import arch_model

# Keras

from keras.models import Model
from keras.layers import *
from tensorflow.keras.utils import plot_model

# Tensorflow

import tensorflow as tf

### Reading the csv file with the financial data

In [47]:
df = pd.read_csv('processed_stock_data.csv')
# 只保留需要的列
df = df[['股票代码_Stkcd', '日期_Date', '开盘价_Oppr', '收盘价_Clpr', '最高价_Hipr', '最低价_Lopr', '成交量_Trdvol']]

# 重命名列名
df = df.rename(columns={
    '股票代码_Stkcd': 'StockCode',
    '日期_Date': 'Date', 
    '开盘价_Oppr': 'Open',
    '收盘价_Clpr': 'Close',
    '最高价_Hipr': 'High',
    '最低价_Lopr': 'Low',
    '成交量_Trdvol': 'Volume'
})
df.set_index('Date', inplace=True)
df.index = pd.to_datetime(df.index)
print (df.head())
print (df.shape)

            StockCode  Open  Close  High   Low      Volume
Date                                                      
2020-07-01         68  3.52   3.44  3.52  3.37  25483461.0
2020-07-02         68  3.40   3.45  3.45  3.38  17180117.0
2020-07-03         68  3.45   3.42  3.46  3.38  17171744.0
2020-07-06         68  3.42   3.64  3.76  3.42  48155956.0
2020-07-07         68  3.71   3.61  3.79  3.53  42723215.0
(131424, 6)


As we see here, we have 254 columns, corresponding to the 254 business days for which we have financial data and 10 columns, which are the 10 financial indicators we have.

# Data Cleaning

In [11]:
# df = df.rename(columns={
    
#     df.columns[0]: 'Date',
#     df.columns[1]:'Open',
#     df.columns[2]: 'Close',
#     df.columns[3]:'High',
#     df.columns[4]:'Low',
#     df.columns[5]: 'Volume',
#     df.columns[6]: 'RSI14',
#     df.columns[7]:'SMA14',
#     df.columns[8]: 'EMA14',
#     df.columns[9]:'MACD_sl',
#     df.columns[10]:'MACD_h'

# })

# print (df.head())

### Converting the Date column into a Date

In [13]:
# df['Date'] =  pd.to_datetime(df['Date'], format='%Y%m%d')

# Feature Engineering

## Logarithmic Features

### Log Returns

In [17]:
df['Log_Returns'] = np.log(df.Close) - np.log(df.Close.shift(1))

print(df.head())
print(df.tail())

            StockCode  Open  Close  High   Low      Volume  Log_Returns
Date                                                                   
2020-07-01         68  3.52   3.44  3.52  3.37  25483461.0          NaN
2020-07-02         68  3.40   3.45  3.45  3.38  17180117.0     0.002903
2020-07-03         68  3.45   3.42  3.46  3.38  17171744.0    -0.008734
2020-07-06         68  3.42   3.64  3.76  3.42  48155956.0     0.062343
2020-07-07         68  3.71   3.61  3.79  3.53  42723215.0    -0.008276
            StockCode   Open  Close   High    Low     Volume  Log_Returns
Date                                                                     
2023-12-25     873122  27.60  28.44  28.99  27.60  2348147.0     0.015592
2023-12-26     873122  28.57  31.73  31.80  28.02  5269096.0     0.109466
2023-12-27     873122  31.00  30.03  31.47  29.20  3044626.0    -0.055066
2023-12-28     873122  30.29  29.18  31.58  29.18  2689421.0    -0.028713
2023-12-29     873122  28.40  29.21  29.80  28.20  2

### Log Trading Range

In [19]:
df['Log_Trading_Range'] = np.log(df.High) - np.log(df.Low)

print(df.head())

            StockCode  Open  Close  High   Low      Volume  Log_Returns  \
Date                                                                      
2020-07-01         68  3.52   3.44  3.52  3.37  25483461.0          NaN   
2020-07-02         68  3.40   3.45  3.45  3.38  17180117.0     0.002903   
2020-07-03         68  3.45   3.42  3.46  3.38  17171744.0    -0.008734   
2020-07-06         68  3.42   3.64  3.76  3.42  48155956.0     0.062343   
2020-07-07         68  3.71   3.61  3.79  3.53  42723215.0    -0.008276   

            Log_Trading_Range  
Date                           
2020-07-01           0.043548  
2020-07-02           0.020499  
2020-07-03           0.023393  
2020-07-06           0.094778  
2020-07-07           0.071068  


### Log Volume Change

In [21]:
df['Log_Volume_Change'] = np.log(df.Volume) - np.log(df.Volume.shift(1))

print(df.head())

            StockCode  Open  Close  High   Low      Volume  Log_Returns  \
Date                                                                      
2020-07-01         68  3.52   3.44  3.52  3.37  25483461.0          NaN   
2020-07-02         68  3.40   3.45  3.45  3.38  17180117.0     0.002903   
2020-07-03         68  3.45   3.42  3.46  3.38  17171744.0    -0.008734   
2020-07-06         68  3.42   3.64  3.76  3.42  48155956.0     0.062343   
2020-07-07         68  3.71   3.61  3.79  3.53  42723215.0    -0.008276   

            Log_Trading_Range  Log_Volume_Change  
Date                                              
2020-07-01           0.043548                NaN  
2020-07-02           0.020499          -0.394277  
2020-07-03           0.023393          -0.000487  
2020-07-06           0.094778           1.031180  
2020-07-07           0.071068          -0.119702  


## Volatility

### Previous 10-day Volatility

In [24]:
df['Previous_10_Day_Volatility'] = df['Log_Returns'].rolling(window = 10).std()

print(df.tail())

            StockCode   Open  Close   High    Low     Volume  Log_Returns  \
Date                                                                        
2023-12-25     873122  27.60  28.44  28.99  27.60  2348147.0     0.015592   
2023-12-26     873122  28.57  31.73  31.80  28.02  5269096.0     0.109466   
2023-12-27     873122  31.00  30.03  31.47  29.20  3044626.0    -0.055066   
2023-12-28     873122  30.29  29.18  31.58  29.18  2689421.0    -0.028713   
2023-12-29     873122  28.40  29.21  29.80  28.20  2472944.0     0.001028   

            Log_Trading_Range  Log_Volume_Change  Previous_10_Day_Volatility  
Date                                                                          
2023-12-25           0.049135          -0.163689                    0.095524  
2023-12-26           0.126548           0.808232                    0.103461  
2023-12-27           0.074866          -0.548481                    0.091971  
2023-12-28           0.079040          -0.124052                 

### Previous 30-day Volatility

In [26]:
df['Previous_30_Day_Volatility'] = df['Log_Returns'].rolling(window = 30).std()

print(df.head())

            StockCode  Open  Close  High   Low      Volume  Log_Returns  \
Date                                                                      
2020-07-01         68  3.52   3.44  3.52  3.37  25483461.0          NaN   
2020-07-02         68  3.40   3.45  3.45  3.38  17180117.0     0.002903   
2020-07-03         68  3.45   3.42  3.46  3.38  17171744.0    -0.008734   
2020-07-06         68  3.42   3.64  3.76  3.42  48155956.0     0.062343   
2020-07-07         68  3.71   3.61  3.79  3.53  42723215.0    -0.008276   

            Log_Trading_Range  Log_Volume_Change  Previous_10_Day_Volatility  \
Date                                                                           
2020-07-01           0.043548                NaN                         NaN   
2020-07-02           0.020499          -0.394277                         NaN   
2020-07-03           0.023393          -0.000487                         NaN   
2020-07-06           0.094778           1.031180                         N

### Next 10-days volatility

In [28]:
df['Next_10_Days_Volatility'] = df['Log_Returns'].iloc[::-1].rolling(window = 10).std().iloc[::-1]

print(df.head())

            StockCode  Open  Close  High   Low      Volume  Log_Returns  \
Date                                                                      
2020-07-01         68  3.52   3.44  3.52  3.37  25483461.0          NaN   
2020-07-02         68  3.40   3.45  3.45  3.38  17180117.0     0.002903   
2020-07-03         68  3.45   3.42  3.46  3.38  17171744.0    -0.008734   
2020-07-06         68  3.42   3.64  3.76  3.42  48155956.0     0.062343   
2020-07-07         68  3.71   3.61  3.79  3.53  42723215.0    -0.008276   

            Log_Trading_Range  Log_Volume_Change  Previous_10_Day_Volatility  \
Date                                                                           
2020-07-01           0.043548                NaN                         NaN   
2020-07-02           0.020499          -0.394277                         NaN   
2020-07-03           0.023393          -0.000487                         NaN   
2020-07-06           0.094778           1.031180                         N

In [29]:
df.dropna(inplace = True)
print(df.head())

            StockCode  Open  Close  High   Low      Volume  Log_Returns  \
Date                                                                      
2020-08-12         68  3.66   3.70  3.77  3.62  11132835.0     0.000000   
2020-08-13         68  3.69   3.69  3.76  3.67   9055000.0    -0.002706   
2020-08-14         68  3.69   3.69  3.72  3.62  10875834.0     0.000000   
2020-08-17         68  3.70   3.76  3.79  3.68  11417667.0     0.018792   
2020-08-18         68  3.76   3.89  3.96  3.72  33591392.0     0.033990   

            Log_Trading_Range  Log_Volume_Change  Previous_10_Day_Volatility  \
Date                                                                           
2020-08-12           0.040601          -0.112175                    0.025826   
2020-08-13           0.024227          -0.206582                    0.025869   
2020-08-14           0.027250           0.183226                    0.025169   
2020-08-17           0.029453           0.048619                    0.0157

In [30]:
df.to_csv('output.csv')

## GARCH 

In [32]:
## Garch predictions for the entire dataset of SPX

### Building a new dataframe for splitting the dataframe in test and training data

In [49]:

X = df[df.first_valid_index():df.last_valid_index()- datetime.timedelta(1500)]

print (X.tail())

KeyError: "Cannot get left slice bound for non-unique label: Timestamp('2020-07-01 00:00:00')"

### Building a GARCH model

In [None]:
GARCH_model = arch_model(X['Log_Returns'], vol='Garch', p=1, q=1, rescale=False)

x = GARCH_model.fit(disp='off')

### Making rolling predictions using the GARCH Model

In [None]:
model_fit = GARCH_model.fit(disp='off')
GARCH_rolling_predictions = pd.DataFrame(
    model_fit.forecast(horizon=len(X) - 50).variance,
    index=X.index[50:],
    columns=['GARCH_rolling_predictions']
)

print(GARCH_rolling_predictions.head())

### Making forward-looking predictions using the GARCH Model

In [None]:
GARCH_forward_looking_predictions = GARCH_model.predict(h=1500)

print(GARCH_forward_looking_predictions.head())

### Renaming one of the columns of the GARCH Model Dataframe

In [None]:
GARCH_rolling_predictions.rename(columns={'Series':'GARCH_rolling_predictions'}, inplace =True)

GARCH_forward_looking_predictions.rename(columns={'Log_Returns':'GARCH_forward_looking_predictions'}, inplace =True)

### Adding the new feature to the current dataframe

In [None]:
df = pd.concat([df, GARCH_rolling_predictions], axis=1)
df = pd.concat([df, GARCH_forward_looking_predictions], axis=1)

### Replacing Nan values with 0s for the GARCH Predictions columns

Rational for this from Keras's creator:

https://stackoverflow.com/questions/52570199/multivariate-lstm-with-missing-values

In [None]:
df['GARCH_forward_looking_predictions'] =  df['GARCH_forward_looking_predictions'].fillna(0)
df['GARCH_rolling_predictions'] =  df['GARCH_rolling_predictions'].fillna(0)

### Checking the results of our transformations

In [None]:
print (df.tail())

In [None]:
# Notes

In [None]:
# Better to predict VIX prices than realized volatility of SPX

In [None]:
#Use it to predict VIX

Look at relationship of (5, 10, 30) realized volatility of SPX versus VIX prices (Plot)

Calculate in Excel

### Building a new dataframe for splitting the dataframe in test and training data

### Using dropna on several columns

In [None]:
def list_columns_to_dropna(df, column_list):
    
    for column in column_list:
        
        df = df[df[column].notna()]
        
    return df

In [None]:
column_list = ['Open', 'Log_Returns','Previous_10_Day_Volatility','Next_10_Days_Volatility','Previous_30_Day_Volatility']

df = list_columns_to_dropna(df, column_list)

print (df.head())
print (df.shape)

### Exporting the final dataframe to csv

In [None]:
df.to_csv('output.csv')

# Exploratory Data Analysis (EDA)

### Plotting out the S&P 500 Prices from 1990 to 2020

In [None]:
df['Close'].plot(label = 'S&P 500', figsize =(16,8), title = 'S&P 500 Stock Prices from 1990 to 2020')
plt.show()

### Plotting out the 10-days forward looking volatility of  S&P 500 Prices from 1990 to 2020

In [None]:
df['Next_10_Days_Volatility'].plot(label = 'S&P 500', figsize =(16,8), title = '10-days forward looking volatility of  S&P 500 Prices from 1990 to 2020')
plt.show()

# Feature Selection

### Pearson Correlation Matrix

In [None]:
def build_pearson_correlation_matrix_of_dataframe(size_x,size_y,dataframe,correlation_target,correlation_minimum_criteria):

    # Using Pearson Correlation

    plt.figure(figsize=(size_x,size_y))
    cor = dataframe.corr()
    sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
    plt.savefig('Images/pearson_correlation_matrix.png', bbox_inches='tight')
    plt.show()

    # Correlation with output variable

    target = abs(cor[correlation_target])

    #Selecting and printing highly correlated features

    relevant_features = target[target>correlation_minimum_criteria]
    print(relevant_features)

In [None]:
build_pearson_correlation_matrix_of_dataframe(20,20,df,"Next_10_Days_Volatility",0.2)

# Data Preparation

### Splitting the data into train and test sets

In [None]:
X = np.array(df.drop(["Next_10_Days_Volatility",'Low','High','Close','Open','Volume','MACD_h','MACD_sl','RSI14','SMA14','EMA14'], axis=1).values)
y = np.array(df["Next_10_Days_Volatility"].values).reshape(-1, 1) 

test_size = 1500

X_train = X[test_size:,]
X_test = X[:test_size,]
y_train = y[test_size:]
y_test = y[:test_size]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

### Defining a function to get lagged versions of the features

This function increases the number of features of the dataset by "lagging" every feature.

In [None]:
def get_lagged(x, y, t, s):
    
    lagged = []
    
    for i in range(x.shape[0] - t):
        
        if i == x.shape[0] - t:
            
            break
            
        for k in range(t):
            
            if k < t:
                
                lagged.append(x[i+k])
                
    lagged = np.array(lagged).reshape(s)
    
    return lagged, y[:lagged.shape[0],]

In [None]:
N = 30

X_train, y_train = get_lagged(X_train, y_train, N, (X_train.shape[0]-N, N*X_train.shape[1]))
X_test, y_test = get_lagged(X_test, y_test, N, (X_test.shape[0]-N, N*X_test.shape[1]))

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

In [None]:
T = 4

X_train, y_train = get_lagged(X_train, y_train, T, (X_train.shape[0]-T, T, X_train.shape[1]))
X_test, y_test = get_lagged(X_test, y_test, T, (X_test.shape[0]-T, T, X_test.shape[1]))

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

# LSTM

### Building the LSTM Model

In [None]:
inputLSTM = Input(shape=(X_train.shape[1], X_train.shape[2]))
y = LSTM(200, return_sequences=True)(inputLSTM)
y = LSTM(200)(y)
y = Dense(1)(y)
lstm = Model(inputs=inputLSTM, outputs=y)
lstm.summary()

### Plotting out the LSTM network

In [None]:
plot_model(lstm, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

### Declaring the parameters of the LSTM Model

In [None]:
lstm.compile(optimizer=keras.optimizers.Adam(lr=0.01),loss=tf.keras.losses.MeanSquaredError(),metrics=[tf.keras.metrics.RootMeanSquaredError()])

Get data for SPX where you have Open Close and Volumen (1960)

### Fitting the LSTM Model

In [None]:
hist = lstm.fit(X_train, y_train,batch_size=700,epochs=60,verbose=1,validation_split=0.3,shuffle=False)

### Plotting the RSME for training and validation

In [None]:
plt.plot(hist.history['root_mean_squared_error'])
plt.plot(hist.history['val_root_mean_squared_error'])
plt.title('model train vs validation loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

### Printing out the predictions made by the model

In [None]:
for ind, i in enumerate(lstm.predict(X_test)):
    
    print('Prediction: ' + str('{:.2f}'.format(round(100 * round(i[0], 4),3))) + ',    ' + 'Actual Value: ' + str('{:.2f}'.format(round(100 * round(y_test[ind][0],4),2))))

### Printing out the results of the model

In [None]:
def printing_out_results_of_a_model(model,X_test,y_test):
    
    y_pred = model.predict(X_test)
    
    # Print the R2 score 

    print ("R2 score:\n") 
    print (('{:.2f}'.format((100*(r2_score(y_test, y_pred))))) + " %")

    print ("\n")
    
    # Print the RMSE

    print ("RMSE:\n")
    print (math.sqrt(mean_squared_error(y_test, y_pred)))
    
    print ('\n')
    
    # Print the mean squared error
    
    print ("Mean Squared Error:\n")
    print (mean_squared_error(y_test, y_pred))

In [None]:
printing_out_results_of_a_model(lstm, X_test, y_test)