In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime
import itertools
import random

from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.metrics import mean_squared_error

2024-05-20 17:49:34.120495: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Ticker symbols to evaluate, grouped into batches of 9-10 symbols.
# The experiment cell iterates batch by batch, uses each symbol as a column
# name of `data_original`, and writes a results CSV after every batch whose
# file name contains the batch's first and last ticker.
# NOTE(review): presumably this is the asset universe of the dataset in
# `data_file` - confirm that every symbol below exists as a column there.
tickers_split = [
 ['ABBV', 'ACN', 'AEP', 'AIZ', 'ALLE', 'AMAT', 'AMP', 'AMZN', 'AVB'],
 ['AVY', 'AXP', 'BDX', 'BF-B', 'BMY', 'BR', 'CARR', 'CDW', 'CE', 'CHTR'],
 ['CNC', 'CNP', 'COP', 'CTAS', 'CZR', 'DG', 'DPZ', 'DXC', 'META', 'FTV'],
 ['GOOG', 'GPC', 'HIG', 'HST', 'JPM', 'KR', 'OGN', 'PG', 'PPL', 'PRU'],
 ['PYPL', 'ROL', 'ROST', 'UNH', 'URI', 'V', 'VRSK', 'WRK', 'XOM', 'IVV'],
 ['IWM', 'EWU', 'EWG', 'EWL', 'EWQ', 'IEUS', 'EWJ', 'EWT', 'MCHI'],
 ['INDA', 'EWY', 'EWA', 'EWH', 'EWZ', 'EWC', 'IEMG', 'LQD', 'HYG', 'SHY'],
 ['IEF', 'TLT', 'SEGA.L', 'IEAA.L', 'HIGH.L', 'JPEA.L', 'IAU', 'SLV', 'GSG', 'REET'],
 ['ICLN', 'IXN', 'IGF', 'IUVL.L', 'IUMO.L', 'SPMV.L', 'IEVL.L', 'IEFM.L', 'MVEU.L', 'XLK'],
 ['XLF', 'XLV', 'XLE', 'XLY', 'XLI', 'XLC', 'XLU', 'XLP', 'XLB', 'VXX']
 ]

In [3]:
# Reproducibility: seed every random number generator used in this notebook
seed_number = 0
np.random.seed(seed_number) # NumPy's global generator
random.seed(seed_number) # Python's built-in random number generator
tf.random.set_seed(seed_number) # TensorFlow's global seed

# Parameters
# Label for the dataset; not referenced by the other cells of this notebook.
data_name = 'm6'

# CSV file read below; `index_col` selects the column used as the row index.
data_file = 'data_original_m6.csv'
index_col = 0
# Fraction of rows kept, counted from the top of the file (1 = keep all rows).
shrink = 1
# Target range handed to MinMaxScaler.
feature_range = (0, 1)
# Fraction of each series, taken from its start, used for training; the
# remaining rows form the test set.
train_test_split = 0.8

# Training
lstm_units = 50   # units in the single LSTM layer
epochs = 20       # training epochs per model
batch_size = 10   # mini-batch size passed to model.fit

# Largest window size k; the experiment cell evaluates k = 1..max_window.
max_window = 3

# Load data
# Each ticker is later selected from this frame by column name.
data_original = pd.read_csv(data_file, index_col=index_col)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

In [None]:
# Experiments
#
# For every ticker, one LSTM is trained per windowing configuration and its
# train/test RMSE is appended to `results`. For each k in 1..max_window two
# configurations are run back to back:
#   * BtF windowing:       look_back = k,   look_front = k
#   * classical windowing: look_back = 2*k, look_front = 0
# so both variants feed the network the same number of input features (2*k).

# Windows
look_back_new = list(range(1, max_window+1))
look_front_new = look_back_new

look_back_orig = [int(x*2) for x in look_back_new]
look_front_orig = [0] * len(look_back_orig)

# Interleave the two variants: BtF, classical, BtF, classical, ...
back_list = list(itertools.chain(*zip(look_back_new, look_back_orig)))
front_list = list(itertools.chain(*zip(look_front_new, look_front_orig)))

# Initialize an empty DataFrame to store the results
results = pd.DataFrame(columns=['ticker', 'look_back', 'look_front', 'trainScore', 'testScore'])

# Shrink data: number of leading rows to keep (the same for every ticker,
# so it is computed once instead of inside the loops)
nrows = int(len(data_original) * shrink)

# Iterate over tickers_split
for tickers in tickers_split:

    # Iterate over tickers
    for ticker in tickers:

        # The scaled series and its train/test split depend only on the
        # ticker, so they are prepared once here rather than once per window.
        data = data_original.iloc[:nrows][ticker]

        # Preprocess data
        # NOTE(review): the scaler is fitted on the whole series before the
        # split, so the test set's min/max influence the training inputs.
        # Confirm this is acceptable for the experiment.
        scaler = MinMaxScaler(feature_range=feature_range)
        data = scaler.fit_transform(data.values.reshape(-1, 1))

        # Split data into train and test sets (no shuffling: leading rows train)
        train_size = int(len(data) * train_test_split)
        test_size = len(data) - train_size
        train, test = data[0:train_size,:], data[train_size:len(data),:]

        # Iterate over back and front windows
        for look_back, look_front in zip(back_list, front_list):

            # Create training data: `look_back` past values followed by
            # `look_front` values starting at position i.
            # NOTE(review): the second slice starts at i, so for
            # look_front > 0 it contains the target train[i, 0] itself.
            # Confirm this is the intended BtF definition.
            X_train, Y_train = [], []
            for i in range(look_back, len(train) - look_front):
                a = train[i - look_back:i, 0]
                b = train[i:i + look_front, 0]
                X_train.append(np.concatenate((a, b)))
                Y_train.append(train[i, 0])
            trainX_orig, trainY = np.array(X_train), np.array(Y_train)

            # Create testing data: no future values are used at test time.
            # The `look_front` extra features are filled with the most recent
            # past values instead; with look_front == 0 that slice is empty,
            # which is exactly classical windowing.
            # Starting at max(look_back, look_front) keeps both slice starts
            # non-negative (same as look_back for the windows configured here).
            X_test, Y_test = [], []
            for i in range(max(look_back, look_front), len(test) - look_front):
                a = test[i - look_back:i, 0]
                b = test[i - look_front:i, 0]
                X_test.append(np.concatenate((a, b)))
                Y_test.append(test[i, 0])
            testX_orig, testY = np.array(X_test), np.array(Y_test)

            # Reshape data input to be [samples, time steps, features]
            trainX = np.reshape(trainX_orig, (trainX_orig.shape[0], 1, trainX_orig.shape[1]))
            testX = np.reshape(testX_orig, (testX_orig.shape[0], 1, testX_orig.shape[1]))

            # Build model
            # input_shape describes ONE sample, i.e. (time steps, features) =
            # (1, look_back + look_front); the number of samples is not part of it.
            model = Sequential()
            model.add(LSTM(lstm_units, input_shape=(trainX.shape[1], trainX.shape[2])))
            model.add(Dense(1))
            model.compile(loss='mean_squared_error', optimizer='adam')

            # Train model
            model.fit(trainX, trainY, epochs=epochs, batch_size=batch_size, verbose=0)

            # Make predictions (silenced like fit, to keep the cell output readable)
            trainPredict = model.predict(trainX, verbose=0)
            testPredict = model.predict(testX, verbose=0)

            # Invert predictions back to the original price scale
            trainPredict = scaler.inverse_transform(trainPredict)
            trainY = scaler.inverse_transform([trainY])
            testPredict = scaler.inverse_transform(testPredict)
            testY = scaler.inverse_transform([testY])

            # Calculate root mean squared error
            trainScore = np.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
            testScore = np.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
            print('Train Score: %.2f RMSE' % (trainScore))
            print('Test Score: %.2f RMSE' % (testScore))

            # Store the detailed results for the current value.
            # Appended row by row so partial results survive an interrupted run.
            results.loc[len(results)] = [ticker, look_back, look_front, trainScore, testScore]

    # Saving name: identifies the batch by its first and last ticker
    saving_name = 'results_m6_' + tickers[0] + '_' + tickers[-1] + '_e' + str(epochs) + '.csv'

    # Save 'results' (cumulative: contains every batch processed so far)
    results.to_csv(saving_name, index=True)

In [5]:
# Compare BtF windowing against classical windowing.
#
# The experiment loop stores, for every ticker and window size, a BtF row
# (look_front > 0) followed by the matching classical row (look_front == 0).
# Selecting by `look_front` instead of by row parity gives the same pairing
# for that layout without depending on row position.
is_btf = results['look_front'] != 0

# BtF rows, renumbered 0..n-1 so they align with the classical rows below
results_btf = results[is_btf].reset_index(drop=True)

# Classical-windowing rows, renumbered the same way
results_normal = results[~is_btf].reset_index(drop=True)


def pct_diff(column):
    """Return the percentage change of `column` from classical to BtF, rounded to 4 decimals."""
    # Cast to float so division and rounding also work if the score columns
    # are not stored with a numeric dtype.
    btf = results_btf[column].astype(float)
    normal = results_normal[column].astype(float)
    return ((btf / normal - 1) * 100).round(4)


# Negative values mean BtF reached a lower RMSE than classical windowing
df_diff = pd.DataFrame({
    'trainScore_diff': pct_diff('trainScore'),
    'testScore_diff': pct_diff('testScore')
})

def color_cells(val):
    """Return the CSS colour for one cell: red for negative, green otherwise, none for NaN."""
    if pd.isna(val):
        return ''
    color = 'red' if val < 0 else 'green'
    return 'color: %s' % color

# Apply the color_cells function to every cell of df_diff.
# Styler.applymap is deprecated since pandas 2.1 in favour of Styler.map;
# fall back to applymap on older pandas versions.
styler = df_diff.style
style_elementwise = styler.map if hasattr(styler, 'map') else styler.applymap
df_diff_styled = style_elementwise(color_cells)
print('Value of the Future (VoF), the percentual difference from original windowing to BtF on RMSE:\n')
print('(RED values = BTF beats original windowing strategy)')
df_diff_styled

Value of the Future (VoF), the percentual difference from original windowing to BtF on RMSE:

(RED values = BTF beats original windowing strategy)


Unnamed: 0,trainScore_diff,testScore_diff
0,-51.2678,-8.7536
1,-51.7457,-4.546
2,-54.0247,3.8166
3,-52.8111,-12.3226
4,-43.5487,-9.6755
5,-45.4406,-10.9625
6,-40.145,-12.5551
7,-49.8321,-8.2406
8,-51.3571,-8.1129
9,-43.7562,-10.0453
