In [None]:
#python notebook file which will contain the asset return predictions

### 1. Importing Modules

In [None]:
import random
import os
import math

import pandas as pd
import numpy as np
from numpy.random import seed 
import tensorflow as tf

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
from scipy import stats
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split
from pmdarima.arima import auto_arima
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
#fixing the seed of every random number generator in use (python, numpy, tensorflow) for reproducibility
#note: this rebinds the name `seed` that the import cell took from numpy.random
seed = 1
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(seed)

In [None]:
#general hyperparameter for lstm: lengths of the input window and of the predicted window
input_steps, output_steps = 20, 1 #20 observations in, one day ahead prediction out

### 2. Defining Functions

In [None]:
#function - cutting a series into (input window, following output window) pairs (from: https://github.com/krishnaik06/Time-Series-Forecasting/blob/master/UnivariateTimeSeries.ipynb)
def split_sequence(sequence, n_steps_in, n_steps_out):
    """Turn a sequence into sliding-window samples for supervised learning.

    Sample k uses the n_steps_in elements starting at position k as input and the
    n_steps_out elements directly after them as target. Start positions whose
    complete window would run past the end of the sequence produce no sample.

    Returns:
        (X, y): two numpy arrays built from the list of input windows and the
        list of target windows, in order of their start position.
    """
    span = n_steps_in + n_steps_out
    # number of start positions for which input and target both still fit
    n_windows = min(len(sequence), max(len(sequence) - span + 1, 0))
    inputs = [sequence[start:start + n_steps_in] for start in range(n_windows)]
    targets = [sequence[start + n_steps_in:start + span] for start in range(n_windows)]
    return np.array(inputs), np.array(targets)

### 3. Loading Data

In [None]:
# Initialize an empty dictionary to store the loaded DataFrames
data_dict = {}

# Directory that holds the export_<ticker>.csv files. The original location stays the
# default; set the ASSET_DATA_DIR environment variable to run the notebook on another machine.
directory = os.environ.get('ASSET_DATA_DIR', '/Users/cemakkus/PycharmProjects/Master/data/')

# Iterate through each file in the directory
for filename in os.listdir(directory):
    if filename.startswith('export_') and filename.endswith('.csv'):
        # Extract the ticker from the filename
        ticker = filename.replace('export_', '').replace('.csv', '')

        # Define the full file path
        file_path = os.path.join(directory, filename)

        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)

        # Drop the FIRST `output_steps` rows of the file.
        # NOTE(review): the previous comment spoke of dropping the *last* rows that contain
        # NaN's, but iloc[output_steps:] removes rows from the top - confirm where the NaN's are.
        # .copy() stores an independent frame, so adding feature columns to it later does
        # not operate on a slice of `df` (avoids pandas' SettingWithCopyWarning).
        data_dict[ticker] = df.iloc[output_steps:].copy()

        print(f'DataFrame for {ticker} loaded from {file_path}')


# At this point, data_dict contains all DataFrames; rebuild it so the keys are sorted by ticker
data_dict = {k: data_dict[k] for k in sorted(data_dict)}


### 4. Feature Engineering for LSTM

In [None]:
#hyperparameter for feature engineering (at this part of the code, so models use exactly the same dates for their predictions)
#n_features: number of engineered feature series per asset, window: number of rows per rolling window
n_features, window = 2, 10

In [None]:
#feature engineering for the top 3 german components in eurostoxx50 (SAP, SIE, DTE):
#rolling mean and rolling standard deviation of the 1d return over `window` rows,
#added as columns 'Rolling_Mean' / 'Rolling_Std' to the dataframe of the same ticker

#SAP
rolling_SAP = data_dict['SAP_DE']['1d_return'].rolling(window)
rolmean_SAP = rolling_SAP.mean()
rolstd_SAP = rolling_SAP.std()
data_dict['SAP_DE']['Rolling_Mean'] = rolmean_SAP
data_dict['SAP_DE']['Rolling_Std'] = rolstd_SAP

#SIE
rolling_SIE = data_dict['SIE_DE']['1d_return'].rolling(window)
rolmean_SIE = rolling_SIE.mean()
rolstd_SIE = rolling_SIE.std()
data_dict['SIE_DE']['Rolling_Mean'] = rolmean_SIE
data_dict['SIE_DE']['Rolling_Std'] = rolstd_SIE

#DTE
rolling_DTE = data_dict['DTE_DE']['1d_return'].rolling(window)
rolmean_DTE = rolling_DTE.mean()
rolstd_DTE = rolling_DTE.std()
data_dict['DTE_DE']['Rolling_Mean'] = rolmean_DTE
data_dict['DTE_DE']['Rolling_Std'] = rolstd_DTE

In [None]:
#disregarding the first window-1 rows, for which no rolling feature values can be generated - top 3 german components
#caution: the slice is applied to the current dataframe, so re-running this cell removes another window-1 rows
for component in ('SAP_DE', 'SIE_DE', 'DTE_DE'):
    data_dict[component] = data_dict[component][window-1:]

#do this for all if needed later

### 5. Pre-Processing for ARIMA (Determining Best ARIMA models & Generating Train/Test Sets)

In [None]:
#hyperparameter for train/test split: share of the observations that is NOT put into the test set;
#the splits below pass 1 - train_test_ratio as test_size and do not shuffle
train_test_ratio = 0.8

In [None]:
#chronological split of the sap asset returns for arima: no shuffling,
#the last (1 - train_test_ratio) share of the observations forms the test set
train_arima_SAP, test_arima_SAP = train_test_split(
    data_dict['SAP_DE']['1d_return'],
    test_size=1-train_test_ratio,
    shuffle=False,
)

In [None]:
#determining the most suitable arima order for the SAP prediction (search runs on the training set only)
autoarima_SAP = auto_arima(
    train_arima_SAP,
    start_p=0, max_p=3,      #search range for p
    start_q=0, max_q=3,      #search range for q
    d=None, test='adf',      #let the model determine 'd' with the augmented dickey-fuller test
    seasonal=False,          #no seasonality
    stepwise=True,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
)

#summary table and diagnostic plots of the selected model
print(autoarima_SAP.summary())
autoarima_SAP.plot_diagnostics(figsize=(11,7))
plt.show()

### 6. Pre-Processing for LSTM (Splitting Sequences & Generating Train/Validation/Test Sets)

In [1]:
#general hyperparameter
#note: repeats the values of the lstm hyperparameter cell near the top of the notebook; keep both
#cells in sync, because output_steps is already used there when the csv files are loaded
input_steps = 20 #number of observations in each lstm input window
output_steps = 1 #one day ahead prediction

In [None]:
#creating two sequence out of which one predicts the other - asset return of SAP
X_SAP,Y_SAP = split_sequence(data_dict['SAP_DE']['1d_return'], input_steps, output_steps)

#creating two sequence out of which one predicts the other - features of SAP
X1_SAP,Y1_SAP = split_sequence(data_dict['SAP_DE']['Rolling_Mean'], input_steps, output_steps)

X2_SAP,Y2_SAP = split_sequence(data_dict['SAP_DE']['Rolling_Std'], input_steps, output_steps)

#combining the input variables for the lstm into shape (samples, input_steps, n_features + 1).
#The three (samples, input_steps) arrays are stacked along a new last axis, so that
#X_ft_SAP[i, t, f] is series f (0 = return, 1 = rolling mean, 2 = rolling std) at time step t
#of sample i. Concatenating them along axis 0 and reshaping with order='F' gives the same
#shape, but mixes values of different series and time steps within one channel.
X_ft_SAP = np.stack([X_SAP, X1_SAP, X2_SAP], axis=-1)
assert X_ft_SAP.shape[1:] == (input_steps, n_features + 1), 'unexpected lstm input shape'

#bringing Y in array form: one target per sample (the last of the output_steps predicted values)
Y_SAP = np.reshape(Y_SAP, (Y_SAP.shape[0], output_steps))[:, output_steps - 1]

#splitting sequence into train, val and test data (60-20-20 split), chronologically (no shuffling)
# First split: Separate out the test set
X_temp, X_ft_test_SAP, Y_temp, Y_test_SAP = train_test_split(X_ft_SAP, Y_SAP, test_size=1-train_test_ratio, shuffle=False)
# Second split: Split the remaining data into training and validation sets
# (test_size is relative to the remaining data: 0.25 * 0.8 = 0.2; for a 70-15-15 split it would be 0.1765 * 0.85 = 0.15)
X_ft_train_SAP, X_ft_val_SAP, Y_train_SAP, Y_val_SAP = train_test_split(X_temp, Y_temp, test_size=0.25, shuffle=False)