In [1]:
import pandas as pd

data = pd.read_csv('data/^NDX_raw_data.csv')
data.rename(columns={'Date': 'date', 'Open':'open', 'High':'high', 'Low':'low', 'Close':'close', 'Volume':'volume'}, inplace=True)

data_backup = data.iloc[3524:]

data = data.iloc[:3524]
data_copy = data.copy()

print('Data imported and copied.', flush=True)

Data imported and copied.


In [2]:
import numpy as np

## Creating sequences
def create_dataset(dataset, time_step=1, output_step=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-time_step-output_step):
        a = dataset[i:(i+time_step), 0]
        b = dataset[(i+time_step):(i+time_step)+output_step, 0]
        dataX.append(a)
        dataY.append(b)

    return np.array(dataX), np.array(dataY)

period = 60
trend_period = 14
rsi_period = 14
num_features = 1
input_period = 60
output_step = 7
units = 512

In [3]:
print('Initializing the Model...', flush=True)

import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Conv1D, AveragePooling1D, Flatten, Reshape, SimpleRNN, GRU, MaxPooling1D, concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.initializers import Zeros

inputs = Input(shape=(input_period, num_features))

model_cnn = Sequential([
    Conv1D(filters=128, kernel_size=1, activation='relu'),
    Conv1D(filters=128, kernel_size=1, activation='relu'),
    MaxPooling1D(pool_size=3),
    Dense(units=128),
    Flatten()
    #Dense(units=output_step),
    #Reshape((output_step,1))
])

model_bilstm = Sequential([
    Bidirectional(LSTM(units=512, return_sequences=True, activation='tanh', recurrent_activation='sigmoid')),
    Dropout(0.2),
    Flatten()
    #Dense(units=output_step),
    #Reshape((output_step,1))
])

model_bigru = Sequential([
    Bidirectional(GRU(units=512, activation='tanh', return_sequences=True)),
    Dropout(0.4),
    Flatten()
])

model_multilayer_lstm = Sequential([
    LSTM(units=256, return_sequences=True, activation='tanh', recurrent_activation='sigmoid'),
    Dropout(0.2),
    LSTM(units=256, return_sequences=True, activation='tanh', recurrent_activation='sigmoid'),
    Flatten()
])

output_cnn = model_cnn(inputs)
output_bilstm = model_bilstm(inputs)
output_bigru = model_bigru(inputs)
output_multilayer_lstm = model_multilayer_lstm(inputs)

concatenated_outputs = concatenate([output_cnn, output_bilstm, output_bigru, output_multilayer_lstm])

main_model = Sequential([
    Input(shape=(concatenated_outputs.shape[1],)),
    Dense(units=output_step, kernel_initializer=Zeros(), use_bias=False),
    Reshape((output_step,1))
])

final_output = main_model(concatenated_outputs)

functional_pipeline = Model(inputs=inputs, outputs=final_output)

# Compile the pipeline model
functional_pipeline.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

print('Model has been initialized.', flush=True)

Initializing the Model...


2024-06-06 10:32:37.485966: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-06 10:32:38.026909: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-06 10:32:38.027146: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-06 10:32:38.120321: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-06 10:32:38.252692: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


Model has been initialized.


In [4]:
best_epoch_backup = 98

# Load the weights of the model at the chosen epoch
functional_pipeline.load_weights(f'model_weights_6_4_simple-w-savgol/model_weights_epoch_{best_epoch_backup:02d}.weights.h5')
print('Backup: Weigths for the best epoch has been loaded.')



Backup: Weigths for the best epoch has been loaded.


In [5]:
print('Importing the bulk data...', flush=True)

import os
import re

directory = 'data-china-from2018-test'

df_data = dict()

for file in os.listdir(directory):
    symbol_pattern = re.match(r'([^_]+)_', file)
    symbol = symbol_pattern.group(1)
    df_data[symbol] = pd.read_csv(os.path.join(directory, file))
    df_data[symbol].rename(columns={'Date': 'date', 'Open':'open', 'High':'high', 'Low':'low', 'Close':'close', 'Volume':'volume'}, inplace=True)

print('Bulk data has been imported.', flush=True)

Importing the bulk data...
Bulk data has been imported.


In [6]:
print('Creating the data sets...', flush=True)

dict_X_test, dict_y_test = dict(), dict()

for symbol in df_data:
    X_test, y_test = create_dataset(df_data[symbol][['close']].to_numpy(), time_step=period, output_step=output_step)

    dict_X_test[symbol] = X_test
    dict_y_test[symbol] = y_test

print('Data sets has been created.', flush=True)

Creating the data sets...
Data sets has been created.


In [7]:
print('Decomposing data...', flush=True)

from scipy.signal import savgol_filter

# Apply Savitzky-Golay filter
window_length = 11  # Window length (must be odd)
polyorder = 2      # Polynomial order

dict_savgol = dict()

for symbol in df_data:
    savgol_test = np.array([ savgol_filter(dict_X_test[symbol][i], window_length, polyorder) for i in range(dict_X_test[symbol].shape[0]) ])

    dict_savgol[symbol] = savgol_test

print('Decomposition has been applied.', flush=True)

Decomposing data...
Decomposition has been applied.


In [8]:
print('Scaling data...', flush=True)

dict_savgol_dropna = dict()

for symbol in df_data:
    savgol_test_dropna = list()
    
    for _savgol in dict_savgol[symbol]:
        savgol_test_dropna.append(_savgol)

    savgol_test_dropna = np.array(savgol_test_dropna)

    dict_savgol_dropna[symbol] = savgol_test_dropna

from sklearn.preprocessing import MinMaxScaler

dict_scalers_savgol = dict()
dict_scalers_target = dict()
dict_savgol_scaled = dict()
dict_targets_scaled = dict()

for symbol in df_data:
    scaler_savgol = list(MinMaxScaler() for i in range(dict_savgol_dropna[symbol].shape[0]))
    savgol_test_scaled = list()

    # we use target values only for comparison issue here
    scaler_target = list(MinMaxScaler() for i in range(dict_y_test[symbol].shape[0]))
    target_test_scaled = list()

    for i in range(dict_savgol_dropna[symbol].shape[0]):
        savgol_test_scaled.append(scaler_savgol[i].fit_transform(dict_savgol_dropna[symbol][i].reshape(-1,1)))

    for j in range(dict_y_test[symbol].shape[0]):
        target_test_scaled.append(scaler_target[j].fit_transform(dict_y_test[symbol][j].reshape(-1,1)))

    savgol_test_scaled = np.array(savgol_test_scaled)
    target_test_scaled = np.array(target_test_scaled)

    dict_scalers_savgol[symbol] = scaler_savgol
    dict_scalers_target[symbol] = scaler_target
    dict_savgol_scaled[symbol] = savgol_test_scaled
    dict_targets_scaled[symbol] = target_test_scaled

print('Data scaled.', flush=True)

Scaling data...
Data scaled.


In [9]:
print('Preparing input data...', flush=True)

dict_x_input = dict()

for symbol in df_data:
    x_input = list()

    for savgol in dict_savgol_scaled[symbol]:
        x_input.append(savgol)

    x_input = np.array(x_input)

    dict_x_input[symbol] = x_input

print('Input data has been prepared.', flush=True)

Preparing input data...
Input data has been prepared.


In [10]:
print('Predicting...', flush=True)

dict_predictions = dict()

from tqdm import tqdm
for nb, symbol in tqdm(enumerate(df_data)):
    savgol_predictions = list()

    for j in range(dict_x_input[symbol].shape[0]):
        if (j+1) % 100 == 0:
            print(j+1)
            
        savgol_predictions.append(
            dict_scalers_savgol[symbol][j].inverse_transform(
                functional_pipeline.predict(dict_x_input[symbol][j].reshape(savgol_test_scaled[j].shape[1], input_period, num_features), verbose=0)[0].reshape(1,output_step)
            )
        )
    
    savgol_predictions = np.array(savgol_predictions)

    dict_predictions[symbol] = savgol_predictions

Predicting...


0it [00:00, ?it/s]

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


1it [06:03, 363.28s/it]

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


2it [14:02, 431.53s/it]

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


3it [19:41, 393.67s/it]


In [11]:
print('Adding noise to data...', flush=True)

dict_std_dev_noise = dict()

for symbol in df_data:
    noise_high, _ = create_dataset(df_data[symbol][['high']].to_numpy(), time_step=period, output_step=output_step)
    noise_low, _ = create_dataset(df_data[symbol][['low']].to_numpy(), time_step=period, output_step=output_step)

    std_dev_high = np.array([np.std(val) for val in noise_high])
    std_dev_low = np.array([np.std(val) for val in noise_low])
    
    std_dev_noise = np.array([np.maximum(val_high, val_low) for val_high, val_low in zip(std_dev_high, std_dev_low)])

    dict_std_dev_noise[symbol] = std_dev_noise

print('Noise to data has been added.', flush=True)

Adding noise to data...
Noise to data has been added.


In [12]:
print('Constructing observations for data...', flush=True)

dict_observations = dict()

for symbol in df_data:
    observation = pd.Series([
        dict_predictions[symbol][i] for i in range(dict_predictions[symbol].shape[0])
    ], index=df_data[symbol].index[input_period:-output_step])

    dict_observations[symbol] = observation

print('Observations have been constructed.', flush=True)

Constructing observations for data...
Observations have been constructed.


In [13]:
data_output = df_data.copy()

count = 0

for symbol in df_data:
    count += 1

    _obs = [dict_observations[symbol][k+input_period] for k in range(dict_observations[symbol].shape[0])]
    _res = []
    for obs in _obs:
        _res.append(obs.reshape(-1,output_step)[0])
    
    observation_df = pd.DataFrame([r for r in _res], columns=[f'pred_{i}' for i in range(output_step)], index=df_data[symbol].index[60:-7])
    data_output[symbol] = pd.concat([data_output[symbol], observation_df], axis=1, join='inner')

    noise_df = pd.DataFrame(dict_std_dev_noise[symbol], columns=['noise'], index=df_data[symbol].index[60:-7])
    data_output[symbol] = pd.concat([data_output[symbol], noise_df], axis=1, join='inner')

    #for i in range(output_step):
    #    data_output[symbol][f'pred_{i}'] = data_output[symbol][f'pred_{i}'].shift(53)

    data_output[symbol].to_excel(f'data_w_pred_m6_4-simple-w-savgol-china-test/{symbol}_w_pred_m5.xlsx')
    
    print(f'{count} done for {symbol}', flush=True)

import pickle

# Pickle the object and save it to a file
with open('data_pred-model_6_4-simple-w-savgol-china-test.pickle', 'wb') as f:
    pickle.dump(dict_predictions, f)

print('Done.', flush=True)

1 done for 000063.SZ
2 done for 000001.SS
3 done for 000002.SZ
Done.
