In [1]:
import pandas as pd

data = pd.read_csv('data/^NDX_raw_data.csv')
data.rename(columns={'Date': 'date', 'Open':'open', 'High':'high', 'Low':'low', 'Close':'close', 'Volume':'volume'}, inplace=True)

data_backup = data.iloc[3524:]

data = data.iloc[:3524]
data_copy = data.copy()

print('Data imported and copied.', flush=True)

Data imported and copied.


In [2]:
import numpy as np

## Creating sequences
def create_dataset(dataset, time_step=1, output_step=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-time_step-output_step):
        a = dataset[i:(i+time_step), 0]
        b = dataset[(i+time_step):(i+time_step)+output_step, 0]
        dataX.append(a)
        dataY.append(b)

    return np.array(dataX), np.array(dataY)

period = 60
trend_period = 14
rsi_period = 14
num_features = 3
input_period = 46
output_step = 7
units = 512

In [3]:
print('Initializing the Model...', flush=True)

import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Conv1D, AveragePooling1D, Flatten, Reshape, SimpleRNN, GRU, MaxPooling1D, concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.initializers import Zeros

inputs = Input(shape=(input_period, num_features))

model_cnn = Sequential([
    Conv1D(filters=352, kernel_size=1, activation='relu'),
    Conv1D(filters=352, kernel_size=1, activation='relu'),
    MaxPooling1D(pool_size=3),
    Dense(units=128),
    Flatten()
    #Dense(units=output_step),
    #Reshape((output_step,1))
])

model_bilstm = Sequential([
    Bidirectional(LSTM(units=384, return_sequences=True, activation='tanh', recurrent_activation='sigmoid')),
    Dropout(0.2),
    Flatten()
    #Dense(units=output_step),
    #Reshape((output_step,1))
])

model_bigru = Sequential([
    Bidirectional(GRU(units=128, activation='tanh', return_sequences=True)),
    Dropout(0.4),
    Flatten()
])

model_multilayer_lstm = Sequential([
    LSTM(units=64, return_sequences=True, activation='tanh', recurrent_activation='sigmoid'),
    Dropout(0.2),
    LSTM(units=64, return_sequences=True, activation='tanh', recurrent_activation='sigmoid'),
    Flatten()
])

output_cnn = model_cnn(inputs)
output_bilstm = model_bilstm(inputs)
output_bigru = model_bigru(inputs)
output_multilayer_lstm = model_multilayer_lstm(inputs)

concatenated_outputs = concatenate([output_cnn, output_bilstm, output_bigru, output_multilayer_lstm])

main_model = Sequential([
    Input(shape=(concatenated_outputs.shape[1],)),
    Dense(units=output_step),
    Reshape((output_step,1))
])

final_output = main_model(concatenated_outputs)

functional_pipeline = Model(inputs=inputs, outputs=final_output)

# Compile the pipeline model
functional_pipeline.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

print('Model has been initialized.', flush=True)

Initializing the Model...


2024-06-06 10:29:17.476322: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-06 10:29:17.927857: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-06 10:29:17.929341: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-06 10:29:18.008939: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-06 10:29:18.250126: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


Model has been initialized.


In [4]:
best_epoch_backup = 6

# Load the weights of the model at the chosen epoch
functional_pipeline.load_weights(f'model_weights_6_4_main/model_weights_epoch_{best_epoch_backup:02d}.h5')
print('Backup: Weigths for the best epoch has been loaded.')

Backup: Weigths for the best epoch has been loaded.


In [5]:
print('Importing the bulk data...', flush=True)

import os
import re

directory = 'data-china-from2018-test'

df_data = dict()

for file in os.listdir(directory):
    symbol_pattern = re.match(r'([^_]+)_', file)
    symbol = symbol_pattern.group(1)
    df_data[symbol] = pd.read_csv(os.path.join(directory, file))
    df_data[symbol].rename(columns={'Date': 'date', 'Open':'open', 'High':'high', 'Low':'low', 'Close':'close', 'Volume':'volume'}, inplace=True)

print('Bulk data has been imported.', flush=True)

Importing the bulk data...
Bulk data has been imported.


In [6]:
print('Creating the data sets...', flush=True)

dict_X_test, dict_y_test = dict(), dict()

for symbol in df_data:
    X_test, y_test = create_dataset(df_data[symbol][['close']].to_numpy(), time_step=period, output_step=output_step)

    dict_X_test[symbol] = X_test
    dict_y_test[symbol] = y_test

print('Data sets has been created.', flush=True)

Creating the data sets...
Data sets has been created.


In [7]:
print('Decomposing data...', flush=True)

from statsmodels.tsa.seasonal import seasonal_decompose

dict_decompositions = dict()
dict_seasons = dict()

for symbol in df_data:
    decompositions_test = np.array([seasonal_decompose(dict_X_test[symbol][i], model='additive', period=14) for i in range(dict_X_test[symbol].shape[0])])
    seasons_test = np.array([decompositions_test[i].seasonal for i in range(decompositions_test.shape[0])])

    dict_decompositions[symbol] = decompositions_test
    dict_seasons[symbol] = seasons_test

from talib import RSI

dict_rsi = dict()

for symbol in df_data:
    rsi_test = np.array([ RSI(dict_X_test[symbol][i]) for i in range(dict_X_test[symbol].shape[0]) ])

    dict_rsi[symbol] = rsi_test

import statsmodels.api as sm

dict_lowess = dict()

smoothing_lowess_test = sm.nonparametric.lowess

for symbol in df_data:
    lowess_test = np.array([ smoothing_lowess_test(dict_X_test[symbol][i], range(60), frac=0.1)[:, 1] for i in range(dict_X_test[symbol].shape[0]) ])

    dict_lowess[symbol] = lowess_test

print('Decomposition has been applied.', flush=True)

Decomposing data...
Decomposition has been applied.


In [8]:
print('Scaling data...', flush=True)

dict_lowess_dropna = dict()
dict_seasons_cropped = dict()
dict_rsi_dropna = dict()

for symbol in df_data:
    lowess_test_dropna = list()
    seasons_test_cropped = list()
    rsi_test_dropna = list()
    
    for _lowess in dict_lowess[symbol]:
        lowess_test_dropna.append(_lowess[rsi_period:])

    for _season in dict_seasons[symbol]:
        seasons_test_cropped.append(_season[rsi_period:])

    for _rsi in dict_rsi[symbol]:
        rsi_test_dropna.append(_rsi[rsi_period:])

    lowess_test_dropna = np.array(lowess_test_dropna)
    seasons_test_cropped = np.array(seasons_test_cropped)
    rsi_test_dropna = np.array(rsi_test_dropna)

    dict_lowess_dropna[symbol] = lowess_test_dropna
    dict_seasons_cropped[symbol] = seasons_test_cropped
    dict_rsi_dropna[symbol] = rsi_test_dropna

from sklearn.preprocessing import MinMaxScaler

dict_scalers_lowess = dict()
dict_scalers_seasonal = dict()
dict_scalers_target = dict()
dict_lowess_scaled = dict()
dict_seasons_scaled = dict()
dict_targets_scaled = dict()
dict_rsi_scaled = dict()

for symbol in df_data:
    scaler_lowess = list(MinMaxScaler() for i in range(dict_lowess_dropna[symbol].shape[0]))
    lowess_test_scaled = list()

    scaler_seasonal = list(MinMaxScaler() for i in range(dict_seasons_cropped[symbol].shape[0]))
    seasons_test_scaled = list()

    scaler_rsi = list(MinMaxScaler() for i in range(dict_rsi_dropna[symbol].shape[0]))
    rsi_test_scaled = list()

    # we use target values only for comparison issue here
    scaler_target = list(MinMaxScaler() for i in range(dict_y_test[symbol].shape[0]))
    target_test_scaled = list()

    for i in range(dict_lowess_dropna[symbol].shape[0]):
        lowess_test_scaled.append(scaler_lowess[i].fit_transform(dict_lowess_dropna[symbol][i].reshape(-1,1)))

    for j in range(dict_y_test[symbol].shape[0]):
        target_test_scaled.append(scaler_target[j].fit_transform(dict_y_test[symbol][j].reshape(-1,1)))

    for j in range(dict_seasons_cropped[symbol].shape[0]):
        seasons_test_scaled.append(scaler_seasonal[j].fit_transform(dict_seasons_cropped[symbol][j].reshape(-1,1)))

    for i in range(dict_rsi_dropna[symbol].shape[0]):
        rsi_test_scaled.append(scaler_rsi[i].fit_transform(dict_rsi_dropna[symbol][i].reshape(-1,1)))

    lowess_test_scaled = np.array(lowess_test_scaled)
    target_test_scaled = np.array(target_test_scaled)
    seasons_test_scaled = np.array(seasons_test_scaled)
    rsi_test_scaled = np.array(rsi_test_scaled)

    dict_scalers_lowess[symbol] = scaler_lowess
    dict_scalers_seasonal[symbol] = scaler_seasonal
    dict_scalers_target[symbol] = scaler_target
    dict_lowess_scaled[symbol] = lowess_test_scaled
    dict_seasons_scaled[symbol] = seasons_test_scaled
    dict_targets_scaled[symbol] = target_test_scaled
    dict_rsi_scaled[symbol] = rsi_test_scaled

print('Data scaled.', flush=True)

Scaling data...
Data scaled.


In [9]:
print('Preparing input data...', flush=True)

dict_x_input = dict()

for symbol in df_data:
    x_input = list()

    for lowess, season, rsi in zip(dict_lowess_scaled[symbol], dict_seasons_scaled[symbol], dict_rsi_scaled[symbol]):
        x_input.append(np.hstack((lowess, season, rsi)))

    x_input = np.array(x_input)

    dict_x_input[symbol] = x_input

print('Input data has been prepared.', flush=True)

Preparing input data...
Input data has been prepared.


In [10]:
print('Predicting...', flush=True)

dict_predictions = dict()

from tqdm import tqdm
for nb, symbol in tqdm(enumerate(df_data)):
    lowess_predictions = list()

    for j in range(dict_x_input[symbol].shape[0]):
        if (j+1) % 100 == 0:
            print(j+1)
            
        lowess_predictions.append(
            dict_scalers_lowess[symbol][j].inverse_transform(
                functional_pipeline.predict(dict_x_input[symbol][j].reshape(lowess_test_scaled[j].shape[1], input_period, num_features), verbose=0)[0].reshape(1,output_step)
            )
        )
    
    lowess_predictions = np.array(lowess_predictions)

    dict_predictions[symbol] = lowess_predictions

Predicting...


0it [00:00, ?it/s]

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


1it [03:53, 233.30s/it]

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


2it [08:55, 273.55s/it]

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


3it [15:49, 316.55s/it]


In [11]:
print('Adding noise to data...', flush=True)

dict_std_dev_noise = dict()

for symbol in df_data:
    noise_high, _ = create_dataset(df_data[symbol][['high']].to_numpy(), time_step=period, output_step=output_step)
    noise_low, _ = create_dataset(df_data[symbol][['low']].to_numpy(), time_step=period, output_step=output_step)

    std_dev_high = np.array([np.std(val) for val in noise_high])
    std_dev_low = np.array([np.std(val) for val in noise_low])
    
    std_dev_noise = np.array([np.maximum(val_high, val_low) for val_high, val_low in zip(std_dev_high, std_dev_low)])

    dict_std_dev_noise[symbol] = std_dev_noise

print('Noise to data has been added.', flush=True)

Adding noise to data...
Noise to data has been added.


In [14]:
period, input_period

(60, 46)

In [13]:
print('Constructing observations for data...', flush=True)

dict_observations = dict()

for symbol in df_data:
    observation = pd.Series([
        dict_predictions[symbol][i] for i in range(dict_predictions[symbol].shape[0])
    ], index=df_data[symbol].index[period:-output_step])

    dict_observations[symbol] = observation

print('Observations have been constructed.', flush=True)

Constructing observations for data...
Observations have been constructed.


In [15]:
data_output = df_data.copy()

count = 0

for symbol in df_data:
    count += 1

    _obs = [dict_observations[symbol][k+period] for k in range(dict_observations[symbol].shape[0])]
    _res = []
    for obs in _obs:
        _res.append(obs.reshape(-1,output_step)[0])
    
    observation_df = pd.DataFrame([r for r in _res], columns=[f'pred_{i}' for i in range(output_step)], index=df_data[symbol].index[60:-7])
    data_output[symbol] = pd.concat([data_output[symbol], observation_df], axis=1, join='inner')

    noise_df = pd.DataFrame(dict_std_dev_noise[symbol], columns=['noise'], index=df_data[symbol].index[60:-7])
    data_output[symbol] = pd.concat([data_output[symbol], noise_df], axis=1, join='inner')

    #for i in range(output_step):
    #    data_output[symbol][f'pred_{i}'] = data_output[symbol][f'pred_{i}'].shift(53)

    data_output[symbol].to_excel(f'data_w_pred_m6_4-lowess-china-test/{symbol}_w_pred_m5.xlsx')
    
    print(f'{count} done for {symbol}', flush=True)

import pickle

# Pickle the object and save it to a file
with open('data_pred-model_6_4-lowess-china-test.pickle', 'wb') as f:
    pickle.dump(dict_predictions, f)

print('Done.', flush=True)

1 done for 000063.SZ
2 done for 000001.SS
3 done for 000002.SZ
Done.
