In [1]:
import pandas as pd

data = pd.read_csv('data/^NDX_raw_data.csv')
data.rename(columns={'Date': 'date', 'Open':'open', 'High':'high', 'Low':'low', 'Close':'close', 'Volume':'volume'}, inplace=True)

data_backup = data.iloc[3524:]

data = data.iloc[:3524]
data_copy = data.copy()

print('Data imported and copied.', flush=True)

Data imported and copied.


In [2]:
import numpy as np

## Creating sequences
def create_dataset(dataset, time_step=1, output_step=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-time_step-output_step):
        a = dataset[i:(i+time_step), 0]
        b = dataset[(i+time_step):(i+time_step)+output_step, 0]
        dataX.append(a)
        dataY.append(b)

    return np.array(dataX), np.array(dataY)

period = 60
trend_period = 14
num_features = 1
input_period = 46
output_step = 7
units = 1024

In [3]:
print('Initializing the Model...', flush=True)

import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Build LSTM model
model = Sequential([
    LSTM(units=units, input_shape=(input_period, num_features)),
    #LSTM(units=units, return_sequences=True, input_shape=(input_period, num_features)),
    #Dropout(0.2),
    #LSTM(units=units, return_sequences=False),
    #Dropout(0.2),
    Dense(output_step)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Define a ModelCheckpoint callback to save weights at the end of each epoch
checkpoint = ModelCheckpoint(filepath='model_weights_2/model_weights_epoch_{epoch:02d}.h5', 
                            save_best_only=True, save_weights_only=True)

# Adding early stopping to prevent overfitting
#early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

print('Model has been initialized.', flush=True)

Initializing the Model...


2024-05-22 09:06:15.206715: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-22 09:06:15.278532: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-22 09:06:15.278590: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-22 09:06:15.282014: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-22 09:06:15.296045: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


Model has been initialized.


In [4]:
best_epoch_backup = 94

# Load the weights of the model at the chosen epoch
model.load_weights(f'model_weights_2/model_weights_epoch_{best_epoch_backup:02d}.h5')
print('Backup: Weigths for the best epoch has been loaded.', flush=True)

Backup: Weigths for the best epoch has been loaded.


In [5]:
print('Importing the bulk data...', flush=True)

import os
import re

directory = 'data-china-from2018-test'

df_data = dict()

for file in os.listdir(directory):
    symbol_pattern = re.match(r'([^_]+)_', file)
    symbol = symbol_pattern.group(1)
    df_data[symbol] = pd.read_csv(os.path.join(directory, file))
    df_data[symbol].rename(columns={'Date': 'date', 'Open':'open', 'High':'high', 'Low':'low', 'Close':'close', 'Volume':'volume'}, inplace=True)

print('Bulk data has been imported.', flush=True)

Importing the bulk data...
Bulk data has been imported.


In [6]:
print('Creating the data sets...', flush=True)

dict_X_test, dict_y_test = dict(), dict()

for symbol in df_data:
    X_test, y_test = create_dataset(df_data[symbol][['close']].to_numpy(), time_step=period, output_step=output_step)

    dict_X_test[symbol] = X_test
    dict_y_test[symbol] = y_test

print('Data sets has been created.', flush=True)

Creating the data sets...
Data sets has been created.


In [7]:
print('Decomposing data...', flush=True)

from statsmodels.tsa.seasonal import seasonal_decompose

dict_decompositions = dict()
dict_trends = dict()

for symbol in df_data:
    decompositions_test = np.array([seasonal_decompose(dict_X_test[symbol][i], model='additive', period=14) for i in range(dict_X_test[symbol].shape[0])])
    trends_test = np.array([decompositions_test[i].trend for i in range(decompositions_test.shape[0])])

    dict_decompositions[symbol] = decompositions_test
    dict_trends[symbol] = trends_test

print('Decomposition has been applied.', flush=True)

Decomposing data...
Decomposition has been applied.


In [8]:
print('Scaling data...', flush=True)

dict_trends_dropna = dict()

for symbol in df_data:
    trends_test_dropna = list()
    
    for trend in dict_trends[symbol]:
        trends_test_dropna.append(trend[~np.isnan(trend)])

    trends_test_dropna = np.array(trends_test_dropna)

    dict_trends_dropna[symbol] = trends_test_dropna

from sklearn.preprocessing import MinMaxScaler

dict_scalers_trend = dict()
dict_scalers_target = dict()
dict_trends_scaled = dict()
dict_targets_scaled = dict()

for symbol in df_data:
    scaler_trend = list(MinMaxScaler() for i in range(dict_trends_dropna[symbol].shape[0]))
    trends_test_scaled = list()

    # we use target values only for comparison issue here
    scaler_target = list(MinMaxScaler() for i in range(dict_y_test[symbol].shape[0]))
    target_test_scaled = list()

    for i in range(dict_trends_dropna[symbol].shape[0]):
        trends_test_scaled.append(scaler_trend[i].fit_transform(dict_trends_dropna[symbol][i].reshape(-1,1)))

    for j in range(dict_y_test[symbol].shape[0]):
        target_test_scaled.append(scaler_target[j].fit_transform(dict_y_test[symbol][j].reshape(-1,1)))

    trends_test_scaled = np.array(trends_test_scaled)
    target_test_scaled = np.array(target_test_scaled)

    dict_scalers_trend[symbol] = scaler_trend
    dict_scalers_target[symbol] = scaler_target
    dict_trends_scaled[symbol] = trends_test_scaled
    dict_targets_scaled[symbol] = target_test_scaled

print('Data scaled.', flush=True)

Scaling data...
Data scaled.


In [9]:
print('Predicting...', flush=True)

dict_predictions = dict()

from tqdm import tqdm
for nb, symbol in tqdm(enumerate(df_data)):
    trend_predictions = list()

    #print(f'{nb+1} predicting for {symbol}', flush=True)
    
    for j in range(dict_trends_scaled[symbol].shape[0]):
        #print(f'{j+1} for {symbol}')
        trend_predictions.append(
            dict_scalers_trend[symbol][j].inverse_transform(
                model.predict(dict_trends_scaled[symbol][j].reshape(dict_trends_scaled[symbol][j].shape[1], input_period, num_features), verbose=0)
            )
        )
    
    trend_predictions = np.array(trend_predictions)

    dict_predictions[symbol] = trend_predictions

print('Predictions have been completed.', flush=True)

Predicting...


1it [04:20, 260.90s/it]

Predictions have been completed.





In [10]:
## Iteration cycle:
print('Iterating...', flush=True)

dict_trends_dropna_iter = dict()
dict_trends_scaled_iter = dict()

for symbol in df_data:
    trends_iter = list()
    for i in range(dict_trends_dropna[symbol].shape[0]):
        trends_iter.append(
            dict_scalers_trend[symbol][i].transform(
                np.append(dict_trends_dropna[symbol][i][output_step:], dict_predictions[symbol][i][0]).reshape(-1,1)
            )
        )
    dict_trends_scaled_iter[symbol] = np.array(trends_iter)

dict_predictions_iter = dict()

from tqdm import tqdm
for nb, symbol in tqdm(enumerate(df_data)):
    trend_predictions_iter = list()

    for j in range(dict_trends_scaled_iter[symbol].shape[0]):
        trend_predictions_iter.append(
            dict_scalers_trend[symbol][j].inverse_transform(
                model.predict(dict_trends_scaled_iter[symbol][j].reshape(dict_trends_scaled_iter[symbol][j].shape[1], input_period, num_features), verbose=0)
            )
        )

    dict_predictions_iter[symbol] = np.array(trend_predictions_iter)

print('Iterations have been completed.', flush=True)

Iterating...


1it [04:22, 262.49s/it]

Iterations have been completed.





In [11]:
print('Constructing observations for data...', flush=True)

dict_observations = dict()

for symbol in df_data:
    observation = pd.Series([
        dict_predictions_iter[symbol][i][0] for i in range(dict_predictions_iter[symbol].shape[0])
    ], index=df_data[symbol].index[output_step:-period])

    dict_observations[symbol] = observation

print('Observations have been constructed.', flush=True)

Constructing observations for data...
Observations have been constructed.


In [12]:
print('Adding noise to data...', flush=True)

dict_std_dev_noise = dict()

for symbol in df_data:
    noise_high, _ = create_dataset(df_data[symbol][['high']].to_numpy(), time_step=period, output_step=output_step)
    noise_low, _ = create_dataset(df_data[symbol][['low']].to_numpy(), time_step=period, output_step=output_step)

    std_dev_high = np.array([np.std(val) for val in noise_high])
    std_dev_low = np.array([np.std(val) for val in noise_low])
    
    std_dev_noise = np.array([np.maximum(val_high, val_low) for val_high, val_low in zip(std_dev_high, std_dev_low)])

    dict_std_dev_noise[symbol] = std_dev_noise

print('Noise to data has been added.', flush=True)

Adding noise to data...
Noise to data has been added.


In [13]:
print('Calculating confidence interval...', flush=True)

dict_induced_gap = dict()

for symbol in df_data:
    induced_highs = np.array([dict_observations[symbol][i+7] + dict_std_dev_noise[symbol][i] for i in range(dict_std_dev_noise[symbol].shape[0])])
    induced_lows = np.array([dict_observations[symbol][i+7] - dict_std_dev_noise[symbol][i] for i in range(dict_std_dev_noise[symbol].shape[0])])
    
    induced_highs = pd.DataFrame(induced_highs, index=df_data[symbol].index[7:-60])
    induced_lows = pd.DataFrame(induced_lows, index=df_data[symbol].index[7:-60])

    dict_induced_gap[symbol] = {'induced_high':induced_highs, 'induced_low':induced_lows}

print('Induced gap calculated.', flush=True)

Calculating confidence interval...
Induced gap calculated.


In [14]:
data_output = df_data.copy()

count = 0

for symbol in df_data:
    count += 1

    observation_df = pd.DataFrame([dict_observations[symbol][k+7] for k in range(dict_observations[symbol].shape[0])], columns=[f'pred_{i}' for i in range(output_step)], index=df_data[symbol].index[7:-60])
    data_output[symbol] = pd.concat([data_output[symbol], observation_df], axis=1, join='inner')

    noise_df = pd.DataFrame(dict_std_dev_noise[symbol], columns=['noise'], index=df_data[symbol].index[7:-60])
    data_output[symbol] = pd.concat([data_output[symbol], noise_df], axis=1, join='inner')

    for i in range(output_step):
        data_output[symbol][f'pred_{i}'] = data_output[symbol][f'pred_{i}'].shift(53)

    data_output[symbol].to_excel(f'data_w_pred_m5-china-test/{symbol}_w_pred_m5.xlsx')
    
    print(f'{count} done for {symbol}', flush=True)

import pickle

# Pickle the object and save it to a file
with open('data_pred-china-bulk.pickle', 'wb') as f:
    pickle.dump(dict_predictions, f)

print('Done.', flush=True)

1 done for 000157.SZ
Done.
