In [1]:
import pandas as pd

data = pd.read_csv('data/^NDX_raw_data.csv')
data.rename(columns={'Date': 'date', 'Open':'open', 'High':'high', 'Low':'low', 'Close':'close', 'Volume':'volume'}, inplace=True)

data_backup = data.iloc[3524:]

data = data.iloc[:3524]
data_copy = data.copy()

print('Data imported and copied.', flush=True)

Data imported and copied.


In [2]:
import numpy as np

## Creating sequences
def create_dataset(dataset, time_step=1, output_step=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-time_step-output_step):
        a = dataset[i:(i+time_step), 0]
        b = dataset[(i+time_step):(i+time_step)+output_step, 0]
        dataX.append(a)
        dataY.append(b)

    return np.array(dataX), np.array(dataY)

period = 14
trend_period = 14
rsi_period = 14
num_features = 1
input_period = 14
output_step = 7
units = 512

In [3]:
X, y = create_dataset(data[['close']].to_numpy(), time_step=period, output_step=output_step)

data_input = np.array([np.concatenate((X[i], y[i]), axis=0) for i in range(X.shape[0])])

In [4]:
X.shape, data_input.shape, X[0], data_input[0]

((3503, 14),
 (3503, 21),
 array([1463.56994629, 1496.57995605, 1501.26000977, 1514.26000977,
        1530.65002441, 1520.45996094, 1539.77001953, 1524.7800293 ,
        1534.07995605, 1532.01000977, 1553.61999512, 1552.86999512,
        1546.59997559, 1530.42004395]),
 array([1463.56994629, 1496.57995605, 1501.26000977, 1514.26000977,
        1530.65002441, 1520.45996094, 1539.77001953, 1524.7800293 ,
        1534.07995605, 1532.01000977, 1553.61999512, 1552.86999512,
        1546.59997559, 1530.42004395, 1531.20996094, 1553.66003418,
        1519.22998047, 1491.56994629, 1496.40002441, 1493.07995605,
        1487.83996582]))

In [5]:
data_input.shape

(3503, 21)

In [6]:
np.where(np.diff(data_input[0]) > 0, 1, -1)

array([ 1,  1,  1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,
        1, -1, -1])

In [7]:
data_diff = list()

for data in data_input:
    _diff = list()
    for i in range(data.shape[0]-1):
        epsilon = 1000 * (data[i+1] - data[i]) / data[i]
        if data[i+1] - data[i] > 2 * epsilon:
            _diff.append(1)
        elif data[i+1] - data[i] < -2 * epsilon:
            _diff.append(-1)
        else:
            _diff.append(0)
    data_diff.append(_diff)

data_diff = np.array(data_diff)

In [8]:
data_diff.shape, data_diff[:40]

((3503, 20),
 array([[0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1],
        [0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0],
        [0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1],
        [0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0],
        [1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0],
        [0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1],
        [1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0],
        [0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0],
        [1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1],
        [0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1],
        [1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0],
        [1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0],
        [1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1],
        [0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1],
       

In [9]:
data_diff_input = np.array([ data_diff[i][:14] for i in range(data_diff.shape[0]) ])
data_diff_target = np.array([ data_diff[i][14:] for i in range(data_diff.shape[0]) ])

In [10]:
data_diff_input.shape, data_diff_target.shape

((3503, 14), (3503, 6))

In [11]:
from tensorflow.keras.utils import to_categorical

pred_mapped = pred_categorical + 1  # Shift labels to 0, 1, 2
pred_one_hot = to_categorical(pred_mapped)

2024-06-05 10:52:28.822888: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-05 10:52:28.904186: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-05 10:52:28.904236: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-05 10:52:28.907672: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-05 10:52:28.922481: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


NameError: name 'pred_categorical' is not defined

In [11]:
pred_one_hot.shape

(3503, 3)

## Model

In [35]:
input_period, num_features

(14, 1)

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

model_main = Sequential([
    LSTM(units=128, input_shape=(input_period, num_features)),
    Dropout(0.2),
    Dense(6, activation='softmax')
])

model_main.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model_main.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               66560     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 6)                 774       
                                                                 
Total params: 67334 (263.02 KB)
Trainable params: 67334 (263.02 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
history = model_main.fit(data_diff_input, data_diff_target, epochs=30, batch_size=32, validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## data testing

In [15]:
data_backup = pd.read_csv('data/^NDX_raw_data.csv')
data_backup.rename(columns={'Date': 'date', 'Open':'open', 'High':'high', 'Low':'low', 'Close':'close', 'Volume':'volume'}, inplace=True)

data_backup_copy = data_backup.copy()

In [22]:
X_test, y_test = create_dataset(data_backup[['close']].to_numpy(), time_step=period+1, output_step=output_step)

In [23]:
X_test.shape, y_test.shape

((5011, 15), (5011, 7))

In [24]:
data_diff_test = list()

for data in X_test:
    _diff = list()
    for i in range(data.shape[0]-1):
        epsilon = 1000 * (data[i+1] - data[i]) / data[i]
        if data[i+1] - data[i] > 2 * epsilon:
            _diff.append(1)
        elif data[i+1] - data[i] < -2 * epsilon:
            _diff.append(-1)
        else:
            _diff.append(0)
    data_diff_test.append(_diff)

data_diff_test = np.array(data_diff_test)

In [30]:
data_diff_test.shape

(5011, 14)

In [29]:
data_diff_test[0].shape

(14,)

In [36]:
model_main.predict(data_diff_test[0].reshape(1, input_period, num_features))



array([[0.17687415, 0.1103949 , 0.1673206 , 0.17547093, 0.22794065,
        0.14199881]], dtype=float32)

In [37]:
predictions_test = list()

for j in range(data_diff_test.shape[0]):
    if (j+1) % 100 == 0:
        print(j+1)
        
    predictions_test.append(
        model_main.predict(data_diff_test[j].reshape(1, input_period, num_features), verbose=0)[0].reshape(1,6)
    )

predictions_test = np.array(predictions_test)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000


In [45]:
predictions_test

array([[[0.17687415, 0.1103949 , 0.1673206 , 0.17547093, 0.22794065,
         0.14199881]],

       [[0.17687145, 0.11039322, 0.1673257 , 0.1754736 , 0.22793718,
         0.14199881]],

       [[0.17687081, 0.11039282, 0.16732255, 0.17547566, 0.22793983,
         0.1419983 ]],

       ...,

       [[0.21387587, 0.19814724, 0.16645236, 0.12928161, 0.1241545 ,
         0.1680884 ]],

       [[0.176876  , 0.11039437, 0.16732234, 0.17546742, 0.22793956,
         0.14200029]],

       [[0.2139515 , 0.19812053, 0.16641216, 0.12929773, 0.12415482,
         0.16806318]]], dtype=float32)

In [41]:
# Convert probabilities to class labels
predictions_test_labels = np.array([ np.argmax(predictions_test[i], axis=1) for i in range(predictions_test.shape[0]) ])

In [44]:
predictions_test_labels.reshape(1,-1)[0]

array([4, 4, 4, ..., 0, 4, 0])

In [43]:
res = pd.Series(savgol_predictions_test_labels.reshape(1,-1)[0], index=data_backup.index[period:-output_step])

In [49]:
res

14      1
15      1
16      1
17      1
18      2
       ..
5021    1
5022    1
5023    1
5024    1
5025    1
Length: 5012, dtype: int64

In [62]:
data_output = data_backup.copy()

In [63]:
res.astype(np.int32)

14      1
15      1
16      1
17      1
18      2
       ..
5021    1
5022    1
5023    1
5024    1
5025    1
Length: 5012, dtype: int32

In [64]:
data_output = pd.concat([data_output, res.astype(np.int16)], axis=1, join='outer')

In [65]:
data_output.iloc[:40]

Unnamed: 0,date,open,high,low,close,volume,0
0,2004-01-02 00:00:00-05:00,1474.160034,1479.589966,1458.51001,1463.569946,1666780000,
1,2004-01-05 00:00:00-05:00,1474.550049,1496.579956,1474.189941,1496.579956,2362910000,
2,2004-01-06 00:00:00-05:00,1492.410034,1504.469971,1486.589966,1501.26001,2273220000,
3,2004-01-07 00:00:00-05:00,1498.380005,1514.449951,1491.199951,1514.26001,2294280000,
4,2004-01-08 00:00:00-05:00,1524.060059,1530.650024,1513.339966,1530.650024,2683950000,
5,2004-01-09 00:00:00-05:00,1516.589966,1541.839966,1512.859985,1520.459961,2482760000,
6,2004-01-12 00:00:00-05:00,1524.540039,1540.099976,1515.859985,1539.77002,2284010000,
7,2004-01-13 00:00:00-05:00,1540.349976,1541.829956,1513.209961,1524.780029,2385700000,
8,2004-01-14 00:00:00-05:00,1531.930054,1536.839966,1521.910034,1534.079956,2099970000,
9,2004-01-15 00:00:00-05:00,1524.72998,1545.650024,1515.060059,1532.01001,2235590000,


In [67]:
data_output.to_excel(f'data_w_pred_m6_6-classification/test_w_pred_m5.xlsx')