In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, Conv1D, MaxPooling1D, Flatten
from sklearn.metrics import mean_squared_error, mean_absolute_error
from data_pipeline.mongodb_accessor import StockDataMongoDB
from tqdm import tqdm
from tensorflow.keras.regularizers import l2
from keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from dotenv import load_dotenv
load_dotenv()

# Load the data
mongo_accessor = StockDataMongoDB()
data = mongo_accessor.get_stock_data("QQQ")
data = data[['close', 'volume']]

# Calculate additional features
data['price_diff'] = data['close'].diff()  # Difference between consecutive prices
data['direction'] = (data['price_diff'] > 0).astype(int)  # Direction label (1 for up, 0 for down)

# Drop NaN values created by `diff`
data = data.sort_index().dropna()

dataset = data.values

Connected to MongoDB: stock_data.daily_prices
Fetched 5000 records for symbol 'QQQ'.


In [18]:
print(data.sort_index())
print(sum(data['direction']) / 3653)

          close     volume  price_diff  direction
0     518.58002   29002300     8.35001          1
1     510.23001   36389800    -1.00000          0
2     511.23001   29117000    -4.37998          0
3     515.60999   34584000    -6.95001          0
4     522.56000   33839600    -7.03998          0
...         ...        ...         ...        ...
4994   37.61000  107270600    -0.02000          0
4995   37.63000   78460600     0.41000          1
4996   37.22000   95731300    -0.40000          0
4997   37.62000   79988700     0.21000          1
4998   37.41000   99334600     0.47000          1

[4999 rows x 4 columns]
0.7547221461812209


In [25]:
# Separate features and target
features = data[['close', 'volume', 'price_diff']].values
target = data['direction'].values  # Predicting direction (classification)

# Separate scalers for price, volume, and price_diff
price_scaler = MinMaxScaler(feature_range=(0, 1))
volume_scaler = MinMaxScaler(feature_range=(0, 1))
diff_scaler = MinMaxScaler(feature_range=(-1, 1))  # Scaled between -1 and 1 for difference

scaled_price = price_scaler.fit_transform(features[:, 0].reshape(-1, 1))
scaled_volume = volume_scaler.fit_transform(features[:, 1].reshape(-1, 1))
scaled_diff = diff_scaler.fit_transform(features[:, 2].reshape(-1, 1))

# Combine scaled features
scaled_data = np.hstack((scaled_price, scaled_volume, scaled_diff))

# Create sequences for LSTM input
def create_sequences(data, labels, seq_length):
    X = []
    y = []
    for i in range(seq_length, len(data)):
        X.append(data[i-seq_length:i])  # Last `seq_length` rows as input
        y.append(labels[i])  # Current row's direction as the target
    return np.array(X), np.array(y)

seq_length = 60
X, y = create_sequences(scaled_data, target, seq_length)
print(y)
# Ensure shapes are correct
print("Input shape (X):", X.shape)  # (n_samples, seq_length, n_features)
print("Output shape (y):", y.shape)  # (n_samples,)


[1 0 1 ... 0 1 1]
Input shape (X): (4939, 60, 3)
Output shape (y): (4939,)


In [30]:
# Modified model to accept 2 features
# Build the model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dropout, Dense, BatchNormalization

def build_model(input_shape):
    model = Sequential()
    
    # First convolutional block
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(BatchNormalization())  # Helps with convergence
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.2))
    
    # Second convolutional block
    model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.2))
    
    # LSTM layers
    model.add(LSTM(units=64, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(units=32, return_sequences=False))
    model.add(Dropout(0.2))
    
    # Fully connected output layer
    model.add(Dense(units=1, activation='sigmoid'))
    
    # Compile the model
    model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])
    
    return model


# Define callbacks
early_stopping = EarlyStopping(
    monitor='loss',  # Metric to monitor (e.g., 'val_loss', 'val_accuracy')
    patience=3,          # Number of epochs with no improvement before stopping
    verbose=1,
    restore_best_weights=True  # Restore the weights of the best epoch
)

# Initialization phase
initialization_size = 2000
X_initial = X[:initialization_size]
y_initial = y[:initialization_size]

model = build_model((X_initial.shape[1], 3))  # Note the 2 for 2 features
model.fit(X_initial, y_initial, batch_size=32, epochs=20, verbose=1, callbacks=[early_stopping])

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.5492 - loss: 0.6909
Epoch 2/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5480 - loss: 0.6873
Epoch 3/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5511 - loss: 0.6830
Epoch 4/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5876 - loss: 0.6771
Epoch 5/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5989 - loss: 0.6712
Epoch 6/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5643 - loss: 0.6812
Epoch 7/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5675 - loss: 0.6751
Epoch 8/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5887 - loss: 0.6726
Epoch 9/20
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x7fab5071b3d0>

In [None]:
from tqdm import tqdm
import numpy as np

def directional_backtest(model, data, price_scaler, volume_scaler, diff_scaler, seq_length, batch_size=64):
    predictions = []
    actuals = []
    correct_predictions = 0
    total_predictions = 1

    X_buffer = []
    y_buffer = []

    with tqdm(total=len(data) - initialization_size  - 1, desc='Directional Backtesting') as pbar:
        for i in range(initialization_size , len(data) - 1):
            # Prepare sequences for price, volume, and price_diff
            current_sequence_price = data[i-seq_length:i, 0].reshape(-1, 1)
            current_sequence_volume = data[i-seq_length :i, 1].reshape(-1, 1)
            current_sequence_diff = data[i-seq_length :i, 2].reshape(-1, 1)

            # Scale features
            scaled_price_seq = price_scaler.transform(current_sequence_price)
            scaled_volume_seq = volume_scaler.transform(current_sequence_volume)
            scaled_diff_seq = diff_scaler.transform(current_sequence_diff)

            # Combine and reshape
            current_sequence = np.hstack((scaled_price_seq, scaled_volume_seq, scaled_diff_seq))
            current_sequence = np.reshape(current_sequence, (1, seq_length, 3))  # Adjust for 3 features

            # Make prediction
            predicted_value = model.predict(current_sequence, verbose=0)
            # print(predicted_value)
            up_threshold = 0.75
            down_threshold = 0.25
            
            predicted_up = predicted_value[0][0] >= up_threshold  # Binary decision based on threshold
            predicted_down = predicted_value[0][0] <= down_threshold 
            predicted_no = predicted_value[0][0] < up_threshold and predicted_value[0][0] > down_threshold
            
            # Get current and next actual prices
            current_price = data[i, 0]
            next_price = data[i + 1, 0]

            # Actual direction
            price_went_up = next_price > current_price

            if not predicted_no:
                # Check directional accuracy
                if price_went_up == predicted_up:
                    correct_predictions += 1
                total_predictions += 1

            current_accuracy = (correct_predictions / total_predictions) * 100
            pbar.set_postfix({'Accuracy': f'{current_accuracy:.2f}%'})
            pbar.update(1)

            # Update model with new data (price, volume, diff)
            X_buffer.append(current_sequence[0])
            y_new = 1 if next_price > current_price else 0  # Binary target
            y_buffer.append(y_new)

            if len(X_buffer) == batch_size:
                X_batch = np.array(X_buffer)
                y_batch = np.array(y_buffer)
                model.fit(X_batch, y_batch, epochs=20, batch_size=batch_size, verbose=0)
                X_buffer = []
                y_buffer = []

    # Final update for any remaining buffered data
    if X_buffer:
        X_batch = np.array(X_buffer)
        y_batch = np.array(y_buffer)
        model.fit(X_batch, y_batch, epochs=20, batch_size=len(X_buffer), verbose=0)

    final_accuracy = (correct_predictions / total_predictions) * 100

    # Print results
    print(f"\nDirectional Prediction Results:")
    print(f"Total Predictions: {total_predictions}")
    print(f"Correct Predictions: {correct_predictions}")
    print(f"Final Accuracy: {final_accuracy:.2f}%")
    print(f"\nBaseline Comparison:")
    print(f"Random Guessing Expected Accuracy: 50.00%")
    print(f"Model Improvement over Random: {(final_accuracy - 50):.2f}%")

    return final_accuracy

# Run the backtest
accuracy = directional_backtest(model, dataset, price_scaler, volume_scaler, diff_scaler, seq_length)


Directional Backtesting:  89%|███████████████████████████████████████████████████████████████████████████████████▍          | 2660/2998 [01:59<00:13, 25.61it/s, Accuracy=59.14%]