In [1]:
import pandas as pd
from utils.sp_scraper import scrape_sp500_symbols
import yfinance as yf
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
import tensorflow as tf
import optuna
from optuna.integration import KerasPruningCallback
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, BatchNormalization, LeakyReLU, Dropout, Input
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Replace '.' with '-' in ticker symbols
sp_tickers = [ticker.replace(".", "-") for ticker in sorted(scrape_sp500_symbols())]
ticker_encoding = {ticker: i for i, ticker in enumerate(sp_tickers)}
data = pd.DataFrame()
# Initialize scaler dictionaries to store scalers for each ticker
scalers = {}
for ticker in tqdm(sp_tickers, desc = "Downloading data", unit="ticker"):
    # Initialize scalers
    scaler_close = StandardScaler()
    scaler_future_price = StandardScaler()

    def get_ticker_data(ticker):
        # Get max data for the ticker
        ticker_data = yf.Ticker(ticker).history(period="1y")
        return ticker_data

    try:
        # Get ticker data
        ticker_data = get_ticker_data(ticker)
    except:
        time.sleep(10)
        ticker_data = get_ticker_data(ticker)

    # Make data a column instead of index
    ticker_data.reset_index(inplace=True)

    # Make columns lowercase
    ticker_data.columns = ticker_data.columns.str.lower()

    # Add a price in 30 days column
    ticker_data['price in 30 days'] = ticker_data['close'].shift(-30)

    # Drop NA rows (last 30 days)
    ticker_data.dropna(inplace=True)

    # Replace a ticker column
    ticker_data['ticker'] = ticker_encoding[ticker]

    # Scale close column
    stock_close = ticker_data.filter(["close"])
    # Convert to numpy array
    stock_close = stock_close.values
    # Scale the data
    scaled_close = scaler_close.fit_transform(stock_close)
    # Insert scaled data into the original dataframe
    ticker_data['close'] = scaled_close

    # Scale prediction column
    stock_price_in_30_days = ticker_data.filter(["price in 30 days"])
    # Convert to numpy array
    stock_price_in_30_days = stock_price_in_30_days.values
    # Scale the data
    scaled_price_in_30_days = scaler_future_price.fit_transform(stock_price_in_30_days)
    # Insert scaled data into the original dataframe
    ticker_data['price in 30 days'] = scaled_price_in_30_days

    # Store the scalers for the ticker
    scalers[ticker] = {
        'scaler_close': scaler_close,
        'scaler_future_price': scaler_future_price
    }

    # Concat the ticker data with the main data
    data = pd.concat([data, ticker_data], ignore_index=True)

Downloading data: 100%|██████████| 503/503 [01:10<00:00,  7.10ticker/s]


In [3]:
# Prepare the data for the model

# Initialize scalers
scaler_ticker = StandardScaler()

# Scale the ticker column
stock_ticker = data.filter(["ticker"])
stock_ticker = stock_ticker.values
scaled_ticker = scaler_ticker.fit_transform(stock_ticker)
data['ticker'] = scaled_ticker

# Group the data by ticker
grouped_dfs = data.groupby('ticker')


In [4]:
# Create a sliding window for our stock (60 days in past to predict 30 days in future)
x_train, y_train = [], []
for ticker, df in tqdm(grouped_dfs, desc= "Creating sliding windows", unit="ticker"):
    # Sort df by date
    df.sort_values(by='date', inplace=True)

    # Loop through the DataFrame to create sliding windows
    for i in range(60, len(df) - 30):

        # Get the past 60 close prices
        close_prices = df.iloc[i - 60:i]['close'].values.reshape(-1, 1)  # shape (60, 1)

        # Repeat the ticker value for each timestep
        ticker_feature = np.full((60, 1), ticker)  # shape (60, 1)

        # Combine features: shape will be (60, 2)
        features = np.hstack((close_prices, ticker_feature))

        # Append the full (60, 2) array to x_train
        x_train.append(features)

        # Append the 'price in 30 days' value at the 60th row to y_train
        y_train.append(df.iloc[i]['price in 30 days'])

# Convert x_train and y_train to numpy arrays
x_train, y_train = np.array(x_train), np.array(y_train)

# Reshape x_train to be 3D for LSTM input
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], x_train.shape[2]))


Creating sliding windows: 100%|██████████| 503/503 [00:03<00:00, 132.95ticker/s]


In [5]:
# Build the Model
model = keras.models.Sequential()

model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(x_train.shape[1], x_train.shape[2])))
model.add(Dropout(0.3))

model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dropout(0.3))

model.add(Dense(128, kernel_regularizer=regularizers.l2(0.001)))
model.add(BatchNormalization())
model.add(LeakyReLU(alpha=0.1))
model.add(Dropout(0.3))

model.add(Dense(64, kernel_regularizer=regularizers.l2(0.001)))
model.add(LeakyReLU(alpha=0.1))
model.add(Dropout(0.3))

model.add(Dense(1))

model.summary()
model.compile(optimizer="adam",
              loss="mae",
              metrics=[keras.metrics.RootMeanSquaredError()])


early_stopping = EarlyStopping(
    monitor='val_loss',        # You can also use 'val_root_mean_squared_error' if you're tracking that
    patience=5,                # Wait for 5 epochs with no improvement
    restore_best_weights=True # Revert to the best weights after stopping
)

training = model.fit(
    x_train, y_train,
    epochs=200,                # Max number of epochs
    batch_size=8,
    validation_split=0.1,      # Use part of training data for validation
    callbacks=[early_stopping]
)
model.save("model1.keras")

2025-04-07 17:36:31.495580: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-04-07 17:36:31.495737: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-04-07 17:36:31.495746: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
I0000 00:00:1744061791.496173   11630 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1744061791.496219   11630 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
  super().__init__(**kwargs)


Epoch 1/200


2025-04-07 17:36:32.453519: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m7406/7406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m476s[0m 64ms/step - loss: 0.8191 - root_mean_squared_error: 0.8559 - val_loss: 0.6422 - val_root_mean_squared_error: 0.7867
Epoch 2/200
[1m7406/7406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m553s[0m 75ms/step - loss: 0.6385 - root_mean_squared_error: 0.7796 - val_loss: 0.6327 - val_root_mean_squared_error: 0.7808
Epoch 3/200
[1m7406/7406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m516s[0m 70ms/step - loss: 0.6262 - root_mean_squared_error: 0.7687 - val_loss: 0.6426 - val_root_mean_squared_error: 0.8028
Epoch 4/200
[1m7406/7406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m484s[0m 65ms/step - loss: 0.6305 - root_mean_squared_error: 0.7773 - val_loss: 0.6408 - val_root_mean_squared_error: 0.8085
Epoch 5/200
[1m7406/7406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m465s[0m 63ms/step - loss: 0.6205 - root_mean_squared_error: 0.7655 - val_loss: 0.6297 - val_root_mean_squared_error: 0.7842
Epoch 6/200
[1m7406/7