In [None]:
import pandas as pd
from utils.sp_scraper import scrape_sp500_symbols
import yfinance as yf
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
import tensorflow as tf
import optuna
from optuna.integration import KerasPruningCallback


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Download the full price history of every S&P 500 constituent and build one
# long training frame with standardised 'close' and 'price in 30 days' columns.

# Replace '.' with '-' in ticker symbols
sp_tickers = [ticker.replace(".", "-") for ticker in sorted(scrape_sp500_symbols())]
# Integer id per ticker, assigned in alphabetical order
ticker_encoding = {ticker: i for i, ticker in enumerate(sp_tickers)}

# Fitted scalers per ticker, kept so scaled values can be inverse-transformed later
scalers = {}
# Per-ticker frames are collected here and concatenated once after the loop;
# concatenating inside the loop re-copies every accumulated row on each iteration.
ticker_frames = []

for ticker in tqdm(sp_tickers, desc = "Downloading data", unit="ticker"):
    # Get max data for the ticker
    ticker_data = yf.Ticker(ticker).history(period="max")

    # Nothing came back for this symbol: skip it instead of failing on the
    # missing 'close' column further down.
    if ticker_data.empty:
        continue

    # Make date a column instead of index
    ticker_data = ticker_data.reset_index()

    # Make columns lowercase
    ticker_data.columns = ticker_data.columns.str.lower()

    # Target: the close price 30 rows (trading days) later
    ticker_data['price in 30 days'] = ticker_data['close'].shift(-30)

    # Drop NA rows (at least the last 30 rows, which have no target)
    ticker_data = ticker_data.dropna()

    # A history of 30 rows or fewer leaves nothing to fit a scaler on
    if ticker_data.empty:
        continue

    # Replace the ticker symbol with its integer id
    ticker_data['ticker'] = ticker_encoding[ticker]

    # NOTE(review): both scalers are fitted on the ticker's entire history, so
    # the statistics include rows that are later used for validation — confirm
    # this is acceptable for the intended evaluation.
    scaler_close = StandardScaler()
    scaler_future_price = StandardScaler()

    # Scale the close column (the scaler expects a 2D array, hence [[...]])
    ticker_data['close'] = scaler_close.fit_transform(
        ticker_data[['close']].to_numpy()
    )

    # Scale the prediction column with its own scaler
    ticker_data['price in 30 days'] = scaler_future_price.fit_transform(
        ticker_data[['price in 30 days']].to_numpy()
    )

    # Store the scalers for the ticker
    scalers[ticker] = {
        'scaler_close': scaler_close,
        'scaler_future_price': scaler_future_price
    }

    ticker_frames.append(ticker_data)

# Single concatenation of all per-ticker frames
data = pd.concat(ticker_frames, ignore_index=True) if ticker_frames else pd.DataFrame()

Downloading data: 100%|██████████| 503/503 [02:35<00:00,  3.24ticker/s]


In [3]:
# Prepare the data for the model

# Standardise the integer ticker id so it is on a scale comparable to the
# already-standardised price columns.
scaler_ticker = StandardScaler()

# The scaler needs a 2D (n_rows, 1) array, so select the column as a frame
stock_ticker = data[["ticker"]].to_numpy()
scaled_ticker = scaler_ticker.fit(stock_ticker).transform(stock_ticker)
data['ticker'] = scaled_ticker

# One group per (scaled) ticker id
grouped_dfs = data.groupby('ticker')


In [4]:
# Create a sliding window for our stock (60 days in past to predict 30 days in future)
# x_train collects (60, 2) arrays: column 0 = scaled close, column 1 = scaled ticker id.
# y_train collects the scaled 'price in 30 days' value for each window.
x_train, y_train = [], []
for ticker, df in tqdm(grouped_dfs, desc= "Creating sliding windows", unit="ticker"):
    # Sort a copy by date; sorting the groupby slice in place mutates a slice
    # of `data` and triggers pandas' SettingWithCopy behaviour.
    df = df.sort_values(by='date')

    # Pull the two needed columns out as numpy arrays once per ticker, instead
    # of building a pandas row/slice object on every loop iteration.
    close = df['close'].to_numpy()
    future_price = df['price in 30 days'].to_numpy()

    # The ticker value repeated for each timestep: shape (60, 1).
    # Safe to reuse across windows because np.hstack copies its inputs.
    ticker_feature = np.full((60, 1), ticker)

    # NOTE(review): rows without a 30-day-ahead price were already dropped when
    # `data` was built, so the extra `- 30` here appears to discard the last 30
    # usable windows per ticker — confirm before changing.
    # NOTE(review): the window covers rows i-60..i-1 while the target is read
    # from row i, i.e. 31 rows after the last window row — confirm intended.
    for i in range(60, len(df) - 30):
        # Past 60 close prices as a column vector: shape (60, 1)
        close_prices = close[i - 60:i].reshape(-1, 1)

        # Combine features: shape (60, 2)
        x_train.append(np.hstack((close_prices, ticker_feature)))

        # Target taken from row i
        y_train.append(future_price[i])

# Convert to numpy arrays; x_train is already 3D (samples, 60, 2) as the LSTM
# expects, so no further reshape is needed.
x_train, y_train = np.array(x_train), np.array(y_train)


Creating sliding windows: 100%|██████████| 503/503 [03:43<00:00,  2.25ticker/s]


In [None]:
# Define the objective function for Optuna
def objective(trial):
    """Train one candidate LSTM and return its best validation loss.

    Samples layer sizes, dropout rate, learning rate and optimizer from
    ``trial``, trains a two-layer LSTM on the module-level ``x_train`` /
    ``y_train`` for up to 10 epochs with 20% of the samples held out for
    validation, and reports ``val_loss`` to Optuna after each epoch so
    unpromising trials can be pruned.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to report
            intermediate values.

    Returns:
        The lowest ``val_loss`` (MAE) observed over the training epochs.
    """
    # Release graph/state left over from the previous trial so that memory
    # does not accumulate across the study.
    keras.backend.clear_session()

    # Suggest hyperparameters
    params = {
        "lstm_units_1": trial.suggest_int('lstm_units_1', 64, 512, step=64),
        "lstm_units_2": trial.suggest_int('lstm_units_2', 64, 512, step=64),
        "dense_units": trial.suggest_int('dense_units', 64, 256, step=64),
        "dropout_rate": trial.suggest_float('dropout_rate', 0.2, 0.7),
        "learning_rate": trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
        "optimizer": trial.suggest_categorical('optimizer', ['adam', 'rmsprop']),
    }

    # Build the model. The input shape is declared with an explicit Input
    # layer rather than `input_shape=` on the first LSTM, which Keras warns about.
    model = keras.models.Sequential()
    model.add(keras.Input(shape=(x_train.shape[1], x_train.shape[2])))
    model.add(keras.layers.LSTM(params["lstm_units_1"], return_sequences=True))
    model.add(keras.layers.LSTM(params["lstm_units_2"], return_sequences=False))
    model.add(keras.layers.Dense(params["dense_units"], activation="relu"))
    model.add(keras.layers.Dropout(params["dropout_rate"]))
    model.add(keras.layers.Dense(1))

    # Compile model
    if params["optimizer"] == 'adam':
        optimizer = keras.optimizers.Adam(learning_rate=params["learning_rate"])
    else:
        optimizer = keras.optimizers.RMSprop(learning_rate=params["learning_rate"])
    model.compile(optimizer=optimizer, loss="mae", metrics=[keras.metrics.RootMeanSquaredError()])

    # Pruning callback: reports val_loss to the trial each epoch and stops
    # the trial early when Optuna decides to prune it.
    pruning_callback = KerasPruningCallback(trial, 'val_loss')

    # Train the model exactly once. A second fit() call would double the
    # training time per trial and report the same epoch numbers to the pruner
    # a second time.
    history = model.fit(
        x_train, y_train,
        validation_split=0.2,
        epochs=10,
        batch_size=8,
        callbacks=[pruning_callback],
        verbose=1
    )

    return min(history.history['val_loss'])

# Run the hyperparameter search; the objective returns a validation loss,
# so lower is better.
search_direction = "minimize"
n_search_trials = 50

study = optuna.create_study(direction=search_direction)
study.optimize(objective, n_trials=n_search_trials)

# Parameters of the best trial, used to build the final model
best_hyperparams = study.best_params

[I 2025-04-04 00:17:51,164] A new study created in memory with name: no-name-79889b2b-b144-4d88-9619-f4dea7dab06a
2025-04-04 00:17:51.315206: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-04-04 00:17:51.315868: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-04-04 00:17:51.316439: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
I0000 00:00:1743740271.316743  126383 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1743740271.317113  126383 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
  super().__init__(**kwargs)


Epoch 1/10


2025-04-04 00:17:59.765522: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m250534/423532[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m53:37[0m 19ms/step - loss: 0.1341 - root_mean_squared_error: 0.2265

In [None]:
# Train the final model on the full training set using the best trial's
# hyperparameters, then save it to disk.

# Store best hyperparameters in a dictionary
best_hyperparams = study.best_params

# Build model. The feature dimension must come from x_train itself: each
# timestep carries two features (close price and ticker), so hard-coding 1
# here makes fit() fail with a shape mismatch.
model = keras.models.Sequential()
model.add(keras.Input(shape=(x_train.shape[1], x_train.shape[2])))
model.add(keras.layers.LSTM(best_hyperparams["lstm_units_1"], return_sequences=True))
model.add(keras.layers.LSTM(best_hyperparams["lstm_units_2"], return_sequences=False))
model.add(keras.layers.Dense(best_hyperparams["dense_units"], activation="relu"))
model.add(keras.layers.Dropout(best_hyperparams["dropout_rate"]))
model.add(keras.layers.Dense(1))

# Compile model
if best_hyperparams["optimizer"] == 'adam':
    optimizer = keras.optimizers.Adam(learning_rate=best_hyperparams["learning_rate"])
else:
    optimizer = keras.optimizers.RMSprop(learning_rate=best_hyperparams["learning_rate"])
model.compile(optimizer=optimizer, loss="mae", metrics=[keras.metrics.RootMeanSquaredError()])

# Stop once the training loss has not improved for 10 epochs and roll back to
# the best weights seen.
# NOTE(review): this monitors the training loss, not a validation loss, so it
# does not guard against overfitting — confirm that is intended.
early_stopping = keras.callbacks.EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)

# Train with a high epoch cap; early stopping decides the actual length
training = model.fit(x_train, y_train, epochs=1000, batch_size=8, callbacks=[early_stopping])

model.save("model1.keras")