In [3]:
import pandas as pd
from utils.sp_scraper import scrape_sp500_symbols
import yfinance as yf
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
import tensorflow as tf
import optuna
from optuna.integration import KerasPruningCallback
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, BatchNormalization, LeakyReLU, Dropout
import talib as ta
import time



In [None]:
# Replace '.' with '-' in ticker symbols
sp_tickers = [ticker.replace(".", "-") for ticker in sorted(scrape_sp500_symbols())]
ticker_encoding = {ticker: i for i, ticker in enumerate(sp_tickers)}
data = pd.DataFrame()
# Initialize scaler dictionaries to store scalers for each ticker
scalers = {}
for ticker in tqdm(sp_tickers, desc = "Downloading data", unit="ticker"):
    # Initialize scalers
    scaler_close = StandardScaler()
    scaler_future_price = StandardScaler()
    scaler_technical = StandardScaler()


    # Get max data for the ticker
    def get_ticker_data(ticker):
        # Get max data for the ticker
        ticker_data = yf.Ticker(ticker).history(period="1y")
        return ticker_data

    try:
        # Get ticker data
        ticker_data = get_ticker_data(ticker)
    except:
        time.sleep(10)
        ticker_data = get_ticker_data(ticker)

    # Make date a column instead of index
    ticker_data.reset_index(inplace=True)

    # Make columns lowercase
    ticker_data.columns = ticker_data.columns.str.lower()

    # Add a price in 30 days column
    ticker_data['price in 30 days'] = ticker_data['close'].shift(-30)

    # Assuming ticker_data is a DataFrame with 'Close' column
    ticker_data['return'] = ticker_data['close'].pct_change()  # Percentage change in close prices

    # Calculate RSI (Relative Strength Index)
    ticker_data['rsi'] = ta.RSI(ticker_data['close'], timeperiod=14)

    # Calculate MACD (Moving Average Convergence Divergence)
    macd, macdsignal, macdhist = ta.MACD(ticker_data['close'], fastperiod=12, slowperiod=26, signalperiod=9)
    ticker_data['macd'] = macd  # MACD line

    # Calculate SMA (Simple Moving Average) for 10 and 30 periods
    ticker_data['sma_10'] = ta.SMA(ticker_data['close'], timeperiod=10)
    ticker_data['sma_30'] = ta.SMA(ticker_data['close'], timeperiod=30)

    # Select the relevant technical columns
    stock_technicals = ticker_data[['return', 'rsi', 'macd', 'sma_10', 'sma_30']]

    # Convert to numpy array
    stock_technicals = stock_technicals.values
    # Scale the data
    scaled_technicals = scaler_technical.fit_transform(stock_technicals)
    # Insert scaled data into the original dataframe
    ticker_data['return'] = scaled_technicals[:, 0]
    ticker_data['rsi'] = scaled_technicals[:, 1]
    ticker_data['macd'] = scaled_technicals[:, 2]
    ticker_data['sma_10'] = scaled_technicals[:, 3]
    ticker_data['sma_30'] = scaled_technicals[:, 4]

    # Replace a ticker column
    ticker_data['ticker'] = ticker_encoding[ticker]

    # Scale close column
    stock_close = ticker_data.filter(["close"])
    # Convert to numpy array
    stock_close = stock_close.values
    # Scale the data
    scaled_close = scaler_close.fit_transform(stock_close)
    # Insert scaled data into the original dataframe
    ticker_data['close'] = scaled_close

    # Scale prediction column
    stock_price_in_30_days = ticker_data.filter(["price in 30 days"])
    # Convert to numpy array
    stock_price_in_30_days = stock_price_in_30_days.values
    # Scale the data
    scaled_price_in_30_days = scaler_future_price.fit_transform(stock_price_in_30_days)
    # Insert scaled data into the original dataframe
    ticker_data['price in 30 days'] = scaled_price_in_30_days

    # Store the scalers for the ticker
    scalers[ticker] = {
        'scaler_close': scaler_close,
        'scaler_future_price': scaler_future_price,
        'scaler_technical': scaler_technical
    }

    # Concat the ticker data with the main data
    data = pd.concat([data, ticker_data], ignore_index=True)

data.dropna(inplace=True)

Downloading data:   0%|          | 0/503 [00:01<?, ?ticker/s]


KeyError: 'close'

In [None]:
# Prepare the data for the model

# Initialize scalers
scaler_ticker = StandardScaler()

# Scale the ticker column
stock_ticker = data.filter(["ticker"])
stock_ticker = stock_ticker.values
scaled_ticker = scaler_ticker.fit_transform(stock_ticker)
data['ticker'] = scaled_ticker

# Group the data by ticker
grouped_dfs = data.groupby('ticker')


In [None]:
# List the features you want to include (excluding 'price in 30 days' and 'date')
feature_cols = ['close', 'rsi', 'macd', 'sma_10', 'sma_30', 'ticker']  # add your actual column names here

x_train, y_train = [], []

for ticker, df in tqdm(grouped_dfs, desc="Creating sliding windows", unit="ticker"):
    df.sort_values(by='date', inplace=True)
    
    # Ensure no NaNs (especially if you used rolling indicators like SMA, RSI)
    df = df.dropna(subset=feature_cols + ['price in 30 days'])

    for i in range(60, len(df) - 30):
        # Extract a sliding window of all desired features
        window = df.iloc[i - 60:i][feature_cols].values  # shape (60, num_features)

        # Optional: add ticker as a numeric value if it's useful
        # ticker_id = your_ticker_encoding[ticker]  # if you're using one-hot or label encoding
        # ticker_column = np.full((60, 1), ticker_id)
        # window = np.hstack((window, ticker_column))

        x_train.append(window)

        # Predict the "price in 30 days" from the current i-th index (i.e. day 60 of the window)
        y_train.append(df.iloc[i]['price in 30 days'])

# Convert to numpy arrays
x_train, y_train = np.array(x_train), np.array(y_train)

Creating sliding windows: 100%|██████████| 503/503 [00:03<00:00, 140.14ticker/s]


In [None]:
# Build the Model
model = keras.models.Sequential()

model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(x_train.shape[1], x_train.shape[2])))
model.add(Dropout(0.3))

model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dropout(0.3))

model.add(Dense(128, kernel_regularizer=regularizers.l2(0.001)))
model.add(BatchNormalization())
model.add(LeakyReLU(alpha=0.1))
model.add(Dropout(0.3))

model.add(Dense(64, kernel_regularizer=regularizers.l2(0.001)))
model.add(LeakyReLU(alpha=0.1))
model.add(Dropout(0.3))

model.add(Dense(1))

model.summary()
model.compile(optimizer="adam",
              loss=keras.losses.Huber(),
              metrics=[keras.metrics.RootMeanSquaredError()])


lr_schedule = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', 
                                                factor=0.5, 
                                                patience=3, 
                                                verbose=1)

early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', 
                                           patience=10, 
                                           restore_best_weights=True)

training = model.fit(
    x_train, y_train,
    epochs=200,                # Max number of epochs
    batch_size=8,
    validation_split=0.1,      # Use part of training data for validation
    callbacks=[early_stopping]
)
model.save("model1.keras")

2025-04-07 00:21:00.056478: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-04-07 00:21:00.056546: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-04-07 00:21:00.056567: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
I0000 00:00:1743999660.056970  148051 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1743999660.057173  148051 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
  super().__init__(**kwargs)


Epoch 1/200


2025-04-07 00:21:00.783959: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m7406/7406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m365s[0m 49ms/step - loss: 0.7568 - root_mean_squared_error: 0.8119 - val_loss: 0.6326 - val_root_mean_squared_error: 0.7992
Epoch 2/200
[1m7406/7406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 49ms/step - loss: 0.6096 - root_mean_squared_error: 0.7646 - val_loss: 0.6320 - val_root_mean_squared_error: 0.7896
Epoch 3/200
[1m7406/7406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m360s[0m 49ms/step - loss: 0.5945 - root_mean_squared_error: 0.7503 - val_loss: 0.6243 - val_root_mean_squared_error: 0.7966
Epoch 4/200
[1m7406/7406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m360s[0m 49ms/step - loss: 0.5783 - root_mean_squared_error: 0.7321 - val_loss: 0.6011 - val_root_mean_squared_error: 0.7597
Epoch 5/200
[1m7406/7406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 49ms/step - loss: 0.5632 - root_mean_squared_error: 0.7168 - val_loss: 0.6090 - val_root_mean_squared_error: 0.7801
Epoch 6/200
[1m7406/7