In [10]:
import pandas as pd
from utils.sp_scraper import scrape_sp500_symbols
import yfinance as yf
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
import tensorflow as tf
import optuna
from optuna.integration import KerasPruningCallback
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, BatchNormalization, LeakyReLU, Dropout
import talib as ta
import time
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Bidirectional, LSTM, GRU
from tensorflow.keras.layers import Dense, BatchNormalization, LeakyReLU, Dropout
from tensorflow.keras import regularizers


In [2]:
# Replace '.' with '-' in ticker symbols
sp_tickers = [ticker.replace(".", "-") for ticker in sorted(scrape_sp500_symbols())]
ticker_encoding = {ticker: i for i, ticker in enumerate(sp_tickers)}
data = pd.DataFrame()
# Initialize scaler dictionaries to store scalers for each ticker
scalers = {}
for ticker in tqdm(sp_tickers, desc = "Downloading data", unit="ticker"):
    # Initialize scalers
    scaler_close = StandardScaler()
    scaler_future_price = StandardScaler()
    scaler_technical = StandardScaler()


    # Get max data for the ticker
    def get_ticker_data(ticker):
        # Get max data for the ticker
        ticker_data = yf.Ticker(ticker).history(period="1y")
        return ticker_data

    try:
        # Get ticker data
        ticker_data = get_ticker_data(ticker)
    except:
        time.sleep(10)
        ticker_data = get_ticker_data(ticker)

    # Make date a column instead of index
    ticker_data.reset_index(inplace=True)

    # Make columns lowercase
    ticker_data.columns = ticker_data.columns.str.lower()

    # Add a price return in 30 days column
    ticker_data['log_return_30d'] = np.log(ticker_data['close'].shift(-30) / ticker_data['close'])

    # Assuming ticker_data is a DataFrame with 'Close' column
    ticker_data['return'] = ticker_data['close'].pct_change()  # Percentage change in close prices

    # Calculate RSI (Relative Strength Index)
    ticker_data['rsi'] = ta.RSI(ticker_data['close'], timeperiod=14)

    # Calculate MACD (Moving Average Convergence Divergence)
    macd, macdsignal, macdhist = ta.MACD(ticker_data['close'], fastperiod=12, slowperiod=26, signalperiod=9)
    ticker_data['macd'] = macd  # MACD line

    # Calculate SMA (Simple Moving Average) for 10 and 30 periods
    ticker_data['sma_10'] = ta.SMA(ticker_data['close'], timeperiod=10)
    ticker_data['sma_30'] = ta.SMA(ticker_data['close'], timeperiod=30)

    # Select the relevant technical columns
    stock_technicals = ticker_data[['return', 'rsi', 'macd', 'sma_10', 'sma_30']]

    # Convert to numpy array
    stock_technicals = stock_technicals.values
    # Scale the data
    scaled_technicals = scaler_technical.fit_transform(stock_technicals)
    # Insert scaled data into the original dataframe
    ticker_data['return'] = scaled_technicals[:, 0]
    ticker_data['rsi'] = scaled_technicals[:, 1]
    ticker_data['macd'] = scaled_technicals[:, 2]
    ticker_data['sma_10'] = scaled_technicals[:, 3]
    ticker_data['sma_30'] = scaled_technicals[:, 4]

    # Replace a ticker column
    ticker_data['ticker'] = ticker_encoding[ticker]

    # Scale close column
    stock_close = ticker_data.filter(["close"])
    # Convert to numpy array
    stock_close = stock_close.values
    # Scale the data
    scaled_close = scaler_close.fit_transform(stock_close)
    # Insert scaled data into the original dataframe
    ticker_data['close'] = scaled_close

    stock_log_return_30d = ticker_data.filter(["log_return_30d"])
    stock_log_return_30d = stock_log_return_30d.values
    scaled_log_return_30d = scaler_future_price.fit_transform(stock_log_return_30d)
    ticker_data['log_return_30d'] = scaled_log_return_30d

    # Store the scalers for the ticker
    scalers[ticker] = {
        'scaler_close': scaler_close,
        'scaler_future_price': scaler_future_price,
        'scaler_technical': scaler_technical
    }

    # Concat the ticker data with the main data
    data = pd.concat([data, ticker_data], ignore_index=True)

data.dropna(inplace=True)

Downloading data: 100%|██████████| 503/503 [03:34<00:00,  2.34ticker/s]


In [3]:
# Prepare the data for the model

# Initialize scalers
scaler_ticker = StandardScaler()

# Scale the ticker column
stock_ticker = data.filter(["ticker"])
stock_ticker = stock_ticker.values
scaled_ticker = scaler_ticker.fit_transform(stock_ticker)
data['ticker'] = scaled_ticker

# Group the data by ticker
grouped_dfs = data.groupby('ticker')


In [4]:
# List the features you want to include (excluding 'price in 30 days' and 'date')
feature_cols = ['close', 'rsi', 'macd', 'sma_10', 'sma_30', 'ticker']  # add your actual column names here

x_train, y_train = [], []

for ticker, df in tqdm(grouped_dfs, desc="Creating sliding windows", unit="ticker"):
    df.sort_values(by='date', inplace=True)
    
    # Ensure no NaNs (especially if you used rolling indicators like SMA, RSI)
    df = df.dropna(subset=feature_cols + ['log_return_30d'])

    for i in range(60, len(df) - 30):
        # Extract a sliding window of all desired features
        window = df.iloc[i - 60:i][feature_cols].values  # shape (60, num_features)

        # Optional: add ticker as a numeric value if it's useful
        # ticker_id = your_ticker_encoding[ticker]  # if you're using one-hot or label encoding
        # ticker_column = np.full((60, 1), ticker_id)
        # window = np.hstack((window, ticker_column))

        x_train.append(window)

        # Predict the "price in 30 days" from the current i-th index (i.e. day 60 of the window)
        y_train.append(df.iloc[i]['log_return_30d'])
        
# Convert to numpy arrays
x_train, y_train = np.array(x_train), np.array(y_train)

Creating sliding windows: 100%|██████████| 503/503 [00:07<00:00, 70.34ticker/s]


In [None]:
model = Sequential()

# Convolutional layers for local pattern extraction
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu', input_shape=(x_train.shape[1], x_train.shape[2])))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Stacked Bidirectional LSTM for capturing sequence relationships
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.3))

# Dense layers for final nonlinear transformation
model.add(Dense(128, kernel_regularizer=regularizers.l2(0.001)))
model.add(BatchNormalization())
model.add(LeakyReLU(alpha=0.1))
model.add(Dropout(0.4))  # Slightly increased dropout to reduce overfitting

model.add(Dense(64, kernel_regularizer=regularizers.l2(0.001)))
model.add(LeakyReLU(alpha=0.1))
model.add(Dropout(0.3))

model.add(Dense(1))  # Output log return prediction

model.summary()
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0),
    loss=keras.losses.Huber(delta=1.0),  # Huber = better for stability on noisy targets
    metrics=[keras.metrics.RootMeanSquaredError()]
)


lr_schedule = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', 
                                                factor=0.5, 
                                                patience=3, 
                                                verbose=1)

early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', 
                                           patience=10, 
                                           restore_best_weights=True)

training = model.fit(
    x_train, y_train,
    epochs=200,                # Max number of epochs
    batch_size=32,
    validation_split=0.1,      # Use part of training data for validation
    callbacks=[early_stopping]
)
model.save("model3.keras")

Epoch 1/200
[1m1385/1385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 90ms/step - loss: 0.5412 - root_mean_squared_error: 0.9220 - val_loss: 0.3571 - val_root_mean_squared_error: 0.7985
Epoch 2/200
[1m1385/1385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 90ms/step - loss: 0.2967 - root_mean_squared_error: 0.7292 - val_loss: 0.2696 - val_root_mean_squared_error: 0.7313
Epoch 3/200
[1m 247/1385[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m1:37[0m 86ms/step - loss: 0.2365 - root_mean_squared_error: 0.6827