In [1]:
import pandas as pd
from models.utils.sp_scraper import SPScraper
import yfinance as yf
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, BatchNormalization, LeakyReLU, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
import talib as ta
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import joblib
import os


In [2]:
# Build the ticker universe: every scraped S&P 500 symbol with '.' swapped for
# '-', followed by the "^GSPC" symbol, which is appended as a benchmark.
scraper = SPScraper()

sp_tickers = [
    *(symbol.replace(".", "-") for symbol in sorted(scraper.scrape_sp500_symbols().index)),
    "^GSPC",
]

# Filled in by the download loop: fitted per-ticker scalers keyed by ticker,
# and one processed DataFrame per successfully downloaded ticker.
scalers = dict()
data_frames = list()

def process_ticker(ticker, max_retries=5, retry_delay=10):
    """Download one year of history for ``ticker`` and derive model features.

    Adds the following columns to the downloaded frame: ``ticker``,
    ``log_return_30d`` (log of the close 30 rows ahead over the current
    close), ``rsi``, ``macd``, ``sma_10``, ``sma_30`` and per-ticker
    standardised ``scaled_close``, ``scaled_sma_10``, ``scaled_sma_30``.

    Args:
        ticker: Symbol passed to ``yf.Ticker``.
        max_retries: Maximum number of download attempts before giving up.
        retry_delay: Seconds to sleep between failed download attempts.

    Returns:
        ``(ticker_data, (ticker, ticker_scalers))`` on success, where
        ``ticker_scalers`` maps ``'scaler_close'``, ``'scaler_sma_10'`` and
        ``'scaler_sma_30'`` to the fitted ``StandardScaler`` objects.
        ``(None, None)`` if the download returned no rows or any step failed;
        failures are reported with ``print`` rather than raised.
    """
    try:
        # Bounded retry: an unbounded loop would block this worker thread
        # forever on a persistent failure (e.g. a delisted symbol).
        ticker_data = None
        for attempt in range(1, max_retries + 1):
            try:
                ticker_data = yf.Ticker(ticker).history(period="1y")
                break
            except Exception:
                if attempt == max_retries:
                    raise  # handled by the outer except below
                time.sleep(retry_delay)

        if ticker_data is None or ticker_data.empty:
            return None, None

        # Move the index into a regular column and normalise column names.
        ticker_data = ticker_data.reset_index()
        ticker_data.columns = ticker_data.columns.str.lower()
        ticker_data['ticker'] = ticker

        close = ticker_data['close']

        # Target: log return between the current row and the row 30 ahead.
        # The last 30 rows therefore have a NaN target.
        ticker_data['log_return_30d'] = np.log(close.shift(-30) / close)

        # Technical indicators; only the MACD line itself is kept.
        ticker_data['rsi'] = ta.RSI(close, timeperiod=14)
        macd, _, _ = ta.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
        ticker_data['macd'] = macd
        ticker_data['sma_10'] = ta.SMA(close, timeperiod=10)
        ticker_data['sma_30'] = ta.SMA(close, timeperiod=30)

        # Price-level columns are standardised per ticker; each fitted scaler
        # is returned so the caller can persist it.
        ticker_scalers = {}
        for column in ('close', 'sma_10', 'sma_30'):
            scaler = StandardScaler()
            scaled = scaler.fit_transform(ticker_data[[column]].values)
            ticker_data[f'scaled_{column}'] = scaled.ravel()
            ticker_scalers[f'scaler_{column}'] = scaler

        return ticker_data, (ticker, ticker_scalers)

    except Exception as e:
        print(f"Failed to process {ticker}: {e}")
        return None, None

# Use multithreading for I/O-bound operations like data download
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future back to its ticker so failures can be reported by name.
    futures = {executor.submit(process_ticker, ticker): ticker for ticker in sp_tickers}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading data", unit="ticker"):
        result_data, result_scalers = future.result()
        if result_data is None or result_scalers is None:
            # process_ticker signals failure with (None, None); report the
            # ticker itself rather than that tuple.
            print(f"Failed to process {futures[future]}")
            continue
        data_frames.append(result_data)
        ticker, scaler_dict = result_scalers
        scalers[ticker] = scaler_dict

# Persist every per-ticker scaler under scalers/<ticker>/<scaler_name>.pkl
for ticker, scaler_dict in scalers.items():
    ticker_dir = os.path.join('scalers', ticker)
    os.makedirs(ticker_dir, exist_ok=True)

    for scaler_name, scaler in scaler_dict.items():
        joblib.dump(scaler, os.path.join(ticker_dir, f'{scaler_name}.pkl'))

# Combine all dataframes, then drop every row that has a NaN in any column
# (e.g. indicator warm-up rows and rows without a 30-row-ahead target).
if not data_frames:
    raise ValueError("No ticker data was downloaded; nothing to concatenate")
data = pd.concat(data_frames, ignore_index=True).dropna()

Downloading data: 100%|██████████| 504/504 [00:06<00:00, 80.54ticker/s]


In [3]:
# Prepare the data for the model
# Label encode the ticker column
# NOTE(review): label_encoder is not persisted below, unlike the scalers —
# confirm that inference can rebuild the same ticker -> integer mapping.
label_encoder = LabelEncoder()
data["encoded_ticker"] = label_encoder.fit_transform(data["ticker"])

# Initialize scalers (these are fitted across all tickers combined)
scaler_ticker = StandardScaler()
scaler_technical = StandardScaler()
scaler_future_price = StandardScaler()

# Scale the training target (30-row-ahead log return)
log_return_vals = data[['log_return_30d']].values
data['scaled_log_return_30d'] = scaler_future_price.fit_transform(log_return_vals)

# Scale the encoded ticker column
stock_ticker = data[["encoded_ticker"]].values
scaled_ticker = scaler_ticker.fit_transform(stock_ticker)
data['scaled_ticker'] = scaled_ticker

# Scale technical columns.
# Select exactly the two columns that are read back by position below. The
# previous DataFrame.filter call also listed "return", a column no visible
# code creates; filter() silently skips missing labels, so the positional
# mapping (0 -> rsi, 1 -> macd) only held by accident.
stock_technical = data[["rsi", "macd"]].values
scaled_technicals = scaler_technical.fit_transform(stock_technical)
# Insert scaled data into the original dataframe
data['scaled_rsi'] = scaled_technicals[:, 0]
data['scaled_macd'] = scaled_technicals[:, 1]

# Group the data by ticker, each group sorted chronologically with a fresh index
grouped_dfs = {
    ticker: df.sort_values(by='date').reset_index(drop=True)
    for ticker, df in data.groupby('ticker')
}

# Save the globally fitted scalers
joblib.dump(scaler_ticker, "scaler_ticker.pkl")
joblib.dump(scaler_technical, "scaler_technical.pkl")
joblib.dump(scaler_future_price, "scaler_future_price.pkl")

['scaler_future_price.pkl']

In [4]:
# Features fed to the model for each time step.
# scaled_close and the scaled SMAs are scaled per ticker; the rest are scaled
# across the whole dataframe.
feature_cols = ['scaled_close', 'scaled_rsi', 'scaled_macd', 'scaled_sma_10', 'scaled_sma_30', 'scaled_ticker']

WINDOW_SIZE = 60  # rows of history in each input window
HORIZON = 30      # rows trimmed from the end of each ticker's frame

x_train, y_train = [], []

for ticker, df in tqdm(grouped_dfs.items(), desc="Creating sliding windows", unit="ticker"):

    # Need at least one full window plus the trimmed tail.
    if len(df) < WINDOW_SIZE + HORIZON + 1:
        continue

    # Convert once per ticker; slicing NumPy arrays in the loop is far cheaper
    # than slicing the DataFrame for every window.
    features = df[feature_cols].values
    targets = df['scaled_log_return_30d'].values

    # NOTE(review): rows with a NaN target appear to have been removed already
    # by the earlier dropna, so stopping HORIZON rows early may be discarding
    # usable samples — confirm before changing.
    for i in range(WINDOW_SIZE, len(df) - HORIZON):
        # Input: the WINDOW_SIZE rows immediately before row i,
        # shape (WINDOW_SIZE, num_features).
        x_train.append(features[i - WINDOW_SIZE:i])

        # Target: the scaled 30-row-ahead log return at row i.
        y_train.append(targets[i])

# Convert to numpy arrays
x_train, y_train = np.array(x_train), np.array(y_train)

Creating sliding windows: 100%|██████████| 504/504 [00:07<00:00, 67.36ticker/s]


In [5]:
# Model: Conv1D front-end -> stacked bidirectional LSTMs -> dense regression head.
model = Sequential()

# Declare the input shape with an explicit Input layer instead of passing
# `input_shape=` to the first layer, which Keras warns about in Sequential models.
model.add(keras.Input(shape=(x_train.shape[1], x_train.shape[2])))

# Convolutional layers for local pattern extraction
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Stacked Bidirectional LSTM for capturing sequence relationships
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.3))

# Dense layers for final nonlinear transformation
model.add(Dense(128, kernel_regularizer=regularizers.l2(0.001)))
model.add(BatchNormalization())
model.add(LeakyReLU(alpha=0.1))
model.add(Dropout(0.4))  # Slightly increased dropout to reduce overfitting

model.add(Dense(64, kernel_regularizer=regularizers.l2(0.001)))
model.add(LeakyReLU(alpha=0.1))
model.add(Dropout(0.3))

model.add(Dense(1))  # Output log return prediction

model.summary()
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0),
    loss=keras.losses.Huber(delta=1.0),  # Huber = better for stability on noisy targets
    metrics=[keras.metrics.RootMeanSquaredError()]
)

# Halve the learning rate after 3 epochs without val_loss improvement.
lr_schedule = keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                factor=0.5,
                                                patience=3,
                                                verbose=1)

# Stop after 10 epochs without val_loss improvement and restore the best weights.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                               patience=10,
                                               restore_best_weights=True)

# NOTE(review): validation_split holds out the *last* 10% of the arrays before
# shuffling; x_train is built ticker by ticker, so the validation set consists
# of the final tickers only — confirm this is the intended split.
training = model.fit(
    x_train, y_train,
    epochs=200,                # Max number of epochs
    batch_size=32,
    validation_split=0.1,      # Use part of training data for validation
    # lr_schedule was previously created but never passed in, so it had no effect.
    callbacks=[lr_schedule, early_stopping]
)
model.save("model3.keras")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-04-13 20:26:17.477082: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-04-13 20:26:17.477273: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-04-13 20:26:17.477585: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
I0000 00:00:1744590377.477886  285574 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1744590377.478282  285574 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


2025-04-13 20:26:18.739308: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m1388/1388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 91ms/step - loss: 0.4733 - root_mean_squared_error: 0.8600 - val_loss: 0.2917 - val_root_mean_squared_error: 0.7508
