In [None]:
# Environment setup: install the Python packages this notebook imports and
# hand-install TA-Lib (C library + Python bindings) from conda-forge archives.
# NOTE(review): nothing here is version-pinned via pip, and the TA-Lib steps
# hard-code linux-64 and Python 3.11 paths — presumably a Colab runtime;
# confirm before running anywhere else.
!pip install yfinance
!pip install tensorflow
# Fetch the libta-lib 0.4.0 archive and unpack its `lib` directory (leading
# path component stripped) straight into the system library directory.
url = 'https://anaconda.org/conda-forge/libta-lib/0.4.0/download/linux-64/libta-lib-0.4.0-h166bdaf_1.tar.bz2'
!curl -L $url | tar xj -C /usr/lib/x86_64-linux-gnu/ lib --strip-components=1
# conda-package-handling is installed for the `cph` command used below to
# extract the downloaded `.conda` archive.
!pip install conda-package-handling
!wget https://anaconda.org/conda-forge/ta-lib/0.5.1/download/linux-64/ta-lib-0.5.1-py311h9ecbd09_0.conda
!cph x ta-lib-0.5.1-py311h9ecbd09_0.conda
# Move the extracted `talib` package into the Python 3.11 dist-packages
# directory so `import talib` resolves.
!mv ./ta-lib-0.5.1-py311h9ecbd09_0/lib/python3.11/site-packages/talib /usr/local/lib/python3.11/dist-packages/

In [None]:
import tensorflow as tf
# Stop the notebook early unless TensorFlow reports the first GPU device;
# when it does, announce which device was found.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

In [None]:
import pandas as pd
from sp_scraper import SPScraper

import yfinance as yf
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, BatchNormalization, LeakyReLU, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
import talib as ta
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import joblib
import os
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm.notebook import tqdm
import multiprocessing
import json
import talib

In [None]:
# Build the download universe: every scraped S&P 500 symbol (sorted) with '.'
# replaced by '-', followed by "^GSPC" as the benchmark entry. Note that the
# benchmark symbol appended here is "^GSPC", not the SPY ticker.
scraper = SPScraper()

sp_tickers = [
    *(symbol.replace(".", "-") for symbol in sorted(scraper.scrape_sp500_symbols().index)),
    "^GSPC",
]

# Filled in by the download step: fitted scalers keyed by ticker, and the
# per-ticker frames that are concatenated afterwards.
scalers, data_frames = {}, []

def process_ticker(ticker, max_retries=5, retry_delay=10):
    """Download five years of history for one ticker and add scaled indicators.

    The price history is fetched through ``yf.Ticker(ticker).history``. MACD,
    SMA(10), SMA(30), RSI(14), the CDL2CROWS pattern and a forward log return
    (close 30 rows ahead divided by the current close) are added as columns.
    Close, MACD and both SMAs are standardised with scalers fitted on this
    ticker only; RSI, CDL2CROWS and the log return are left unscaled here.

    Args:
        ticker: Symbol to download.
        max_retries: Maximum number of download attempts (at least one is
            always made). Bounded so a permanently failing symbol cannot
            block a worker forever.
        retry_delay: Seconds to sleep between failed download attempts.

    Returns:
        ``(ticker_data, (ticker, ticker_scalers))`` on success, where
        ``ticker_scalers`` maps scaler names to the fitted scalers.
        ``(None, None)`` when the download fails, the history is empty, or
        indicator computation raises. A 2-tuple is always returned so callers
        can unpack the result unconditionally.
    """
    try:
        # Bounded retry loop around the network call.
        attempts = max(1, max_retries)
        for attempt in range(1, attempts + 1):
            try:
                ticker_data = yf.Ticker(ticker).history(period="5y")
                break
            except Exception as download_error:
                if attempt == attempts:
                    print(f"Giving up on {ticker} after {attempts} attempts: {download_error}")
                    return None, None
                time.sleep(retry_delay)

        if ticker_data.empty:
            print(f"{ticker} is empty")
            return None, None

        # Turn the index into a regular column, lower-case the column names and
        # tag each row with its ticker.
        ticker_data = ticker_data.reset_index()
        ticker_data.columns = ticker_data.columns.str.lower()
        ticker_data['ticker'] = ticker

        # Indicators that are scaled per ticker below.
        ticker_data["macd"], ticker_data["macdsignal"], ticker_data["macdhist"] = ta.MACD(
            ticker_data['close'], fastperiod=12, slowperiod=26, signalperiod=9
        )
        ticker_data['sma_10'] = ta.SMA(ticker_data['close'], timeperiod=10)
        ticker_data['sma_30'] = ta.SMA(ticker_data['close'], timeperiod=30)

        # Columns left unscaled here (scaled later on the combined frame).
        ticker_data['rsi'] = ta.RSI(ticker_data['close'], timeperiod=14)
        # shift(-30) looks 30 rows ahead, so the last 30 rows are NaN.
        ticker_data['log_return_30d'] = np.log(ticker_data['close'].shift(-30) / ticker_data['close'])
        ticker_data['cdl2crows'] = ta.CDL2CROWS(
            ticker_data['open'], ticker_data['high'], ticker_data['low'], ticker_data['close']
        )

        # One scaler per feature group so each can be saved and reused on its own.
        scaler_close = StandardScaler()
        scaler_macd = StandardScaler()
        scaler_sma_10 = StandardScaler()
        scaler_sma_30 = StandardScaler()

        close_vals = ticker_data[['close']].values
        macd_vals = ticker_data[['macd', 'macdsignal', 'macdhist']].values
        sma_10_vals = ticker_data[['sma_10']].values
        sma_30_vals = ticker_data[['sma_30']].values

        ticker_data['scaled_close'] = scaler_close.fit_transform(close_vals)
        scaled_macd_values = scaler_macd.fit_transform(macd_vals)
        ticker_data['scaled_sma_10'] = scaler_sma_10.fit_transform(sma_10_vals)
        ticker_data['scaled_sma_30'] = scaler_sma_30.fit_transform(sma_30_vals)

        # The three MACD series share one scaler; split its output into columns.
        ticker_data['scaled_macd'] = scaled_macd_values[:, 0]
        ticker_data['scaled_macdsignal'] = scaled_macd_values[:, 1]
        ticker_data['scaled_macdhist'] = scaled_macd_values[:, 2]

        ticker_scalers = {
            'scaler_close': scaler_close,
            'scaler_macd': scaler_macd,
            'scaler_sma_10': scaler_sma_10,
            'scaler_sma_30': scaler_sma_30
        }

        return ticker_data, (ticker, ticker_scalers)

    except Exception as e:
        print(f"Issue applying technical indicators to {ticker}: {e}")
        return None, None

# Download and process every ticker concurrently; the work is dominated by
# network I/O, so a thread pool is used.
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {executor.submit(process_ticker, ticker): ticker for ticker in sp_tickers}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading data", unit="ticker"):
        submitted_ticker = futures[future]
        try:
            result = future.result()
        except Exception as exc:
            print(f"Failed to process {submitted_ticker}: {exc}")
            continue

        # A failed ticker may come back as a bare None or as (None, None);
        # normalise both so the unpacking below never raises.
        result_data, result_scalers = result if result is not None else (None, None)
        if result_data is None or result_scalers is None:
            print(f"Failed to process {submitted_ticker}")
            continue

        data_frames.append(result_data)
        ticker, scaler_dict = result_scalers
        scalers[ticker] = scaler_dict

# Persist the per-ticker scalers as scalers/<ticker>/<scaler_name>.pkl.
for ticker, scaler_dict in scalers.items():
    ticker_dir = os.path.join('scalers', ticker)
    os.makedirs(ticker_dir, exist_ok=True)

    for scaler_name, scaler in scaler_dict.items():
        joblib.dump(scaler, os.path.join(ticker_dir, f'{scaler_name}.pkl'))

# Combine all per-ticker frames and drop every row containing a NaN (indicator
# warm-up rows and the trailing rows without a 30-row-ahead close).
if not data_frames:
    raise ValueError("No ticker data was downloaded; nothing to concatenate")
data = pd.concat(data_frames, ignore_index=True)
data.dropna(inplace=True)

In [None]:
# Cross-ticker preprocessing: encode the ticker symbol, standardise the
# columns that are scaled over the combined frame, and split the result into
# one date-sorted frame per ticker.

# Integer-encode ticker symbols.
label_encoder = LabelEncoder()
data["encoded_ticker"] = label_encoder.fit_transform(data["ticker"])

# One independent scaler per feature group.
scaler_ticker, scaler_technical, scaler_future_price = (StandardScaler() for _ in range(3))

# Raw 2-D value arrays handed to the scalers.
log_return_vals = data[['log_return_30d']].to_numpy()
stock_ticker = data[['encoded_ticker']].to_numpy()
stock_technical = data.filter(["rsi", "cdl2crows"]).to_numpy()

# Fit each scaler on the combined data and store the scaled columns.
data['scaled_log_return_30d'] = scaler_future_price.fit_transform(log_return_vals)
data['scaled_ticker'] = scaler_ticker.fit_transform(stock_ticker)
scaled_technical = scaler_technical.fit_transform(stock_technical)

# Column 0 is RSI and column 1 is CDL2CROWS, matching the filter order above.
data['scaled_rsi'], data['scaled_cdl2crows'] = scaled_technical[:, 0], scaled_technical[:, 1]

# Map each ticker to its own frame, sorted by date with a fresh 0..n-1 index.
grouped_dfs = {
    symbol: frame.sort_values(by='date').reset_index(drop=True)
    for symbol, frame in data.groupby('ticker')
}

# Persist the fitted cross-ticker scalers next to the notebook.
joblib.dump(scaler_ticker, "scaler_ticker.pkl")
joblib.dump(scaler_technical, "scaler_technical.pkl")
joblib.dump(scaler_future_price, "scaler_future_price.pkl")

In [None]:
# Model input features, in the column order used for every training window.
feature_cols = ['scaled_close', 'scaled_rsi', 'scaled_macd', 'scaled_sma_10', 'scaled_sma_30',
                'scaled_ticker', 'scaled_macdsignal', 'scaled_macdhist']

# NOTE(review): this reuses the name `process_ticker`, shadowing the download
# helper defined earlier in the notebook; re-running the download cell after
# this one would call this function instead. Consider renaming together with
# its caller.
def process_ticker(ticker_df_pair, lookback=60, horizon=30):
    """Build sliding training windows for a single ticker.

    For every position ``i`` in ``range(lookback, len(df) - horizon)`` the
    sample is the ``lookback`` rows immediately before ``i`` (columns
    ``feature_cols``) and the target is ``scaled_log_return_30d`` at row ``i``.

    Args:
        ticker_df_pair: ``(ticker, df)`` tuple; only ``df`` is used. Rows are
            addressed by position, so ``df`` is assumed to be in the desired
            chronological order already.
        lookback: Number of rows in each input window.
        horizon: Number of trailing rows excluded from the sample positions.

    Returns:
        ``(x_local, y_local)``: a list of ``(lookback, len(feature_cols))``
        arrays and a list of scalar targets. Both are empty when ``df`` has
        fewer than ``lookback + horizon + 1`` rows.
    """
    _, df = ticker_df_pair
    x_local, y_local = [], []

    if len(df) < lookback + horizon + 1:
        return x_local, y_local

    # Extract the arrays once; slicing NumPy inside the loop avoids building a
    # pandas slice and a row Series on every iteration.
    feature_matrix = df[feature_cols].to_numpy()
    targets = df['scaled_log_return_30d'].to_numpy()

    for i in range(lookback, len(df) - horizon):
        # Copy so each window is an independent array rather than a view.
        x_local.append(feature_matrix[i - lookback:i].copy())
        y_local.append(targets[i])

    return x_local, y_local

# Needed to prevent issues with multiprocessing in Colab
def run_parallel_windowing(grouped_dfs):
    """Build training windows for every ticker in parallel worker processes.

    Args:
        grouped_dfs: Mapping of ticker -> DataFrame; each ``(ticker, df)`` item
            is passed to ``process_ticker`` in its own task.

    Returns:
        ``(x_train, y_train)`` as NumPy arrays, concatenated across tickers in
        the iteration order of ``grouped_dfs``.
    """
    x_train, y_train = [], []

    with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        futures = [executor.submit(process_ticker, item) for item in grouped_dfs.items()]
        # Collect in submission order rather than completion order, so the
        # sample order (and any split taken from the tail of the arrays) is
        # the same on every run. The tasks still execute concurrently.
        for f in tqdm(futures, total=len(futures), desc="Processing tickers"):
            x_result, y_result = f.result()
            x_train.extend(x_result)
            y_train.extend(y_result)

    return np.array(x_train), np.array(y_train)

# Build the training arrays from the per-ticker frames.
x_train, y_train = run_parallel_windowing(grouped_dfs)

In [None]:
# Build, compile and train the Conv1D + bidirectional-LSTM regressor on the
# first GPU, then save it to disk.
with tf.device('/device:GPU:0'):

  model = Sequential()

  # Convolutional layers for local pattern extraction; the input shape is
  # taken from x_train as (rows per window, features per row).
  model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu', input_shape=(x_train.shape[1], x_train.shape[2])))
  model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
  model.add(MaxPooling1D(pool_size=2))

  # Stacked bidirectional LSTMs; the first returns the full sequence so the
  # second can consume it.
  model.add(Bidirectional(LSTM(64, return_sequences=True)))
  model.add(Dropout(0.3))
  model.add(Bidirectional(LSTM(64)))
  model.add(Dropout(0.3))

  # Dense head with L2 regularisation, LeakyReLU activations and dropout.
  model.add(Dense(128, kernel_regularizer=regularizers.l2(0.001)))
  model.add(BatchNormalization())
  model.add(LeakyReLU(alpha=0.1))
  model.add(Dropout(0.4))

  model.add(Dense(64, kernel_regularizer=regularizers.l2(0.001)))
  model.add(LeakyReLU(alpha=0.1))
  model.add(Dropout(0.3))

  # Single linear output: the scaled log-return target.
  model.add(Dense(1))

  model.summary()
  model.compile(
      optimizer=keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0),
      loss=keras.losses.Huber(delta=1.0),
      metrics=[keras.metrics.RootMeanSquaredError()]
  )

  # Halve the learning rate after 3 epochs without val_loss improvement.
  lr_schedule = keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                  factor=0.5,
                                                  patience=3,
                                                  verbose=1)

  # Stop after 10 epochs without val_loss improvement and restore the best
  # weights seen.
  early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                 patience=10,
                                                 restore_best_weights=True)

  training = model.fit(
      x_train, y_train,
      epochs=200,                # Upper bound; early stopping normally ends sooner
      batch_size=128,
      validation_split=0.1,      # Hold out part of the training arrays for validation
      # lr_schedule was previously created but never passed here, so the
      # learning-rate reduction never ran.
      callbacks=[early_stopping, lr_schedule]
  )
  model.save("model4-5.keras")