In [None]:
import os
import time

import numpy as np
import pandas as pd
import talib as ta
import yfinance as yf
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, BatchNormalization, LeakyReLU, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential, load_model
from tqdm import tqdm

from models.utils.sp_scraper import scrape_sp500_symbols

# Location of the trained Keras model. Set the MODEL3_PATH environment variable
# to point at a different copy; the default keeps the original hard-coded path
# so existing setups continue to work unchanged.
MODEL_PATH = os.environ.get(
    "MODEL3_PATH",
    '/Users/aryanhazra/Downloads/VSCode Repos/trading_model/src/models/model3/model3.keras',
)

model = keras.models.load_model(MODEL_PATH)


In [None]:
# Replace '.' with '-' in ticker symbols (presumably the form yfinance expects,
# e.g. BRK.B -> BRK-B) and append the S&P 500 index (^GSPC) as the benchmark.
sp_tickers = [ticker.replace(".", "-") for ticker in sorted(scrape_sp500_symbols())] + ["^GSPC"]

# Fitted scalers per ticker, keyed by ticker symbol
scalers = {}


def get_ticker_data(ticker, max_retries=None, retry_delay=10):
    """Download the maximum available price history for ``ticker`` via yfinance.

    Failed downloads are retried after ``retry_delay`` seconds. Every failure is
    reported through ``tqdm.write`` so a stuck download is visible instead of
    hanging silently.

    Args:
        ticker: Symbol passed to ``yf.Ticker``.
        max_retries: Maximum number of failed attempts before giving up.
            ``None`` (the default) retries indefinitely.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The DataFrame returned by ``yf.Ticker(ticker).history(period="max")``.

    Raises:
        Exception: Re-raises the last download error once ``max_retries``
            failed attempts have been made.
    """
    failures = 0
    while True:
        try:
            return yf.Ticker(ticker).history(period="max")
        except Exception as e:
            failures += 1
            tqdm.write(f"Download failed for {ticker} (attempt {failures}): {e!r}")
            if max_retries is not None and failures >= max_retries:
                raise
            time.sleep(retry_delay)


# Collect per-ticker frames and concatenate once at the end; concatenating
# inside the loop copies the accumulated frame on every iteration.
ticker_frames = []

for ticker in tqdm(sp_tickers, desc = "Downloading data", unit="ticker"):
    ticker_data = get_ticker_data(ticker)

    # Nothing to scale or compute indicators on; fitting a scaler on zero rows
    # would raise, so skip the ticker instead.
    if ticker_data.empty:
        tqdm.write(f"No data returned for {ticker}; skipping")
        continue

    # Make date a column instead of index
    ticker_data = ticker_data.reset_index()

    # Make columns lowercase
    ticker_data.columns = ticker_data.columns.str.lower()

    # Add a ticker column (symbol kept as downloaded)
    ticker_data['ticker'] = ticker

    # Log return between the close 30 rows ahead and the current close
    ticker_data['log_return_30d'] = np.log(ticker_data['close'].shift(-30) / ticker_data['close'])

    # Calculate RSI (Relative Strength Index)
    ticker_data['rsi'] = ta.RSI(ticker_data['close'], timeperiod=14)

    # Calculate MACD (Moving Average Convergence Divergence); only the MACD
    # line is kept, the signal and histogram outputs are not used.
    macd, _, _ = ta.MACD(ticker_data['close'], fastperiod=12, slowperiod=26, signalperiod=9)
    ticker_data['macd'] = macd

    # Calculate SMA (Simple Moving Average) for 10 and 30 periods
    ticker_data['sma_10'] = ta.SMA(ticker_data['close'], timeperiod=10)
    ticker_data['sma_30'] = ta.SMA(ticker_data['close'], timeperiod=30)

    # One independent StandardScaler per column, fitted on this ticker only.
    # Each source column `c` produces a `scaled_<c>` column.
    column_scalers = {
        'close': StandardScaler(),
        'log_return_30d': StandardScaler(),
        'sma_10': StandardScaler(),
        'sma_30': StandardScaler(),
    }
    for column, scaler in column_scalers.items():
        ticker_data[f'scaled_{column}'] = scaler.fit_transform(ticker_data[[column]].values)

    # Keep the scalers needed later to map values back to original units
    scalers[ticker] = {
        'scaler_close': column_scalers['close'],
        'scaler_future_price': column_scalers['log_return_30d'],
    }

    ticker_frames.append(ticker_data)

# Build the combined frame and drop rows with any missing value (indicator
# warm-up rows and the trailing rows that have no 30-row-ahead close).
data = pd.concat(ticker_frames, ignore_index=True) if ticker_frames else pd.DataFrame()
data = data.dropna()

In [3]:
# Prepare the data for the model

# Label encode the ticker column.
# NOTE(review): the integer assigned to each ticker depends on the set of
# tickers present in `data`; confirm this matches the encoding used when the
# model was trained.
label_encoder = LabelEncoder()
data["encoded_ticker"] = label_encoder.fit_transform(data["ticker"])

# Scalers fitted across all tickers at once
scaler_ticker = StandardScaler()
scaler_technical = StandardScaler()

# Scale the encoded ticker column
data['scaled_ticker'] = scaler_ticker.fit_transform(data[["encoded_ticker"]].values)

# Scale the technical indicator columns. The columns are selected explicitly
# (rather than via DataFrame.filter, which silently ignores missing names) so
# that each scaled output is guaranteed to line up with its source column.
technical_cols = ["rsi", "macd"]
scaled_technicals = scaler_technical.fit_transform(data[technical_cols].values)
for position, column in enumerate(technical_cols):
    # Produces 'scaled_rsi' and 'scaled_macd'
    data[f'scaled_{column}'] = scaled_technicals[:, position]

# Group the data by ticker
grouped_dfs = data.groupby('ticker')

In [4]:
# Rows of the S&P 500 index (^GSPC), which was downloaded alongside the
# constituents as the benchmark series. get_group raises a KeyError naming the
# missing key if the index is not present in the grouped data.
spy_data = grouped_dfs.get_group("^GSPC")

In [None]:
feature_cols = ['scaled_close', 'scaled_rsi', 'scaled_macd', 'scaled_sma_10', 'scaled_sma_30', 'scaled_ticker']

WINDOW_SIZE = 60   # rows of history fed to the model for each sample
HORIZON = 30       # rows ahead covered by log_return_30d
STEP = 30          # spacing (in SPY rows) between evaluated dates
TOP_N = 10         # number of highest predicted returns to report


def _prepare_ticker_arrays(groups, columns):
    """Convert each ticker's frame into date-sorted numpy arrays, once.

    Args:
        groups: Iterable of ``(ticker, DataFrame)`` pairs (e.g. a GroupBy).
        columns: Feature column names to extract.

    Returns:
        dict mapping ticker -> ``(dates, features, returns)`` where ``dates``
        is the timezone-naive date array in ascending order, ``features`` holds
        the values of ``columns`` and ``returns`` the ``log_return_30d`` values,
        all row-aligned.
    """
    prepared = {}
    for ticker, df in groups:
        df = df.sort_values(by='date')
        # Drop timezone info so dates compare equal across tickers
        dates = pd.to_datetime(df['date']).dt.tz_localize(None).to_numpy()
        prepared[ticker] = (dates, df[columns].to_numpy(), df['log_return_30d'].to_numpy())
    return prepared


# Sorting and timezone stripping do not depend on the evaluated date, so do it
# once up front instead of once per (date, ticker) pair.
ticker_arrays = _prepare_ticker_arrays(grouped_dfs, feature_cols)

# Start on day 60 of SPY
for i in tqdm(range(WINDOW_SIZE, len(spy_data) - HORIZON, STEP), desc="Processing SPY data", unit="step"):
    # Ensure target_date is timezone-naive, matching the prepared date arrays
    target_date = pd.to_datetime(spy_data.iloc[i]['date']).replace(tzinfo=None)
    target_key = target_date.to_datetime64()

    x_test = []   # [ticker, window] pairs, window shape (WINDOW_SIZE, num_features)
    y_real = []   # realised log_return_30d for the same tickers, same order

    for ticker, (dates, features, returns) in ticker_arrays.items():
        # Locate the target date in this ticker's own (sorted) history
        date_idx = int(np.searchsorted(dates, target_key))
        if date_idx >= len(dates) or dates[date_idx] != target_key:
            continue

        # Check if there are 60 days before and 30 after
        if not (date_idx >= WINDOW_SIZE and date_idx + HORIZON < len(dates)):
            continue

        # The WINDOW_SIZE rows immediately preceding the target date
        x_test.append([ticker, features[date_idx - WINDOW_SIZE:date_idx]])

        # Realised return at the target date. This must use the ticker's own
        # row index (date_idx); the SPY loop index `i` points at an unrelated
        # row for any ticker whose history length differs from SPY's.
        y_real.append(returns[date_idx])

    # No ticker has enough history around this date; nothing to predict
    if not x_test:
        continue

    # Predicting based on the second vals of x_test (the windows)
    scaled_predictions = model.predict(np.array([x[1] for x in x_test]))

    # Invert the scaling with each ticker's OWN scaler (the scalers were fitted
    # per ticker), then pair each prediction with the realised value.
    predictions = {}
    for (ticker, _), scaled_pred, actual in zip(x_test, scaled_predictions, y_real):
        scaler = scalers[ticker]['scaler_future_price']
        pred = scaler.inverse_transform(np.asarray(scaled_pred).reshape(1, -1))[0][0]
        predictions[ticker] = (float(pred), float(actual))

    # Sorting the predictions by the first value (the predicted vals)
    top_10_predictions = dict(sorted(predictions.items(),
                                     key=lambda x: x[1][0],
                                     reverse=True)[:TOP_N])
    print(f"\nPredictions for date: {target_date}")
    print("Top 10 Predicted Returns:")
    print("Ticker | Predicted Return | Actual Return")
    print("-" * 45)
    for ticker, (pred, actual) in top_10_predictions.items():
        print(f"{ticker:6} | {pred:14.4f} | {actual:13.4f}")