In [None]:
import pandas as pd
from models.utils.sp_scraper import scrape_sp500_symbols
import yfinance as yf
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow import keras
import talib as ta
import time
from IPython.display import clear_output
from concurrent.futures import ThreadPoolExecutor, as_completed
import joblib
import os

In [None]:
# Directory holding the trained model and every fitted scaler. It is defined
# once (instead of being repeated in each path) and can be overridden through
# the MODEL3_DIR environment variable to run the notebook from another checkout.
MODEL_DIR = os.environ.get(
    'MODEL3_DIR',
    '/Users/aryanhazra/Downloads/VSCode Repos/trading_model/src/models/model3',
)

model = keras.models.load_model(os.path.join(MODEL_DIR, 'model3.keras'))

# Load the scalers shared by all tickers.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
scaler_future_price = joblib.load(os.path.join(MODEL_DIR, 'scaler_future_price.pkl'))
scaler_ticker = joblib.load(os.path.join(MODEL_DIR, 'scaler_ticker.pkl'))
scaler_technical = joblib.load(os.path.join(MODEL_DIR, 'scaler_technical.pkl'))

# Per-ticker scalers, keyed by the name of each sub-directory of scalers_dir
scalers = {}

# Path to the per-ticker scalers directory
scalers_dir = os.path.join(MODEL_DIR, 'scalers')

# Visit the entries in sorted order so the dictionary is built deterministically
# (os.listdir returns entries in arbitrary order).
for ticker in sorted(os.listdir(scalers_dir)):
    ticker_path = os.path.join(scalers_dir, ticker)
    if os.path.isdir(ticker_path):
        # Load all three scalers for this ticker; a missing file raises here
        scaler_close = joblib.load(os.path.join(ticker_path, 'scaler_close.pkl'))
        scaler_sma_10 = joblib.load(os.path.join(ticker_path, 'scaler_sma_10.pkl'))
        scaler_sma_30 = joblib.load(os.path.join(ticker_path, 'scaler_sma_30.pkl'))

        # Store them under the directory name
        scalers[ticker] = {
            'scaler_close': scaler_close,
            'scaler_sma_10': scaler_sma_10,
            'scaler_sma_30': scaler_sma_30
        }

In [None]:
# Build the download list: the scraped S&P 500 symbols in sorted order with
# every '.' swapped for '-', followed by the "^GSPC" symbol, which a later
# cell looks up as the benchmark series.
sp_tickers = [symbol.replace(".", "-") for symbol in sorted(scrape_sp500_symbols())]
sp_tickers.append("^GSPC")

# Collects one DataFrame per successfully processed ticker
data_frames = []

def process_ticker(ticker, max_retries=5, retry_delay=10):
    """Download one ticker's full price history and derive the model's input columns.

    Args:
        ticker: Symbol passed to ``yf.Ticker``; also the key used to look up
            this ticker's fitted scalers in the module-level ``scalers`` dict.
        max_retries: Maximum number of download attempts before giving up.
        retry_delay: Seconds to sleep between failed download attempts.

    Returns:
        The history DataFrame with a reset index and lower-cased column names,
        plus the added columns ``ticker``, ``log_return_30d``, ``rsi``,
        ``macd``, ``sma_10``, ``sma_30``, ``scaled_close``, ``scaled_sma_10``
        and ``scaled_sma_30``. Returns ``None`` when the download yields no
        rows or when any step raises (the error is printed, not propagated).
    """
    try:
        # Bounded retry: the download is attempted at most max_retries times so
        # a permanently failing symbol cannot block its worker thread forever.
        ticker_data = None
        for attempt in range(1, max_retries + 1):
            try:
                ticker_data = yf.Ticker(ticker).history(period="max")
                break
            except Exception:
                if attempt == max_retries:
                    # Out of attempts: let the outer handler report the error
                    raise
                time.sleep(retry_delay)

        if ticker_data is None or ticker_data.empty:
            return None

        # Turn the index into a regular column and normalise column names
        ticker_data.reset_index(inplace=True)
        ticker_data.columns = ticker_data.columns.str.lower()

        ticker_data['ticker'] = ticker
        # Log return between each row's close and the close 30 rows later;
        # the last 30 rows therefore hold NaN.
        ticker_data['log_return_30d'] = np.log(ticker_data['close'].shift(-30) / ticker_data['close'])

        # Technical indicators computed on the close price
        ticker_data['rsi'] = ta.RSI(ticker_data['close'], timeperiod=14)
        # Only the MACD line is used; the signal and histogram outputs are discarded
        macd, _signal, _hist = ta.MACD(ticker_data['close'], fastperiod=12, slowperiod=26, signalperiod=9)
        ticker_data['macd'] = macd
        ticker_data['sma_10'] = ta.SMA(ticker_data['close'], timeperiod=10)
        ticker_data['sma_30'] = ta.SMA(ticker_data['close'], timeperiod=30)

        # Scale close and both moving averages with this ticker's own scalers.
        # A ticker without an entry in `scalers` raises KeyError, which is
        # reported by the handler below.
        ticker_scalers = scalers[ticker]
        close_vals = ticker_data[['close']].values
        sma_10_vals = ticker_data[['sma_10']].values
        sma_30_vals = ticker_data[['sma_30']].values

        ticker_data['scaled_close'] = ticker_scalers['scaler_close'].transform(close_vals)
        ticker_data['scaled_sma_10'] = ticker_scalers['scaler_sma_10'].transform(sma_10_vals)
        ticker_data['scaled_sma_30'] = ticker_scalers['scaler_sma_30'].transform(sma_30_vals)

        return ticker_data

    except Exception as e:
        print(f"Failed to process {ticker}: {e}")
        return None

# Download and process tickers concurrently; the work is dominated by network I/O
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future back to its ticker so failures can be reported by symbol
    futures = {executor.submit(process_ticker, ticker): ticker for ticker in sp_tickers}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading data", unit="ticker"):
        result_data = future.result()
        if result_data is not None:
            data_frames.append(result_data)
        else:
            # The result is None on this branch, so name the ticker via the futures map
            print(f"Failed to process {futures[future]}")

# pd.concat raises on an empty list; fail with a message that names the real cause
if not data_frames:
    raise ValueError("No ticker data could be downloaded; nothing to combine")

# Combine all dataframes and drop every row that has a missing value in any column
data = pd.concat(data_frames, ignore_index=True)
data.dropna(inplace=True)

In [None]:
# Prepare the data for the model

# Label encode the ticker column.
# NOTE(review): the encoder is re-fitted on the tickers downloaded in this run,
# so the integer codes only match what scaler_ticker and the model were fitted
# on if the ticker set is unchanged — confirm, or load the encoder saved at
# training time instead.
label_encoder = LabelEncoder()
data["encoded_ticker"] = label_encoder.fit_transform(data["ticker"])

# Scale the 30-row-ahead log return with the pre-fitted scaler
log_return_vals = data[['log_return_30d']].values
data['scaled_log_return_30d'] = scaler_future_price.transform(log_return_vals)

# Scale the encoded ticker column
stock_ticker = data[["encoded_ticker"]].values
scaled_ticker = scaler_ticker.transform(stock_ticker)
data['scaled_ticker'] = scaled_ticker

# Scale the technical columns.
# The columns are selected explicitly: DataFrame.filter silently ignores labels
# that are missing, and no visible cell creates a "return" column, so only
# rsi and macd were ever passed to the scaler. Selecting them by name keeps the
# positional indices below tied to the right features and raises KeyError if a
# column is absent.
stock_technical = data[["rsi", "macd"]].values
scaled_technicals = scaler_technical.transform(stock_technical)
# Insert scaled data into the original dataframe (column 0 = rsi, column 1 = macd)
data['scaled_rsi'] = scaled_technicals[:, 0]
data['scaled_macd'] = scaled_technicals[:, 1]

# One frame per ticker, sorted by date and re-indexed 0..n-1 so that index
# labels coincide with row positions
grouped_dfs = {
    ticker: df.sort_values(by='date').reset_index(drop=True)
    for ticker, df in data.groupby('ticker')
}

In [None]:
# Benchmark series: the frame stored for the "^GSPC" symbol. This is the same
# object held in grouped_dfs (not a copy), and a missing key raises KeyError.
spy_data = grouped_dfs["^GSPC"]

In [None]:
feature_cols = ['scaled_close', 'scaled_rsi', 'scaled_macd', 'scaled_sma_10', 'scaled_sma_30', 'scaled_ticker']

WINDOW_SIZE = 60  # rows of history fed to the model for each prediction
HORIZON = 30      # rows between rebalances; same offset as log_return_30d
TOP_N = 10        # number of highest-predicted tickers evaluated each period

# Initialize compound returns (1.0 == 100% of the starting value)
predicted_compound_return = 1.0
real_compound_return = 1.0
spy_real_compound_return = 1.0

# Make every date column timezone-naive once, up front. This used to be redone
# for every ticker at every step although the result never changes.
spy_data['date'] = pd.to_datetime(spy_data['date']).dt.tz_localize(None)
for df in grouped_dfs.values():
    df['date'] = pd.to_datetime(df['date']).dt.tz_localize(None)

# Per-ticker date arrays for binary-search lookups. This relies on each frame
# being sorted by date with a 0..n-1 index, as built by the grouping cell.
date_arrays = {ticker: df['date'].to_numpy() for ticker, df in grouped_dfs.items()}

# Start backtesting from the benchmark row closest to this (timezone-naive) date
start_date = pd.to_datetime('2000-01-01').tz_localize(None)
start_idx = spy_data['date'].sub(start_date).abs().idxmin()

for i in tqdm(range(start_idx, len(spy_data) - HORIZON, HORIZON), desc="Processing SPY data", unit="step"):
    target_date = pd.to_datetime(spy_data.iloc[i]['date']).replace(tzinfo=None)
    target64 = target_date.to_datetime64()
    spy_period_return = spy_data.iloc[i]['log_return_30d']

    x_test = []  # [ticker, window] pairs for every ticker usable on target_date
    y_real = []  # realised log return for the same tickers, in the same order
    for ticker, df in grouped_dfs.items():
        dates = date_arrays[ticker]

        # Leftmost position of target_date, or skip if this ticker has no such row
        date_idx = int(np.searchsorted(dates, target64))
        if date_idx == len(dates) or dates[date_idx] != target64:
            continue

        # Need WINDOW_SIZE rows before the date and HORIZON rows after it
        if date_idx < WINDOW_SIZE or date_idx + HORIZON >= len(df):
            continue

        # Window covers the WINDOW_SIZE rows strictly before target_date
        window = df.iloc[date_idx - WINDOW_SIZE:date_idx][feature_cols].values

        x_test.append([ticker, window])
        y_real.append(df.iloc[date_idx]['log_return_30d'])

    if not x_test:
        # No ticker qualifies on this date. model.predict cannot run on an empty
        # batch, so the strategy stays flat while the benchmark still compounds.
        spy_real_compound_return *= np.exp(spy_period_return)
        clear_output(wait=True)
        print(f"\nPredictions for date: {target_date}")
        print("No tickers with enough history on this date; skipping period")
        continue

    # Predict on the stacked windows, then undo the target scaling
    predictions = model.predict(np.array([pair[1] for pair in x_test]))
    predictions = scaler_future_price.inverse_transform(predictions)
    # Map ticker -> (predicted log return, realised log return)
    predictions = {
        x_test[k][0]: (float(predictions[k][0]), float(y_real[k]))
        for k in range(len(predictions))
    }
    # Keep the TOP_N tickers with the highest predicted return
    top_10_predictions = dict(sorted(predictions.items(),
                                     key=lambda item: item[1][0],
                                     reverse=True)[:TOP_N])
    clear_output(wait=True)  # wait=True prevents flickering
    print(f"\nPredictions for date: {target_date}")
    print("Top 10 Predicted Returns:")
    print("Ticker | Predicted Return | Actual Return")
    print("-" * 45)
    for ticker, (pred, actual) in top_10_predictions.items():
        # Shown as percentages
        print(f"{ticker:6} | {pred*100:13.2f}% | {actual*100:12.2f}%")

    avg_predicted_return = np.mean([pred for pred, _ in top_10_predictions.values()])
    print("\nAverage Predicted Return for Top 10: {:.2f}%".format(avg_predicted_return*100))

    avg_real_return = np.mean([actual for _, actual in top_10_predictions.values()])
    print("\nAverage Actual Return for Top 10: {:.2f}%".format(avg_real_return*100))

    # Averages are log returns, so compounding multiplies by exp(return);
    # the printed figure is the total percentage gain/loss so far.
    predicted_compound_return *= np.exp(avg_predicted_return)
    print("Predicted Compound Return: {:.2f}%".format((predicted_compound_return-1)*100))

    real_compound_return *= np.exp(avg_real_return)
    print("Real Compound Return: {:.2f}%".format((real_compound_return-1)*100))

    spy_real_compound_return *= np.exp(spy_period_return)
    print("SPY Real Compound Return: {:.2f}%".format((spy_real_compound_return-1)*100))


