In [None]:
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import joblib
import numpy as np
import pandas as pd
import talib as ta
import yfinance as yf
from IPython.display import clear_output
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow import keras
from tqdm import tqdm

from models.utils.sp_scraper import scrape_sp500_symbols



# Location of the trained Keras model. Set the TRADING_MODEL_PATH environment
# variable to load the model from somewhere else; when it is unset the original
# machine-specific path is used, so existing behaviour is unchanged.
MODEL_PATH = os.environ.get(
    'TRADING_MODEL_PATH',
    '/Users/aryanhazra/Downloads/VSCode Repos/trading_model/src/models/model3/model3.keras',
)
model = keras.models.load_model(MODEL_PATH)


In [None]:
# Swap '.' for '-' in every scraped S&P 500 symbol (taken in sorted order),
# then append the ^GSPC symbol so a benchmark series is downloaded as well.
scraped_symbols = sorted(scrape_sp500_symbols())
sp_tickers = [symbol.replace(".", "-") for symbol in scraped_symbols]
sp_tickers.append("^GSPC")

# Filled in by the download loop below:
#   scalers     -> {ticker: dict of the scalers fitted for that ticker}
#   data_frames -> one processed DataFrame per successfully downloaded ticker
scalers = dict()
data_frames = list()

# Key under which each fitted scaler is returned -> (source column, scaled output column).
_SCALED_COLUMNS = {
    'scaler_close': ('close', 'scaled_close'),
    'scaler_future_price': ('log_return_30d', 'scaled_log_return_30d'),
    'scaler_sma_10': ('sma_10', 'scaled_sma_10'),
    'scaler_sma_30': ('sma_30', 'scaled_sma_30'),
}


def _download_history(ticker, max_retries, retry_delay):
    """Fetch the full price history for ``ticker``, retrying when the call raises.

    At most ``max_retries`` attempts are made (never fewer than one), with a
    pause of ``retry_delay`` seconds between attempts. When the final attempt
    also raises, that exception is propagated to the caller.
    """
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            return yf.Ticker(ticker).history(period="max")
        except Exception:
            if attempt == attempts:
                raise
            time.sleep(retry_delay)


def process_ticker(ticker, max_retries=5, retry_delay=10):
    """Download one ticker's history and derive the target, indicators and scaled columns.

    Args:
        ticker: Symbol passed to ``yf.Ticker``.
        max_retries: Maximum number of download attempts before giving up.
            (Previously the download was retried forever, which could hang a
            worker thread on a permanent error.)
        retry_delay: Seconds to sleep between failed download attempts.

    Returns:
        ``(ticker_data, (ticker, ticker_scalers))`` on success, where
        ``ticker_data`` is the DataFrame with lower-cased column names plus the
        added ``ticker``, ``log_return_30d``, ``rsi``, ``macd``, ``sma_10``,
        ``sma_30`` and ``scaled_*`` columns, and ``ticker_scalers`` maps
        ``'scaler_close'``, ``'scaler_future_price'``, ``'scaler_sma_10'`` and
        ``'scaler_sma_30'`` to the scalers fitted on this ticker only.
        ``(None, None)`` when the download returns an empty frame or any step
        raises (the error is printed, not propagated).
    """
    try:
        ticker_data = _download_history(ticker, max_retries, retry_delay)

        if ticker_data.empty:
            return None, None

        # Turn the index into a regular column and normalise the column names.
        ticker_data = ticker_data.reset_index()
        ticker_data.columns = ticker_data.columns.str.lower()
        ticker_data['ticker'] = ticker

        # Target: log return between each row's close and the close 30 rows
        # later. The final 30 rows have no future close, so they are NaN.
        ticker_data['log_return_30d'] = np.log(ticker_data['close'].shift(-30) / ticker_data['close'])

        # Technical indicators computed from the closing price.
        ticker_data['rsi'] = ta.RSI(ticker_data['close'], timeperiod=14)
        # Only the MACD line is kept; the signal and histogram outputs are unused.
        macd, _, _ = ta.MACD(ticker_data['close'], fastperiod=12, slowperiod=26, signalperiod=9)
        ticker_data['macd'] = macd
        ticker_data['sma_10'] = ta.SMA(ticker_data['close'], timeperiod=10)
        ticker_data['sma_30'] = ta.SMA(ticker_data['close'], timeperiod=30)

        # Fit one independent scaler per column and keep it so the caller can
        # reuse it later.
        ticker_scalers = {}
        for scaler_name, (source_col, scaled_col) in _SCALED_COLUMNS.items():
            scaler = StandardScaler()
            # fit_transform returns an (n, 1) array; flatten it to a plain column.
            ticker_data[scaled_col] = scaler.fit_transform(ticker_data[[source_col]].values).ravel()
            ticker_scalers[scaler_name] = scaler

        return ticker_data, (ticker, ticker_scalers)

    except Exception as e:
        # Best effort: report the failure and let the caller skip this ticker.
        print(f"Failed to process {ticker}: {e}")
        return None, None

# Downloads are I/O-bound, so run them concurrently on a thread pool.
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future back to the ticker it was submitted for.
    futures = {executor.submit(process_ticker, ticker): ticker for ticker in sp_tickers}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading data", unit="ticker"):
        result_data, result_scalers = future.result()
        if result_data is None or result_scalers is None:
            # Report the ticker that failed (the result itself is just
            # (None, None) and carries no information).
            print(f"Failed to process {futures[future]}")
            continue
        data_frames.append(result_data)
        scaler_ticker_name, scaler_dict = result_scalers
        scalers[scaler_ticker_name] = scaler_dict

if not data_frames:
    # pd.concat would raise an opaque "No objects to concatenate" ValueError here.
    raise ValueError("No ticker data could be downloaded; nothing to combine")

# Combine all dataframes, then drop every row that has a NaN in any column.
# This also removes the last 30 rows of each ticker, whose log_return_30d
# target is undefined.
data = pd.concat(data_frames, ignore_index=True).dropna()

In [3]:
# Prepare the data for the model

# Label-encode the ticker symbols as integers.
label_encoder = LabelEncoder()
data["encoded_ticker"] = label_encoder.fit_transform(data["ticker"])

# One scaler for the encoded ticker id, one shared by the technical indicators.
scaler_ticker = StandardScaler()
scaler_technical = StandardScaler()

# Scale the encoded ticker column.
stock_ticker = data[["encoded_ticker"]].values
scaled_ticker = scaler_ticker.fit_transform(stock_ticker)
# fit_transform returns an (n, 1) array; store it as a flat column.
data['scaled_ticker'] = scaled_ticker.ravel()

# Scale the technical columns. The columns are selected explicitly: the earlier
# data.filter(["return", "rsi", "macd"]) silently dropped the non-existent
# "return" column, so the positional indices below matched rsi/macd only by
# accident. A missing column now raises a KeyError instead.
technical_columns = ["rsi", "macd"]
stock_technical = data[technical_columns].values
scaled_technicals = scaler_technical.fit_transform(stock_technical)
# Write each scaled column back under the name it was computed from.
for position, column in enumerate(technical_columns):
    data[f'scaled_{column}'] = scaled_technicals[:, position]

# One date-sorted, re-indexed DataFrame per ticker.
grouped_dfs = {
    ticker: df.sort_values(by='date').reset_index(drop=True)
    for ticker, df in data.groupby('ticker')
}

In [None]:
# Record today's date as a YYYY-MM-DD string, then peek at the first ticker's
# frame only and print the latest date it contains.
today = pd.Timestamp.now().strftime('%Y-%m-%d')
first_group = next(iter(grouped_dfs.items()), None)
if first_group is not None:
    ticker, df = first_group
    most_recent_day = df['date'].max()
    print(ticker, most_recent_day)