In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pandas.tseries.offsets import BDay
import warnings
warnings.filterwarnings("ignore")
import json

import yfinance as yf
from tqdm import tqdm

In [None]:
import os

from google.colab import drive

# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/drive')

# Project directory on Drive; later cells join file names onto `path`.
path = '/content/drive/MyDrive/Colab Notebooks/Imperial MLDS/DeepTimeSeriesClustering'

# Make the project directory the working directory for the rest of the notebook.
os.chdir(path)

Mounted at /content/drive


# Import Config File

In [None]:
# Locate the JSON parameter file inside the project directory.
params_filename = "params.json"
params_path = os.path.join(path, params_filename)

# Parse the whole file into a dict; other settings are read from `config` later on.
with open(params_path, 'r') as f:
  config = json.load(f)

# Pull out the four date settings in one step (lookups run in the listed order).
cut_off_date, data_start, strat_start, strat_end = (
    config[key]
    for key in ('cut_off_date', 'data_start', 'strat_start', 'strat_end')
)

# S&P 500 Historical Constituents

In [None]:
# Import historical constituents
filename = "s&p500_historical_constituents.csv"
file_path = os.path.join(path, filename)
snp500_hist_cons = pd.read_csv(file_path, encoding='utf-8')

# Each row holds a comma-separated ticker list; expand it to one row per
# (date, ticker) pair so membership can be aggregated per ticker.
snp500_hist_cons['tickers'] = snp500_hist_cons['tickers'].str.split(",")
snp500_hist_cons = snp500_hist_cons.explode("tickers")

# First and last `date` on which each ticker appears, computed in a single
# groupby pass (index = ticker, columns = start_date / end_date).
date_ranges = snp500_hist_cons.groupby("tickers")['date'].agg(
    start_date='min', end_date='max'
)
# Kept as one-column frames so these notebook-level names stay available.
start_date = date_ranges[['start_date']]
end_date = date_ranges[['end_date']]

# Keep tickers whose membership span covers the cut-off date.
# NOTE(review): `date` is compared exactly as read from the CSV (no date
# parsing here) — confirm its format orders correctly against `cut_off_date`.
strat_mask = (date_ranges['start_date'] <= cut_off_date) & (date_ranges['end_date'] >= cut_off_date)

# Build the result as a new frame with `.assign` rather than writing columns
# onto a filtered slice (the original triggered pandas' chained-assignment
# warning, which the global warnings filter was hiding).
# NOTE(review): the `strat_start` column is filled from `data_start`, not from
# the `strat_start` config value; it is used as the download start date below.
snp500_cons_cut_off = (
    date_ranges.loc[strat_mask]
    .assign(strat_start=data_start, strat_end=strat_end)
    .reset_index()
)

# Functions For Pricing Data Preprocessing

In [None]:
# Get price from yfinance
def get_price(ticker, start, end, interval='1d', normalize_symbol=True):
  """Download OHLCV history for one ticker through ``yf.download``.

  Args:
    ticker: Symbol as written in the constituent list (e.g. ``"BRK.B"``).
    start: Start of the requested range, forwarded to ``yf.download``.
    end: End of the requested range, forwarded to ``yf.download``.
    interval: Bar interval, forwarded to ``yf.download``.
    normalize_symbol: When True (default), dots in ``ticker`` are replaced by
      hyphens for the request only. Yahoo Finance lists share classes with a
      hyphen (``BRK-B``, ``BF-B``); the dotted spelling fails to download.
      Pass False to send the symbol verbatim.

  Returns:
    A DataFrame with columns Ticker, Date, Open, High, Low, Close, Volume,
    where Ticker holds the original (un-normalized) symbol, or None when the
    download does not contain the expected columns.
  """
  symbol = ticker
  if normalize_symbol and isinstance(ticker, str):
    symbol = ticker.replace('.', '-')

  try:
    data = yf.download(symbol, start=start, end=end, ignore_tz=True,
                       multi_level_index=False, progress=False,
                       auto_adjust=True, interval=interval)
    # Turn the date index into a regular column without mutating in place.
    data = data.reset_index()
    # Label rows with the caller's symbol so joins against the constituent
    # list keep working.
    data['Ticker'] = ticker
    return data[['Ticker', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume']]
  except KeyError as e:
    # Best-effort: report the ticker and let the caller skip it.
    print(ticker, f"failed. {e}")
    return None

In [None]:
# Compute RSI
def compute_rsi(df, rsi_window="14d"):
  """Add per-ticker RSI columns computed from rolling mean gains and losses.

  Args:
    df: Frame with at least 'Ticker', 'Date' and 'Close' columns. Rows are
      expected in date order within each ticker, since both ``diff`` and the
      rolling window follow row order. The input is not modified.
    rsi_window: Window passed to ``rolling(window=..., on="Date")``. An
      offset string such as "14d" is a span of time on the Date column, not
      a row count.

  Returns:
    A copy of ``df`` with 'delta', 'delta_gain', 'delta_loss', 'gain', 'loss'
    and 'rsi' columns added. 'rsi' is NaN where both rolling means are zero
    (0/0) and 100 where only the rolling loss is zero.
  """
  df_copy = df.copy()
  # An offset window needs a datetime 'Date'; coerce it here, as compute_macd
  # does, so string dates (e.g. after a CSV round-trip) also work.
  df_copy['Date'] = pd.to_datetime(df_copy['Date'])

  df_copy['delta'] = df_copy.groupby("Ticker")['Close'].diff()
  # NaN deltas (first row of each ticker) fail both comparisons and therefore
  # become 0 in both columns, so no extra fillna is required.
  df_copy['delta_gain'] = np.where(df_copy['delta'] > 0, df_copy['delta'], 0)
  df_copy['delta_loss'] = np.where(df_copy['delta'] < 0, -df_copy['delta'], 0)

  # Rolling means per ticker; reset_index exposes Ticker/Date as join keys.
  gain = (
      df_copy.groupby("Ticker").rolling(window=rsi_window, on="Date")['delta_gain']
      .mean().reset_index().rename(columns={"delta_gain": "gain"})
  )
  loss = (
      df_copy.groupby("Ticker").rolling(window=rsi_window, on="Date")['delta_loss']
      .mean().reset_index().rename(columns={"delta_loss": "loss"})
  )
  df_copy = df_copy.merge(gain, on=['Ticker', 'Date'], how='inner')
  df_copy = df_copy.merge(loss, on=['Ticker', 'Date'], how='inner')

  df_copy['rsi'] = 100. - (100. / (1 + df_copy['gain']/df_copy['loss']))
  return df_copy

# Compute MACD
def compute_macd(df, short_window="12d", long_window="26d", signal_window="9d"):
  """Add per-ticker MACD columns built from time-weighted exponential means.

  Each average is ``ewm(halflife=<window>, times=Date).mean()`` evaluated
  separately for every ticker, so the window arguments are half-lives on the
  Date axis (they are passed as ``halflife``, not ``span``).

  Results are written back by row position. The frame may therefore have any
  index (including duplicated labels), any ticker ordering, and any number of
  tickers, including just one.

  Args:
    df: Frame with at least 'Ticker', 'Date' and 'Close' columns. The input
      is not modified.
    short_window: Half-life for the short average of 'Close'.
    long_window: Half-life for the long average of 'Close'.
    signal_window: Half-life for the average of the MACD line.

  Returns:
    A copy of ``df`` with 'Date' converted to datetime and 'short_ema',
    'long_ema', 'macd' and 'signal' columns added.
  """
  df_copy = df.copy()
  # Ensure Date is datetime (required by ewm(times=...))
  df_copy['Date'] = pd.to_datetime(df_copy['Date'])

  # Positional row numbers of every ticker, in their original row order.
  ticker_rows = list(df_copy.groupby("Ticker", sort=False).indices.values())

  def _ema_by_ticker(column, halflife):
    """Return the per-ticker time-weighted mean of ``column`` as an array."""
    values = np.full(len(df_copy), np.nan)
    for rows in ticker_rows:
      block = df_copy.iloc[rows]
      values[rows] = (
          block[column].ewm(halflife=halflife, times=block['Date']).mean().to_numpy()
      )
    return values

  df_copy['short_ema'] = _ema_by_ticker('Close', short_window)
  df_copy['long_ema'] = _ema_by_ticker('Close', long_window)

  # Compute MACD line
  df_copy['macd'] = df_copy['short_ema'] - df_copy['long_ema']

  # Compute Signal line
  df_copy['signal'] = _ema_by_ticker('macd', signal_window)

  return df_copy

# Data transformation
def data_transformer(df, window="20d", rsi_window="14d", short_window="12d", long_window="26d", signal_window="9d"):
  """Derive the model features from raw OHLCV rows.

  Works on a copy of ``df`` (columns used: Ticker, Date, Open, High, Low,
  Close, Volume) and adds, per ticker:
    * r_open / r_high / r_low / r_close: log of (1 + pct_change + epsilon);
    * log_volume and norm_log_volume (log volume over its rolling mean);
    * r_sum, r_std and r_cum (rolling sum of r_close over its rolling std);
    * the columns produced by compute_rsi and compute_macd.
  Remaining NaNs are replaced by 0.0 before returning.

  Args:
    df: Raw price frame.
    window: Rolling window on 'Date' for the volume and return statistics.
    rsi_window: Forwarded to compute_rsi.
    short_window, long_window, signal_window: Forwarded to compute_macd.

  Returns:
    The transformed copy.
  """
  epsilon = 1e-10
  join_keys = ['Ticker', 'Date']
  out = df.copy()

  # Log returns for each price field, differenced within each ticker.
  price_to_return = (('Open', 'r_open'), ('High', 'r_high'),
                     ('Low', 'r_low'), ('Close', 'r_close'))
  for price_col, return_col in price_to_return:
    pct = out[price_col].groupby(out['Ticker']).pct_change()
    out[return_col] = np.log(pct.add(1 + epsilon))

  # log(0) gives -inf for zero-volume rows; map those to 0.0.
  raw_log_volume = np.log(out['Volume'])
  out['log_volume'] = raw_log_volume.replace(np.float64("-inf"), 0.0)
  out['Date'] = pd.to_datetime(out['Date'])

  def rolling_stat(frame, column, stat, label):
    """Per-ticker rolling `stat` of `column`, keyed by Ticker/Date, named `label`."""
    rolled = frame.groupby("Ticker").rolling(window=window, on="Date")[column]
    return getattr(rolled, stat)().reset_index().rename(columns={column: label})

  volume_mean = rolling_stat(out, "log_volume", "mean", "rolling_mean")
  out = out.merge(volume_mean, on=join_keys, how='inner')
  out['norm_log_volume'] = out['log_volume'] / (out['rolling_mean'] + epsilon)

  # Both return statistics are taken from the same frame before either merge.
  return_sum = rolling_stat(out, "r_close", "sum", "r_sum")
  return_std = rolling_stat(out, "r_close", "std", "r_std")
  out = out.merge(return_sum, on=join_keys, how='inner')
  out = out.merge(return_std, on=join_keys, how='inner')
  out['r_cum'] = out['r_sum'] / (out['r_std'] + epsilon)

  out = compute_rsi(out, rsi_window=rsi_window)
  out = compute_macd(out, short_window=short_window,
                     long_window=long_window, signal_window=signal_window)
  return out.fillna(0.0)

# Data Preprocessing

In [None]:
filename = "snp500_hist_cluster.csv"
hist_price_path = os.path.join(path, filename)

# The download + feature pipeline only runs when the cached CSV is missing.
if not os.path.isfile(hist_price_path):
  print("Downloading data...\n")

  def _fetch_constituent(row):
    """Download daily bars for one constituent row (tickers / strat_start / strat_end)."""
    return get_price(ticker=row.tickers,
                     start=row.strat_start,
                     end=row.strat_end,
                     interval='1d')

  # One download per constituent row, then stack the per-ticker frames.
  snp500_cons_hist_data = snp500_cons_cut_off.apply(_fetch_constituent, axis=1)
  snp500_hist = pd.concat(snp500_cons_hist_data.values)

  # Window settings for the feature transforms.
  rolling_window, rsi_window = '20d', '14d'
  short_window, long_window, signal_window = '12d', '26d', '9d'

  snp500_hist_transformed = data_transformer(
      snp500_hist,
      window=rolling_window,
      rsi_window=rsi_window,
      short_window=short_window,
      long_window=long_window,
      signal_window=signal_window
  )

  # Keep only rows whose Date lies within [strat_start, strat_end].
  in_range = (
      (snp500_hist_transformed['Date'] >= strat_start)
      & (snp500_hist_transformed['Date'] <= strat_end)
  )
  snp500_hist_transformed = snp500_hist_transformed[in_range].reset_index(drop=True)

  snp500_hist_transformed.to_csv(hist_price_path, index=False)
  print(f"Exported as {filename}")

Downloading data...



ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['BF.B']: YFPricesMissingError('possibly delisted; no price data found  (1d 2024-04-01 -> 2025-04-30)')
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['BRK.B']: YFTzMissingError('possibly delisted; no timezone found')


Exported as snp500_hist_cluster.csv


In [None]:
# Always reload from the cached CSV so this cell works whether or not the
# download cell did any work in this session.
snp500_hist_transformed = pd.read_csv(hist_price_path, encoding='utf-8')

# Columns fed to the model.
features = ["r_close", "norm_log_volume", "rsi", "macd", "signal"]
n_features = len(features)

# Dataset dimensions: distinct tickers and distinct dates in the file.
tickers = snp500_hist_transformed['Ticker'].unique()
n_stocks = len(tickers)
time_steps = len(snp500_hist_transformed['Date'].unique())

# Leading and trailing empty entries reproduce the blank lines around the summary.
summary_lines = [
    "",
    f"Input features: {features}",
    f"Number of input features: {n_features}",
    f"Number of tickers: {n_stocks}",
    f"Number of time_steps: {time_steps}",
    "",
    "",
]
print("\n".join(summary_lines))

Input features: ['r_close', 'norm_log_volume', 'rsi', 'macd', 'signal']
Number of input features: 5
Number of tickers: 500
Number of time_steps: 249



# Create Dataset

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns with one scaler fitted on every row
# (all tickers and dates pooled), then write the scaled values back in place.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(snp500_hist_transformed[features])
snp500_hist_transformed[features] = scaled_values

In [None]:
import tensorflow as tf

batch_size = config['batch_size']
data = []
real_data_metadata = []

# Build one (time_steps, n_features) array per ticker.
for ticker in tickers:
  ticker_rows = snp500_hist_transformed['Ticker'] == ticker
  stock_data = snp500_hist_transformed[ticker_rows].sort_values('Date')
  sequence = stock_data[features].values
  n_rows = len(stock_data)

  if n_rows != time_steps:
      print(f"{ticker} has {n_rows} time steps. Expected {time_steps}")
      if n_rows < time_steps:
          # Too short: append zero rows at the end up to time_steps.
          padding = np.zeros((time_steps - n_rows, n_features))
          sequence = np.vstack([sequence, padding])
      else:
          # Too long: keep only the first time_steps rows.
          sequence = sequence[:time_steps]

  data.append(sequence)

  # First/last date actually present for this ticker (before any padding).
  dates = stock_data['Date']
  real_data_metadata.append({
      'Ticker': ticker,
      'StartDate': dates.iloc[0],
      'EndDate': dates.iloc[-1]
  })

data = np.array(data)
print(f"Data shape: {data.shape}")

# Convert to tensors, then slice per ticker, batch and prefetch.
data_tensor = tf.convert_to_tensor(data, dtype=tf.float32)
dataset = tf.data.Dataset.from_tensor_slices(data_tensor)
dataset = dataset.batch(batch_size, drop_remainder=False)
dataset = dataset.prefetch(tf.data.AUTOTUNE)
n_batches = len(list(dataset))
print(f"Number of batches: {n_batches}")

real_data_metadata = pd.DataFrame(real_data_metadata)
print(f"Metadata shape: {real_data_metadata.shape}")

Data shape: (500, 249, 5)
Number of batches: 32
Metadata shape: (500, 3)
