In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from pandas.tseries.offsets import BDay
import warnings
warnings.filterwarnings("ignore")
import json
from IPython.display import display_html

In [None]:
# Colab-only setup: mount Google Drive so the project folder is reachable
# from this runtime.
from google.colab import drive
import os
drive.mount('/content/drive')
# Project directory on the mounted drive; later cells build file paths from `path`.
path = '/content/drive/MyDrive/Colab Notebooks/Imperial MLDS/DeepTimeSeriesClustering'
# Make the project directory the current working directory.
os.chdir(path)

Mounted at /content/drive


In [None]:
# Load the run configuration (read further down via `config[...]`) from the
# JSON file stored in the project directory.
params_filename = "params.json"
params_path = os.path.join(path, params_filename)

# Pin the encoding: JSON text is UTF-8, whereas open() would otherwise fall
# back to the platform's locale encoding.
with open(params_path, 'r', encoding='utf-8') as f:
  config = json.load(f)

In [None]:
# Set random seeds for reproducibility.
# TensorFlow is imported here because this cell runs before the cell that
# holds the modelling imports; without it `tf` is undefined on a fresh kernel
# and "Restart & Run All" stops with a NameError.
import tensorflow as tf

np.random.seed(42)
tf.random.set_seed(42)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score, davies_bouldin_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed, MultiHeadAttention, LayerNormalization
from tensorflow.keras.regularizers import L2
from tensorflow.keras.metrics import Mean
import tensorflow.keras.backend as ops
from hdbscan import HDBSCAN

# Per-regime sampling statistics for the synthetic data set.
# Keys are the regime ids; each entry holds a `<feature>_mean` / `<feature>_std`
# pair for every simulated feature.
cluster_params = {
    # Bullish regime
    0: dict(
        r_close_mean=0.02, r_close_std=0.01,
        norm_log_volume_mean=1.5, norm_log_volume_std=0.1,
        rsi_mean=70.0, rsi_std=10.0,
        macd_mean=10.0, macd_std=2.0,
        signal_mean=8.0, signal_std=1.5,
    ),
    # Neutral regime
    1: dict(
        r_close_mean=0.0, r_close_std=0.005,
        norm_log_volume_mean=1.0, norm_log_volume_std=0.05,
        rsi_mean=50.0, rsi_std=5.0,
        macd_mean=0.0, macd_std=1.0,
        signal_mean=0.0, signal_std=0.8,
    ),
    # Bearish regime
    2: dict(
        r_close_mean=-0.02, r_close_std=0.01,
        norm_log_volume_mean=0.7, norm_log_volume_std=0.1,
        rsi_mean=30.0, rsi_std=10.0,
        macd_mean=-10.0, macd_std=2.0,
        signal_mean=-8.0, signal_std=1.5,
    ),
}

# Function to generate synthetic data
def generate_synthetic_stock_data(n_stocks, cluster_params, time_steps, n_features, n_clusters):
    """Simulate per-stock feature series drawn from per-cluster normal distributions.

    Stocks are assigned to clusters in contiguous index blocks of
    ``n_stocks // n_clusters``; the last cluster also takes the remainder.
    For every stock and business day, five features are sampled from the
    cluster's normal distribution and clipped to a fixed range.

    Parameters
    ----------
    n_stocks : int
        Number of synthetic tickers (named ``STOCK_0`` ... ``STOCK_{n_stocks-1}``).
    cluster_params : dict
        Maps each cluster id in ``range(n_clusters)`` to a dict holding
        ``<feature>_mean`` and ``<feature>_std`` for each of the five features.
    time_steps : int
        Number of business-day observations per stock, starting 2024-06-01.
    n_features : int
        Not used: the feature set is fixed by the five columns generated here.
        Kept so existing callers keep working.
    n_clusters : int
        Number of clusters to simulate.

    Returns
    -------
    data_tensor : numpy.ndarray
        Standardised features, shape ``(n_stocks, time_steps, 5)``. The scaler
        is fitted per feature over all stocks and time steps together.
    df : pandas.DataFrame
        Unscaled long-format data with columns ``Ticker``, ``Date`` and the
        five features, ordered by ticker and then by date.
    labels : numpy.ndarray
        Cluster id of every stock, shape ``(n_stocks,)``.
    scaler : sklearn.preprocessing.StandardScaler
        The fitted scaler.

    Notes
    -----
    Sampling uses the global NumPy random state, so seed it beforehand for
    reproducible output. Summary tables are shown with ``display`` as a side
    effect.
    """
    features = ['r_close', 'norm_log_volume', 'rsi', 'macd', 'signal']
    # Hard limits applied after sampling, for simplicity and numerical stability.
    clip_bounds = {
        'r_close': (-0.15, 0.15),
        'norm_log_volume': (0.5, 2.0),
        'rsi': (0.0, 100.0),
        'macd': (-100.0, 100.0),
        'signal': (-100.0, 100.0),
    }

    stocks_per_cluster = n_stocks // n_clusters
    dates = pd.date_range(start='2024-06-01', periods=time_steps, freq='B')
    tickers = [f"STOCK_{i}" for i in range(n_stocks)]
    labels = np.zeros(n_stocks, dtype=int)

    # One list of (n_cluster_stocks, time_steps) arrays per feature.
    samples_by_feature = {name: [] for name in features}

    for cluster in range(n_clusters):
        start_idx = cluster * stocks_per_cluster
        # The last cluster absorbs whatever the integer division left over.
        end_idx = (cluster + 1) * stocks_per_cluster if cluster < n_clusters - 1 else n_stocks
        n = end_idx - start_idx
        params = cluster_params[cluster]

        # Draw the features in the fixed `features` order so that a given seed
        # always consumes the random stream in the same sequence.
        for name in features:
            sample = np.random.normal(
                loc=params[f'{name}_mean'],
                scale=params[f'{name}_std'],
                size=(n, time_steps)
            )
            if name == 'r_close':
                # Blend every step with the previous step's raw draw to add
                # serial correlation. The right-hand side is fully evaluated
                # before the assignment, so this is a two-point weighted
                # average of the original draws, not a recursive filter.
                sample[:, 1:] = 0.8 * sample[:, :-1] + 0.2 * sample[:, 1:]
            lower, upper = clip_bounds[name]
            samples_by_feature[name].append(np.clip(sample, lower, upper))

        labels[start_idx:end_idx] = cluster

    # Assemble (n_stocks, time_steps, n_feat) directly from the sampled arrays
    # instead of round-tripping through per-row dicts and per-ticker filters.
    raw_tensor = np.stack(
        [np.concatenate(samples_by_feature[name], axis=0) for name in features],
        axis=-1
    )
    raw_flat = raw_tensor.reshape(-1, len(features))  # rows: ticker-major, then date

    df = pd.DataFrame({
        'Ticker': np.repeat(np.asarray(tickers, dtype=object), time_steps),
        'Date': np.tile(dates.values, n_stocks),
        **{name: raw_flat[:, col] for col, name in enumerate(features)},
    })

    print("Synthetic Dataset Statistics Before Scaling:")
    display(df[features].describe())
    print("NaN Counts:")
    display(df[features].isna().sum())

    # Standardise each feature over all stocks and time steps at once.
    scaler = StandardScaler()
    data_tensor_reshaped = np.nan_to_num(raw_flat, nan=0.0)
    data_tensor_reshaped = scaler.fit_transform(data_tensor_reshaped)
    data_tensor_reshaped = np.nan_to_num(data_tensor_reshaped, nan=0.0)
    data_tensor = data_tensor_reshaped.reshape(n_stocks, time_steps, len(features))

    print("Synthetic Dataset Statistics After Scaling:")
    display(pd.DataFrame(data_tensor_reshaped, columns=features).describe())
    print("NaN Counts After Scaling:", np.isnan(data_tensor).sum())

    return data_tensor, df, labels, scaler

def compute_momentum_scores_df(df, lookbacks=(5, 10, 20)):
    """Compute one volume-weighted momentum score per ticker.

    For each ticker the rows are sorted by ``Date``. For every lookback the
    last ``lookback`` returns are weighted by that window's volume relative to
    its mean and summed; the ticker's score is the mean over all lookbacks.
    A lookback longer than the ticker's history contributes ``0.0`` and prints
    a warning.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with columns ``Ticker``, ``Date``, ``r_close`` and
        ``norm_log_volume``.
    lookbacks : iterable of int, optional
        Window lengths in rows; each must be at least 1.

    Returns
    -------
    numpy.ndarray
        Scores ordered by each ticker's first appearance in ``df``.

    Raises
    ------
    ValueError
        If a lookback is smaller than 1. ``values[-0:]`` would silently select
        the whole series rather than an empty window.
    """
    lookbacks = list(lookbacks)
    for lookback in lookbacks:
        if lookback < 1:
            raise ValueError(f"lookback must be a positive integer, got {lookback}")

    momentum_scores = []
    # A single grouping pass replaces a full-frame boolean scan per ticker;
    # sort=False keeps the first-appearance order that `unique()` would give.
    for ticker, stock_data in df.groupby('Ticker', sort=False, dropna=False):
        stock_data = stock_data.sort_values('Date')
        returns = stock_data['r_close'].values
        volume = stock_data['norm_log_volume'].values
        ticker_scores = []
        for lookback in lookbacks:
            if len(returns) >= lookback:
                # The small constant guards against division by a zero mean volume.
                volume_mean = np.mean(volume[-lookback:]) + 1e-6
                volume_weights = volume[-lookback:] / volume_mean
                cum_return = np.sum(returns[-lookback:] * volume_weights)
                ticker_scores.append(cum_return)
            else:
                print(f"Warning: {ticker} has {len(returns)} time steps, insufficient for lookback {lookback}")
                ticker_scores.append(0.0)
        momentum_scores.append(np.mean(ticker_scores) if ticker_scores else 0.0)
    return np.array(momentum_scores)

# Parameters
n_stocks = 500      # number of synthetic tickers
time_steps = 249    # business-day observations per ticker
n_features = 5      # features per observation
n_clusters = 3      # one cluster per entry in `cluster_params`
latent_dim = config['latent_dim']
num_heads = config['num_heads']
batch_size = config['batch_size']


# Generate synthetic dataset
synthetic_data_tensor, synthetic_df, ground_truth_labels, scaler = generate_synthetic_stock_data(
    n_stocks, cluster_params, time_steps, n_features, n_clusters
)

# Compute momentum scores for synthetic data
momentum_scores = compute_momentum_scores_df(synthetic_df)

# Attach each ticker's score to its rows by ticker rather than by position,
# so the column stays aligned even if the row order or the number of rows per
# ticker changes. Scores come back in first-appearance order, which is the
# order `unique()` yields.
ticker_order = synthetic_df['Ticker'].unique()
assert len(momentum_scores) == len(ticker_order), "expected one momentum score per ticker"
synthetic_df['Momentum'] = synthetic_df['Ticker'].map(dict(zip(ticker_order, momentum_scores)))

Synthetic Dataset Statistics Before Scaling:


Unnamed: 0,r_close,norm_log_volume,rsi,macd,signal
count,124500.0,124500.0,124500.0,124500.0,124500.0
mean,-8.1e-05,1.065341,49.937303,-0.041012,-0.040367
std,0.017817,0.340964,18.459061,8.361421,6.670272
min,-0.051199,0.5,0.0,-19.147101,-14.312054
25%,-0.014547,0.765463,36.517074,-8.692711,-7.042206
50%,-0.000123,0.999291,49.945256,-0.021801,-0.004424
75%,0.014356,1.431969,63.331784,8.640704,6.963966
max,0.058938,1.998422,100.0,18.539733,14.146815


NaN Counts:


Unnamed: 0,0
r_close,0
norm_log_volume,0
rsi,0
macd,0
signal,0


Synthetic Dataset Statistics After Scaling:


Unnamed: 0,r_close,norm_log_volume,rsi,macd,signal
count,124500.0,124500.0,124500.0,124500.0,124500.0
mean,2.803362e-15,-1.731327e-15,1.4183e-14,4.934648e-15,-5.763786e-15
std,1.000004,1.000004,1.000004,1.000004,1.000004
min,-2.869078,-1.658074,-2.705311,-2.285038,-2.139605
25%,-0.8119307,-0.8795053,-0.7270295,-1.03472,-1.049712
50%,-0.002337116,-0.1937165,0.0004308534,0.002297665,0.005388536
75%,0.8102857,1.075271,0.7256347,1.038311,1.050086
max,3.312527,2.736605,2.712104,2.222208,2.126935


NaN Counts After Scaling: 0
