In [51]:
import pandas as pd
import numpy as np
from hmmlearn.hmm import GaussianHMM
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

In [52]:
# First create new label based on stock performance relative to other stocks and benchmark
# Use the direct path to the file in the environment
input_file = 'sp1500_company_with_history_wide_new.csv'
df_raw = pd.read_csv(input_file)


# use log return - measurement of investment performance by ratio of final price to initial price
def log_return(final, initial):
    return np.log(final / initial)


# determine which dates exist in benchmark and company data

all_close_cols = [c for c in df_raw.columns if c.startswith("close_")]
trade_days = sorted(c.split("_", 1)[1] for c in all_close_cols)


# build relative daily returns feature in loop
log_cols = []

for prev_key, curr_key in zip(trade_days[:-1], trade_days[1:]):
    close_prev = f"close_{prev_key}"
    close_curr = f"close_{curr_key}"

    # name of new feature for relative return
    feature_name = f"log_ret{curr_key}"

    # stock daily log-return for the day
    df_raw[feature_name] = log_return(df_raw[close_curr], df_raw[close_prev])

    log_cols.append(feature_name)



df_regime_features = df_raw.copy()


# add distance to 52 week extremes
rng = (df_regime_features['fiftyTwoWeekHigh'] - df_regime_features['fiftyTwoWeekLow']).replace(0, np.nan)
df_regime_features['pos_in_52w'] = (df_regime_features['currentPrice'] - df_regime_features['fiftyTwoWeekLow']) / (rng + 1e-9)

# add distance to moving averages
df_regime_features['dist_ma50']  = (df_regime_features['currentPrice'] - df_regime_features['fiftyDayAverage']) / (df_regime_features['fiftyDayAverage'] + 1e-9)
df_regime_features['dist_ma200'] = (df_regime_features['currentPrice'] - df_regime_features['twoHundredDayAverage']) / (df_regime_features['twoHundredDayAverage'] + 1e-9)
df_regime_features['ma_cross']   = (df_regime_features['fiftyDayAverage'] - df_regime_features['twoHundredDayAverage']) / (df_regime_features['twoHundredDayAverage'] + 1e-9)

# add liquidity size, shares outstanding ratios
df_regime_features['log_mcap']  = np.log(df_regime_features['marketCap'] + 1)
# high outstanding shares = lots of flots is short and performance may be poor
df_regime_features['float_to_out'] = df_regime_features['floatShares'] / (df_regime_features['sharesOutstanding'] + 1e-9)

ret_df = df_regime_features[log_cols]

# add rolling momentum
df_regime_features["mom_20"] = ret_df.iloc[:, -20:].mean(axis=1)
df_regime_features["mom_60"] = ret_df.iloc[:, -60:].mean(axis=1)
df_regime_features["mom_120"] = ret_df.iloc[:, -120:].mean(axis=1)

# add rolling volatility - essential for RS-DBN
df_regime_features["voli_20"] = ret_df.iloc[:, -20:].std(axis=1)
df_regime_features["voli_60"] = ret_df.iloc[:, -60:].std(axis=1)

# add drawdown for bear detection
cum_ret = ret_df.cumsum(axis=1)

rolling_max = cum_ret.max(axis=1)
df_regime_features["max_drawdown"] = (cum_ret.iloc[:, -1] - rolling_max) / (rolling_max + 1e-9)

# add trend vs noise for buy the dip logic
df_regime_features["trend_strength"] = df_regime_features["mom_60"] / (df_regime_features["voli_60"] + 1e-9)

# add price to VWAP ratio 

for day in trade_days:
    # Column names
    col_close = f"close_{day}"
    col_vwap  = f"vwap_{day}"
    col_high  = f"high_{day}"
    col_low   = f"low_{day}"
    
    # if price > VWAP, buyers controlled the day
    if col_vwap in df_raw.columns:
        df_regime_features[f"price_to_vwap_{day}"] = df_raw[col_close] / (df_raw[col_vwap] + 1e-9)
        
    # 2. Intraday Volatility (Parkinson Volatility Proxy)
    # log(High/Low) is better than (High-Low)/Close for statistical properties
    df_regime_features[f"log_range_{day}"] = np.log(df_raw[col_high] / (df_raw[col_low] + 1e-9))

# add relative volume (stock liquidity)
# Get all volume columns in chronological order
vol_cols = [f"volume_{day}" for day in trade_days if f"volume_{day}" in df_raw.columns]

if vol_cols:
    # Extract volume sub-dataframe
    vol_df = df_raw[vol_cols]
    
    # Calculate 20-day rolling average of volume (axis=1 for columns)
    # We use a window of 20 days (approx 1 trading month)
    vol_rolling_mean = vol_df.rolling(window=20, axis=1).mean()
    
    # Calculate Relative Volume (Current / Average)
    # Fill NaN at the start (first 19 days) with 1.0 or similar
    rel_vol_df = vol_df / (vol_rolling_mean + 1e-9)
    
    # Rename columns to avoid collision and merge back
    rel_vol_df.columns = [c.replace("volume_", "rel_vol_") for c in rel_vol_df.columns]
    
    # Concatenate to main dataframe
    df_regime_features = pd.concat([df_regime_features, rel_vol_df], axis=1)


# add overnight gap column

# Create a list to track the new gap columns
gap_cols = []

# 1. Loop through days to calculate the gap for Day 2 to Day N
# We zip(trade_days[:-1], trade_days[1:]) so 'prev' is Day 1 and 'curr' is Day 2
for prev_key, curr_key in zip(trade_days[:-1], trade_days[1:]):
    col_open_curr = f"open_{curr_key}"
    col_close_prev = f"close_{prev_key}"
    
    feature_name = f"overnight_gap_{curr_key}"
    
    # Calculate Log Gap: log(Open_today / Close_yesterday)
    # Using log returns ensures symmetry and handles price scale differences
    df_regime_features[feature_name] = np.log(df_raw[col_open_curr] / (df_raw[col_close_prev] + 1e-9))
    
    gap_cols.append(feature_name)


  df_raw = pd.read_csv(input_file)
  df_raw[feature_name] = log_return(df_raw[close_curr], df_raw[close_prev])
  df_raw[feature_name] = log_return(df_raw[close_curr], df_raw[close_prev])
  df_raw[feature_name] = log_return(df_raw[close_curr], df_raw[close_prev])
  df_raw[feature_name] = log_return(df_raw[close_curr], df_raw[close_prev])
  df_raw[feature_name] = log_return(df_raw[close_curr], df_raw[close_prev])
  df_raw[feature_name] = log_return(df_raw[close_curr], df_raw[close_prev])
  df_raw[feature_name] = log_return(df_raw[close_curr], df_raw[close_prev])
  df_raw[feature_name] = log_return(df_raw[close_curr], df_raw[close_prev])
  df_raw[feature_name] = log_return(df_raw[close_curr], df_raw[close_prev])
  df_raw[feature_name] = log_return(df_raw[close_curr], df_raw[close_prev])
  df_raw[feature_name] = log_return(df_raw[close_curr], df_raw[close_prev])
  df_raw[feature_name] = log_return(df_raw[close_curr], df_raw[close_prev])
  df_raw[feature_name] = log_return(df_raw[close_curr

In [53]:
# now add the new label and clean the dataframe by dropping low value columns

def clean_data(df):
    df_clean = df.copy()
    # remove columns that have no value for analysis, have large amount of
    # missing data
    drop_cols = ['companyOfficers', 'website', 'phone', 'irWebsite',
                 'longBusinessSummary', 'address1', 'tradeable', 'quoteType',
                 'language', 'region', 'quoteSourceName', 'triggerable', 
                 'customPriceAlertConfidence', 'marketState',
                 'exchangeDataDelayedBy', 'sourceInterval', 'cryptoTradeable',
                 'shortName', 'longName', 'hasPrePostMarketData',
                 'corporateActions', 'messageBoardId', 'exchangeTimezoneName',
                 'exchangeTimezoneShortName', 'gmtOffSetMilliseconds', 'fax',
                 'market', 'esgPopulated', 'address2', 'displayName',
                 'ipoExpectedDate', 'prevName', 'nameChangeDate',
                 'industrySymbol', 'trailingPegRatio', 'lastSplitDate', 
                 'lastSplitFactor']

    df_clean.drop(drop_cols, inplace=True, axis=1)
    return df_clean

df_cleaned = clean_data(df_regime_features)
# show all columns
print(df_cleaned.columns.tolist())
print(df_cleaned['rel_vol_20250717'])

['city', 'state', 'zip', 'country', 'industry', 'industryKey', 'industryDisp', 'sector', 'sectorKey', 'sectorDisp', 'fullTimeEmployees', 'auditRisk', 'boardRisk', 'compensationRisk', 'shareHolderRightsRisk', 'overallRisk', 'governanceEpochDate', 'compensationAsOfEpochDate', 'executiveTeam', 'maxAge', 'priceHint', 'previousClose', 'open', 'dayLow', 'dayHigh', 'regularMarketPreviousClose', 'regularMarketOpen', 'regularMarketDayLow', 'regularMarketDayHigh', 'dividendRate', 'dividendYield', 'exDividendDate', 'payoutRatio', 'fiveYearAvgDividendYield', 'beta', 'trailingPE', 'forwardPE', 'volume', 'regularMarketVolume', 'averageVolume', 'averageVolume10days', 'averageDailyVolume10Day', 'bid', 'ask', 'bidSize', 'askSize', 'marketCap', 'fiftyTwoWeekLow', 'fiftyTwoWeekHigh', 'allTimeHigh', 'allTimeLow', 'priceToSalesTrailing12Months', 'fiftyDayAverage', 'twoHundredDayAverage', 'trailingAnnualDividendRate', 'trailingAnnualDividendYield', 'currency', 'enterpriseValue', 'profitMargins', 'floatShare

Now that we have the first part of the dataset we can begin setting up the Regime-Switching Dynamic Bayesian Network (RS-DBN)

First it is easier for the regime engine to have the time series data independent in a long format

In [54]:
ID_COL = "symbol" 

# Sanity: last log return column is "log_ret<last_day>"
last_log_ret_col = log_cols[-1]
latest_day = last_log_ret_col.replace("log_ret", "")

def build_panel_from_wide(df_wide, trade_days, id_col=ID_COL):
    # convert wide dataframe into long panel with one row per stock and day
    panel_rows = []

    for day in trade_days:
        row = pd.DataFrame()
        row[id_col] = df_wide[id_col]
        row["date"] = pd.to_datetime(day)

        # Core daily features (must exist)
        row["log_ret"] = df_wide.get(f"log_ret{day}")
        row["overnight_gap"] = df_wide.get(f"overnight_gap_{day}")
        row["log_range"] = df_wide.get(f"log_range_{day}")
        row["rel_vol"] = df_wide.get(f"rel_vol_{day}")

        panel_rows.append(row)

    panel_df = pd.concat(panel_rows, ignore_index=True)

    # drop rows where we are missing the main signal (log_ret)
    panel_df = panel_df.dropna(subset=["log_ret"])

    # sort by stock then date so sequences are ordered
    panel_df = panel_df.sort_values([id_col, "date"]).reset_index(drop=True)

    return panel_df


panel_df = build_panel_from_wide(df_cleaned, trade_days)
print(panel_df.tail().to_string(index=False))


symbol       date   log_ret  overnight_gap  log_range  rel_vol
   ZWS 2025-11-28 -0.008974       0.006833   0.016220 0.648874
   ZWS 2025-12-01  0.000838      -0.011173   0.021506 0.833960
   ZWS 2025-12-02  0.001674       0.005015   0.012602 1.170634
   ZWS 2025-12-03 -0.007767       0.001254   0.016730 0.960977
   ZWS 2025-12-04 -0.011233      -0.000632   0.018532 1.102343


  panel_df = pd.concat(panel_rows, ignore_index=True)


We will now do a Gaussian HMM over the small set of daily features with an HMM emission model to get the hidden states

In [55]:
emission_cols = ["log_ret", "log_range", "rel_vol", "overnight_gap"]

# remove all rows with any NaN values in the emission features
mask = panel_df[emission_cols].notna().all(axis=1)
panel_hmm = panel_df[mask].copy()

# build X and lengths from panel_hmm
X_raw = panel_hmm[emission_cols].values

# standardize the values
scaler_hmm = StandardScaler()
X = scaler_hmm.fit_transform(X_raw)

lengths = (
    panel_hmm.groupby(ID_COL)["date"]
    .size()
    .sort_index()
    .to_list()
)

# to speed up convergence we can initialize means of log_ret instead of giving HMM random init

panel_hmm['log_ret_std'] = X[:, 0] # first column is scaled log_ret

panel_hmm_sorted = panel_hmm.sort_values('log_ret_std')
n = len(panel_hmm_sorted)
third = n //3

# get the three slices
bear_slice = panel_hmm_sorted.iloc[:third]
sideways_slice = panel_hmm_sorted.iloc[third:2*third]
bull_slice = panel_hmm_sorted.iloc[2*third:]

# build initial means in standardized feature space
X_df = pd.DataFrame(X, index=panel_hmm.index, columns=emission_cols)
mu_bear = X_df.loc[bear_slice.index].mean().values
mu_side = X_df.loc[sideways_slice.index].mean().values
mu_bull = X_df.loc[bull_slice.index].mean().values

init_means = np.vstack([mu_bear, mu_side, mu_bull])

# get covariances estimated from slice
def cov_from_slice(idx):
    X_s = X_df.loc[idx].values
    return np.cov(X_s.T) + 1e-6 * np.eye(X_s.shape[1])

cov_bear = cov_from_slice(bear_slice.index)
cov_side = cov_from_slice(sideways_slice.index)
cov_bull = cov_from_slice(bull_slice.index)

# initialize covariates
init_covars = np.stack([cov_bear, cov_side, cov_bull], axis=0)

# fit 3-state Gaussian HMM
hmm_model = GaussianHMM(
    n_components=3,
    covariance_type="full",
    n_iter=200,
    min_covar=1e-5,
    init_params='st', # don't re-init means or covars since we set them
    params='stmc',
    random_state=123
)

hmm_model.means_ = init_means
hmm_model.covars_ = init_covars
hmm_model.fit(X, lengths)

# posterior probabilities P(state k | observations)
posteriors = hmm_model.predict_proba(X)
states = hmm_model.predict(X)  # Viterbi state sequence

panel_hmm["regime_raw"] = states
panel_hmm[["p_state0", "p_state1", "p_state2"]] = posteriors

hmm_cols = ['regime_raw', "p_state0", "p_state1", "p_state2"]
# drop duplicate cols from panel df if they exist from previous run
panel_df = panel_df.drop(columns=hmm_cols, errors='ignore')

# merge HMM outputs back onto full panel_df by (symbol, date)
panel_df = panel_df.merge(
    panel_hmm[[ID_COL, "date", "regime_raw", "p_state0", "p_state1", "p_state2"]],
    on=[ID_COL, "date"],
    how="left",
)


In [56]:
# compute mean return per raw state
state_means = (
    panel_df.dropna(subset=["regime_raw"])
            .groupby("regime_raw")["log_ret"]
            .mean()
            .sort_values()
)

# force regimes to int
ordered_states = [int(s) for s in state_means.index.to_list()]

state_to_label = {
    ordered_states[0]: "bear",
    ordered_states[1]: "sideways",
    ordered_states[2]: "bull",
}

panel_df["regime_raw_int"] = panel_df["regime_raw"].astype("Int64")
panel_df["regime_label"] = panel_df["regime_raw_int"].map(state_to_label)

# map posterior columns to bull/bear/sideways
label_to_state = {v: k for k, v in state_to_label.items()}  # values are ints now

for label, k in label_to_state.items():
    panel_df[f"p_{label}"] = panel_df[f"p_state{int(k)}"]


# latest date in the panel
latest_date = panel_df["date"].max()

latest_slice = panel_df.loc[
    (panel_df["date"] == latest_date)
    & panel_df["log_ret"].notna()
    & panel_df["p_bull"].notna()
].copy()

# only look at names where price fell on the last day
latest_slice = latest_slice[latest_slice["log_ret"] < 0]

# score = "buy-the-dip" signal
latest_slice["signal_score"] = (
    (latest_slice["p_bull"] + latest_slice["p_sideways"]) * (-latest_slice["log_ret"])
)

latest_slice = latest_slice.sort_values("signal_score", ascending=False)

top_candidates = latest_slice[[ID_COL, "log_ret", "p_bull", "p_sideways", "signal_score"]]
print(top_candidates.head(10))


       symbol   log_ret        p_bull  p_sideways  signal_score
248261    OLN -0.048231  4.641672e-12    0.969129      0.046742
114003   ENOV -0.046130  7.461285e-12    0.966444      0.044582
341728   URBN -0.044676  3.146695e-07    0.982800      0.043907
212836     MD -0.043886  6.709285e-11    0.960928      0.042171
367375   WYNN -0.042636  2.499703e-07    0.968084      0.041275
68338     CHH -0.042341  1.612213e-11    0.965542      0.040882
302414    SLG -0.040593  5.970336e-08    0.995548      0.040412
23203     ANF -0.040462  6.507342e-09    0.991723      0.040127
192433    KOP -0.040180  1.697122e-08    0.993350      0.039913
42193     BDN -0.039944  4.405570e-06    0.996635      0.039810


Next we compute the RS-DBN
First create transition dataset - given previous regime and current features, what is next regime?

In [57]:
# pick columns that are slow moving relative to daily returns, stock specific and economically interpretable as regime persistance drivers
fundamental_cols = [
    "log_mcap",
    "float_to_out",
    "pos_in_52w",
    "dist_ma50",
    "dist_ma200",
    "ma_cross",
    "mom_20",
    "mom_60",
    "mom_120",
    "voli_20",
    "voli_60",
    "trend_strength",
    "max_drawdown",
    "trailingPE",
    "forwardPE",
    "priceToBook",
    "profitMargins",
    "operatingMargins",
    "returnOnEquity",
    "returnOnAssets",
]
# create the fundamental dataframe
fund_df = df_cleaned[[ID_COL] + fundamental_cols]
# merge the fundamental dataframe with the panel dataframe
panel_with_fund = panel_df.merge(fund_df, on=ID_COL, how="left")
# sort the values
panel_with_fund = panel_with_fund.sort_values([ID_COL, "date"])
# create a new label for previous regime
panel_with_fund["prev_regime_raw"] = (
    panel_with_fund.groupby(ID_COL)["regime_raw"].shift(1)
)

# add 3 day cumulative log returns
panel_with_fund['drop_3d'] = (
    panel_with_fund.groupby(ID_COL)['log_ret']
    .rolling(3).sum()
    .reset_index(level=0, drop=True)
)

# creates new dataframe and drops null values
transitions_df = panel_with_fund.dropna(subset=["prev_regime_raw", "regime_raw"]).copy()

transitions_df_clean = transitions_df.copy()

# replace infinities with NaN
transitions_df_clean[fundamental_cols] = transitions_df_clean[fundamental_cols].replace(
    [np.inf, -np.inf],
    np.nan
)

transitions_df_clean = transitions_df_clean.dropna(subset=fundamental_cols)


In [58]:
# now run multinominal logistic regression on new dataframe to predict regime_raw from prev_regime_raw

X_fund = transitions_df_clean[fundamental_cols]
prev_regime = transitions_df_clean['prev_regime_raw'].astype(int)
next_regime = transitions_df_clean['regime_raw'].astype(int)

# scale the fundamental data
scaler = StandardScaler()
X_fund_scaled = scaler.fit_transform(X_fund)

# get the onehot values of prev regime
prev_regime_onehot = pd.get_dummies(prev_regime, prefix='prev_regime', dtype=int)
X_all = np.hstack([X_fund_scaled, prev_regime_onehot.values])

# add class weights as bull case is rare
classes = np.unique(next_regime)
class_weights = dict(
    zip(classes, compute_class_weight('balanced', classes=classes, y=next_regime))
)

# now run multinominal logistic regression
logit_trans = LogisticRegression(
    multi_class='multinomial',
    penalty='elasticnet', # encourages group sparsity, stabilizing transitions
    solver='saga', # required for elastic net
    l1_ratio=0.2, 
    C=0.3, # stronger regularization than usual to reduce regime-flip noise
    class_weight=class_weights,
    max_iter=2000,
    random_state=123,
)

logit_trans.fit(X_all, next_regime)
# get the states/class order used by logistic regression
logit_classes = logit_trans.classes_
n_states = len(logit_classes)




Now we must combine both models
1. HMM gives current regime distribution
2. Logistic model gives fundamentals-conditioned transition matrix

We must now combine them -> map states and define bullish_stickiness as 
bullish_stickiness = P(bull) + P(sideways)

The final buy-the-dip score is given by bullish_stickiness X * (-drop_3d)

In [59]:
# map state index -> position in class_ array
state_to_classpos = {int(s): int(i) for i, s in enumerate(logit_classes)}

def compute_adjusted_probs_row(row):
    # if fundamentals missing, fall back to HMM probabilities
    if row[fundamental_cols].isna().any():
        out = {}
        for label, state in label_to_state.items():
            out[f"p_{label}_adj"] = row.get(f"p_{label}", np.nan)
        return pd.Series(out)
    
    # scale the fundamental columns
    f_vec = row[fundamental_cols].values.reshape(1, -1)
    f_scaled = scaler.transform(f_vec)

    # build a fundamental-conditioned transition matrix
    A = np.zeros((n_states, n_states))

    # one-hot basis for prev_regime in classes_ order
    eye = np.eye(n_states, dtype=float)

    for i, prev_state in enumerate(logit_classes):
        one_hot_prev = eye[i].reshape(1, -1) # length of n_states
        X_row = np.hstack([f_scaled, one_hot_prev])
        probs_next = logit_trans.predict_proba(X_row)[0]

        A[int(prev_state), :] = probs_next
    
    # get the current HMM posterior pi_t in state-index order
    pi_t = np.zeros(n_states)
    for s in logit_classes:
        pi_t[state_to_classpos[int(s)]] = row.get(f'p_state{int(s)}', 0.0)

    
    pi_next = pi_t @ A

    out = {}
    for label, state in label_to_state.items():
        idx = state_to_classpos[int(state)]
        out[f"p_{label}_adj"] = pi_next[idx]

    return pd.Series(out)
    

In [63]:
# compute final score and print top candidates

latest_date = panel_with_fund['date'].max()

# get latest slice to be stocks with drop in past 3 days
latest_slice = panel_with_fund.loc[
    (panel_with_fund['date'] == latest_date)
    & panel_with_fund['drop_3d'].notna()
    & panel_with_fund['p_bull'].notna()
].copy()

# only consider names where price fell in the last 3 days
latest_slice = latest_slice[latest_slice['drop_3d'] < 0]

adj_probs = latest_slice.apply(compute_adjusted_probs_row, axis=1)
latest_slice = pd.concat([latest_slice, adj_probs], axis=1)

# compute final RS-DBD score:
latest_slice['signal_score_adj'] = (
    (latest_slice['p_bull_adj'] + latest_slice['p_sideways_adj'])
    * (-latest_slice['drop_3d'])
)

# rank by adjusted score
latest_slice = latest_slice.sort_values('signal_score_adj', ascending=False)

top_candidates = latest_slice[
    [ID_COL,
    'drop_3d',
    'p_bull_adj', 'p_sideways_adj',
    'signal_score_adj']
]

# print the top 10 companies 
print(top_candidates.head(10).to_string(index=False))



symbol   drop_3d  p_bull_adj  p_sideways_adj  signal_score_adj
   SIG -0.118332    0.338181        0.335740          0.079746
  TMDX -0.081190    0.399116        0.359228          0.061570
    MD -0.077243    0.401981        0.380109          0.060411
   SLG -0.069832    0.349121        0.493307          0.058828
  SHAK -0.072216    0.318197        0.476193          0.057368
   DOC -0.057126    0.590193        0.405822          0.056898
   COR -0.064487    0.344104        0.506495          0.054852
   CPK -0.059625    0.310782        0.605874          0.054655
  AVTR -0.052414    0.008177        0.991362          0.052390
   NWN -0.056990    0.316924        0.601397          0.052335




In [None]:
debug_cols = (
    [ID_COL, "date", "log_ret", "drop_3d"] +
    [f"p_state{k}" for k in range(n_states)])

# Make sure we only keep columns that actually exist
debug_cols = [c for c in debug_cols if c in latest_slice.columns]

debug_view = latest_slice[debug_cols].head(10)
print(debug_view.to_string(index=False))

symbol       date   log_ret   drop_3d  p_state0  p_state1     p_state2
   SIG 2025-12-04 -0.045975 -0.118332  0.231802  0.768198 2.658794e-12
  TMDX 2025-12-04 -0.010115 -0.081190  0.014027  0.950867 3.510659e-02
    MD 2025-12-04 -0.043886 -0.077243  0.039072  0.960928 6.709285e-11
   SLG 2025-12-04 -0.040593 -0.069832  0.004452  0.995548 5.970336e-08
  SHAK 2025-12-04 -0.037842 -0.072216  0.128835  0.871165 1.313646e-10
   DOC 2025-12-04 -0.006382 -0.057126  0.003985  0.405822 5.901929e-01
   COR 2025-12-04  0.006895 -0.064487  0.003140  0.704691 2.921684e-01
   CPK 2025-12-04 -0.016922 -0.059625  0.000029  0.993139 6.832308e-03
  AVTR 2025-12-04 -0.012478 -0.052414  0.000461  0.991362 8.176698e-03
   NWN 2025-12-04 -0.035084 -0.056990  0.000420  0.999580 3.276775e-07


In [None]:
# transition state diagram
debug_cols = (
    [ID_COL] +
    [f"p_state{k}" for k in range(n_states)])

# Make sure we only keep columns that actually exist
debug_cols = [c for c in debug_cols if c in latest_slice.columns]

debug_view = latest_slice[debug_cols].head(10)
print(debug_view.to_string(index=False))

symbol  p_state0  p_state1     p_state2
   SIG  0.231802  0.768198 2.658794e-12
  TMDX  0.014027  0.950867 3.510659e-02
    MD  0.039072  0.960928 6.709285e-11
   SLG  0.004452  0.995548 5.970336e-08
  SHAK  0.128835  0.871165 1.313646e-10
   DOC  0.003985  0.405822 5.901929e-01
   COR  0.003140  0.704691 2.921684e-01
   CPK  0.000029  0.993139 6.832308e-03
  AVTR  0.000461  0.991362 8.176698e-03
   NWN  0.000420  0.999580 3.276775e-07


In [68]:
debug_cols = (
    [ID_COL, "prev_regime_raw"])

# Make sure we only keep columns that actually exist
debug_cols = [c for c in debug_cols if c in latest_slice.columns]

debug_view = latest_slice[debug_cols].head(10)
print(debug_view.to_string(index=False))

symbol  prev_regime_raw
   SIG              2.0
  TMDX              1.0
    MD              1.0
   SLG              1.0
  SHAK              1.0
   DOC              0.0
   COR              0.0
   CPK              1.0
  AVTR              1.0
   NWN              1.0


In [None]:
# final columns
debug_cols = (
    [ID_COL, "p_bull_adj", "p_sideways_adj", "signal_score_adj"])

# Make sure we only keep columns that actually exist
debug_cols = [c for c in debug_cols if c in latest_slice.columns]

debug_view = latest_slice[debug_cols].head(10)
print(debug_view.to_string(index=False))

symbol  p_bull_adj  p_sideways_adj  signal_score_adj
   SIG    0.338181        0.335740          0.079746
  TMDX    0.399116        0.359228          0.061570
    MD    0.401981        0.380109          0.060411
   SLG    0.349121        0.493307          0.058828
  SHAK    0.318197        0.476193          0.057368
   DOC    0.590193        0.405822          0.056898
   COR    0.344104        0.506495          0.054852
   CPK    0.310782        0.605874          0.054655
  AVTR    0.008177        0.991362          0.052390
   NWN    0.316924        0.601397          0.052335
