In [20]:
import pandas as pd
from regime_ml.data.common.loaders import load_dataframe
from regime_ml.utils.config import load_configs
from regime_ml.regimes.hmm import HMMRegimeDetector

In [21]:
macro_cfg = load_configs()["macro_data"]["regime_universe"]
feat_path = macro_cfg["ready_features_path"]

In [22]:
df_feat = load_dataframe(feat_path)
df_feat

Unnamed: 0_level_0,T10Y3M_level_zscore_252,VIXCLS_level_zscore_63,NFCI_level_zscore_50,PCEPILFE_yoy_12_zscore_36,CFNAI_level_zscore_36
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-09-01,-2.782093,-0.740279,0.677723,0.887659,0.670846
2004-09-02,-2.545516,-1.203928,0.677723,0.887659,0.670846
2004-09-03,-2.320569,-1.444441,0.705355,0.887659,0.670846
2004-09-07,-2.466847,-1.285177,0.705355,0.887659,0.670846
2004-09-08,-2.677593,-1.267462,0.705355,0.887659,0.670846
...,...,...,...,...,...
2026-01-16,2.500448,-0.502612,-1.925796,-0.615876,-0.659302
2026-01-20,2.619164,1.063286,-1.925796,-0.615876,-0.659302
2026-01-21,2.373479,-0.096150,-1.925796,-0.615876,-0.659302
2026-01-22,2.291959,-0.553236,-1.925796,-0.615876,-0.659302


In [92]:
from regime_ml.regimes.evaluation import compare_models
import numpy as np

transmat_prior = np.eye(4) * 10 + np.ones((4, 4))

models = {
    'HMM_4_1': HMMRegimeDetector(n_regimes=4), #transmat_prior=transmat_prior),
    'HMM_4_2': HMMRegimeDetector(n_regimes=4, random_state=50, transmat_prior=transmat_prior),
    'HMM_4_3': HMMRegimeDetector(n_regimes=4, random_state=100, transmat_prior=transmat_prior),
    'HMM_4_4': HMMRegimeDetector(n_regimes=4, random_state=150, transmat_prior=transmat_prior),
    'HMM_4_5': HMMRegimeDetector(n_regimes=4, random_state=200, transmat_prior=transmat_prior),
}

comparison = compare_models(df_feat, models)
comparison

Unnamed: 0,model,log_likelihood,bic,aic,avg_persistence,n_transitions,entropy
1,HMM_4_2,-35365.685398,71590.061377,70931.370796,73.438356,72,1.97381
0,HMM_4_1,-35423.990415,71706.67141,71047.98083,67.0125,79,1.986308
3,HMM_4_4,-35499.158513,71857.007607,71198.317027,72.445946,73,1.981295
4,HMM_4_5,-35524.513059,71907.716697,71249.026117,58.912088,90,1.97917
2,HMM_4_3,-35860.717442,72580.125465,71921.434885,70.539474,75,1.883881


In [93]:
best_model_name = comparison.iloc[0]["model"]
best_model = models[best_model_name]

best_model.fit(df_feat)
regimes = best_model.predict(df_feat)
proba = best_model.predict_proba(df_feat)

# Get regime means
regime_means = best_model.get_regime_means()
regime_means_df = pd.DataFrame(
    regime_means,
        columns=df_feat.columns,
    index=[f'Regime_{i}' for i in range(best_model.n_regimes)] # type: ignore
)

print("\n=== Regime Characterization ===")
regime_means_df.round(2)

Even though the 'startprob_' attribute is set, it will be overwritten during initialization because 'init_params' contains 's'
Even though the 'transmat_' attribute is set, it will be overwritten during initialization because 'init_params' contains 't'
Even though the 'means_' attribute is set, it will be overwritten during initialization because 'init_params' contains 'm'


Even though the 'covars_' attribute is set, it will be overwritten during initialization because 'init_params' contains 'c'



=== Regime Characterization ===


Unnamed: 0,T10Y3M_level_zscore_252,VIXCLS_level_zscore_63,NFCI_level_zscore_50,PCEPILFE_yoy_12_zscore_36,CFNAI_level_zscore_36
Regime_0,-0.26,-0.71,-1.39,0.87,0.06
Regime_1,0.89,0.27,1.46,-0.54,-1.09
Regime_2,-1.67,-0.01,0.88,0.7,0.11
Regime_3,0.72,0.06,-1.08,-0.46,0.35


In [94]:
from scipy import stats
import numpy as np

def test_regime_separation(features: pd.DataFrame, regimes: np.ndarray):
    """Test if feature distributions differ significantly across regimes."""
    results = []
    
    for col in features.columns:
        # Get feature values for each regime
        regime_groups = [features[col][regimes == i].values 
                        for i in range(regimes.max() + 1)]
        
        # ANOVA F-test: are means different?
        f_stat, p_value = stats.f_oneway(*regime_groups)
        
        # Kruskal-Wallis: non-parametric test for distributions
        h_stat, p_kruskal = stats.kruskal(*regime_groups)
        
        results.append({
            'feature': col,
            'f_statistic': f_stat,
            'p_value_anova': p_value,
            'kruskal_h': h_stat,
            'p_value_kruskal': p_kruskal,
            'significant': 'YES' if p_value < 0.01 else 'NO'
        })
    
    return pd.DataFrame(results).sort_values('p_value_anova')

# Run test
separation_test = test_regime_separation(df_feat, regimes)
print("\n=== Statistical Separation Test ===")
separation_test


=== Statistical Separation Test ===


Unnamed: 0,feature,f_statistic,p_value_anova,kruskal_h,p_value_kruskal,significant
0,T10Y3M_level_zscore_252,2042.225339,0.0,2976.095111,0.0,YES
2,NFCI_level_zscore_50,3522.821884,0.0,3997.124557,0.0,YES
4,CFNAI_level_zscore_36,522.72136,5.873709e-298,1010.625406,8.915499e-219,YES
3,PCEPILFE_yoy_12_zscore_36,498.476867,1.1039730000000001e-285,1259.125882,1.087947e-272,YES
1,VIXCLS_level_zscore_63,133.416828,2.338937e-83,377.325212,1.804329e-81,YES


In [95]:
trans_matrix = best_model.get_transition_matrix()
trans_df = pd.DataFrame(
    trans_matrix,
    index=[f'From_R{i}' for i in range(best_model.n_regimes)], # type: ignore
    columns=[f'To_R{i}' for i in range(best_model.n_regimes)] # type: ignore
)

print("\n=== Transition Probabilities ===")
print(trans_df.round(3))

# Persistence: diagonal elements (probability of staying in regime)
persistence = np.diag(trans_matrix)
expected_duration = 1 / (1 - persistence)  # in days

print("\n=== Expected Regime Duration (days) ===")
for i, dur in enumerate(expected_duration):
    print(f"Regime {i}: {dur:.1f} days")


=== Transition Probabilities ===
         To_R0  To_R1  To_R2  To_R3
From_R0  0.977  0.000  0.003  0.020
From_R1  0.000  0.992  0.003  0.005
From_R2  0.005  0.002  0.992  0.001
From_R3  0.011  0.004  0.003  0.982

=== Expected Regime Duration (days) ===
Regime 0: 44.1 days
Regime 1: 118.0 days
Regime 2: 133.0 days
Regime 3: 54.4 days


In [None]:
# Add regime labels to dataframe
df_regimes = df_feat.copy()
df_regimes['regime'] = regimes
df_regimes['regime_name'] = df_regimes['regime'].map({
    0: 'Late Cycle',
    1: 'Crisis',
    2: 'Goldilocks',
    3: 'Stagflation'
    # Adjust names based on your regime_means_df interpretation
}) # type: ignore

# Find major regime periods
regime_periods = []
current_regime = df_regimes['regime_name'][0]
start_date = df_feat.index[0]

for i, r in enumerate(df_regimes['regime_name'][1:], 1):
    if r != current_regime:
        regime_periods.append({
            'regime': current_regime,
            'start': start_date,
            'end': df_feat.index[i-1],
            'duration_days': (df_feat.index[i-1] - start_date).days
        })
        current_regime = r
        start_date = df_feat.index[i]

# Add last period
regime_periods.append({
    'regime': current_regime,
    'start': start_date,
    'end': df_feat.index[-1],
    'duration_days': (df_feat.index[-1] - start_date).days
})

periods_df = pd.DataFrame(regime_periods)
print("\n=== Major Regime Periods ===")
print(periods_df[periods_df['duration_days'] > 10])  # Filter short-lived regimes


=== Major Regime Periods ===
       regime      start        end  duration_days
0  Goldilocks 2004-09-01 2026-01-23           7814



Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



In [97]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [106]:
from regime_ml.regimes import (
    plot_regime_timeseries,
    plot_regime_distributions,
    plot_regime_periods,
    plot_regime_confidence,
    create_regime_summary_table,
    plot_ticker_by_regime
)

# Load features and fit model
features = df_feat.copy()
best_model_name = comparison.iloc[0]["model"]
model = models[best_model_name]
model.fit(features)

# Get predictions
regimes = model.predict(features)

def smooth_regimes(regimes: np.ndarray, min_duration: int = 30) -> np.ndarray:
    """Remove regime periods shorter than min_duration days."""
    smoothed = regimes.copy()
    
    i = 0
    while i < len(smoothed):
        current_regime = smoothed[i]
        # Find next regime change
        j = i + 1
        while j < len(smoothed) and smoothed[j] == current_regime:
            j += 1
        
        duration = j - i
        
        # If too short, replace with neighboring regime
        if duration < min_duration and i > 0:
            smoothed[i:j] = smoothed[i-1]  # Use previous regime
        
        i = j
    
    return smoothed

# Apply
regimes = smooth_regimes(regimes, min_duration=30)

proba = model.predict_proba(features)

# Define regime names
regime_names = {
    0: 'Late Cycle',
    1: 'Crisis',
    2: 'Stagflation',
    3: 'Goldilocks'
}

# Create visualizations
fig1 = plot_ticker_by_regime("SPY", pd.to_datetime(features.index), regimes, regime_names)
fig1.show()

fig2 = plot_regime_timeseries(features, regimes, proba, regime_names)
fig2.show()

fig3 = plot_regime_distributions(features, regimes, regime_names)
fig3.show()

fig4 = plot_regime_periods(features, regimes, regime_names, min_duration_days=30)
fig4.show()

fig5 = plot_regime_confidence(features, regimes, proba, regime_names)
fig5.show()

# Create summary table
summary = create_regime_summary_table(
    regimes,
    model.get_regime_means(),
    list(features.columns),
    regime_names
)
summary

Even though the 'startprob_' attribute is set, it will be overwritten during initialization because 'init_params' contains 's'
Even though the 'transmat_' attribute is set, it will be overwritten during initialization because 'init_params' contains 't'
Even though the 'means_' attribute is set, it will be overwritten during initialization because 'init_params' contains 'm'


Even though the 'covars_' attribute is set, it will be overwritten during initialization because 'init_params' contains 'c'


Downloading SPY data from 2004-09-01 to 2026-01-23...


Unnamed: 0,T10Y3M_level_zscore_252,VIXCLS_level_zscore_63,NFCI_level_zscore_50,PCEPILFE_yoy_12_zscore_36,CFNAI_level_zscore_36,Observations,Percentage,Avg_Duration
Late Cycle,-0.26,-0.71,-1.39,0.87,0.06,1067,19.9,106.7
Crisis,0.89,0.27,1.46,-0.54,-1.09,1094,20.41,182.33
Stagflation,-1.67,-0.01,0.88,0.7,0.11,1910,35.63,191.0
Goldilocks,0.72,0.06,-1.08,-0.46,0.35,1290,24.06,129.0
