In [7]:
# %pip installs into the environment of the running kernel; the version is pinned
# to the release this notebook was executed with so a fresh run is reproducible.
%pip install -q hmmlearn==0.3.3

Collecting hmmlearn
  Downloading hmmlearn-0.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading hmmlearn-0.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.0/166.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hmmlearn
Successfully installed hmmlearn-0.3.3


In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from hmmlearn.hmm import GMMHMM
from sklearn.preprocessing import StandardScaler

# Import Data

In [15]:
# Remote CSV that holds the pre-processed feature table
url = "https://raw.githubusercontent.com/adisorn242/2026_WQU_CapstoneProject/main/df_feature_sup.csv"

# Load it with the first column as the index; parse_dates=True asks pandas
# to parse that index as dates
df_feature_sup = pd.read_csv(
    filepath_or_buffer=url,
    index_col=0,
    parse_dates=True,
)

In [16]:
# Preview the first five rows of the feature table
df_feature_sup.iloc[:5]

Unnamed: 0_level_0,SET_return,SET_lag_1,SET_lag_2,SET_lag_3,SET_lag_4,SET_lag_5,SET_lag_6,SET_lag_7,SET_lag_8,SET_lag_9,...,SET_RSI_lag_1,SET_MFI_lag_1,SET_ATR_lag_1,SET_OBV_lag_1,LeadingEconomicIndex,AuthorizedCapitalofNewlyRegisteredCompanies,ConstructionAreasPermitted,Exports,NumberofForeignTourists,MoneySupply
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01,0.00695,0.001943,-0.00075,0.002534,-0.003005,0.010617,0.01125,-0.003735,0.000274,0.004063,...,-1.11452,-0.711824,0.193737,-1.929349,-0.501977,0.900089,-2.053201,2.215498,2.21506,-1.744855
2016-02-01,-0.007522,0.00695,0.001943,-0.00075,0.002534,-0.003005,0.010617,0.01125,-0.003735,0.000274,...,-0.988727,-1.094983,0.179503,-1.62726,-0.225405,0.022668,-2.254407,2.266478,2.336311,-1.695038
2016-03-01,0.000604,-0.007522,0.00695,0.001943,-0.00075,0.002534,-0.003005,0.010617,0.01125,-0.003735,...,-0.685232,-0.838704,-0.160081,-1.386491,-0.220596,-0.183632,-2.331124,2.428116,2.445791,-1.669858
2016-04-01,0.003895,0.000604,-0.007522,0.00695,0.001943,-0.00075,0.002534,-0.003005,0.010617,0.01125,...,-0.034616,-0.975211,-0.236821,-1.142654,-0.278315,-0.009397,-2.152183,2.297558,2.419504,-1.645469
2016-05-01,-0.002111,0.003895,0.000604,-0.007522,0.00695,0.001943,-0.00075,0.002534,-0.003005,0.010617,...,-0.059371,-1.567659,-0.235455,-1.499618,-0.636655,-0.503373,-2.300853,2.315076,2.37915,-1.60746


In [17]:
# Keep only the SET_return column; .copy() detaches the result from the feature table
df_regime = df_feature_sup.loc[:, ['SET_return']].copy()

# Training subset: label-based slice on the date index (both end labels inclusive)
df_regime_train = df_regime.loc[slice('2016-01-01', '2022-12-31'), :].copy()

# Univariate GMM-HMM

In [18]:
# Observation matrix for the HMM: one row per index entry, a single SET_return column
X_train = df_regime_train[['SET_return']].to_numpy()

# 3 hidden states, each with a 2-component Gaussian mixture emission.
# random_state is fixed so repeated fits start from the same initialisation.
model = GMMHMM(
    n_components=3,
    n_mix=2,
    covariance_type="full",
    n_iter=1000,
    random_state=42
)
model.fit(X_train)

# Report how the fit ended, mirroring the check done for the multivariate model.
# hmmlearn's ConvergenceMonitor also flags `converged` when the iteration cap is
# reached, so additionally require that EM stopped before n_iter. A fit that only
# meets the tolerance on the very last iteration is conservatively reported as
# not converged.
if model.monitor_.converged and model.monitor_.iter < model.n_iter:
    print(f"Univariate model converged in {model.monitor_.iter} iterations.")
else:
    print("Univariate model did not converge within n_iter; regime labels may be unreliable.")

In [19]:
# Attach the state labels returned by model.predict as the 'Regime' column
df_regime_train = df_regime_train.assign(Regime=model.predict(X_train))

In [22]:
# Summarise the training frame: index range, columns, non-null counts and dtypes
df_regime_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 84 entries, 2016-01-01 to 2022-12-01
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SET_return  84 non-null     float64
 1   Regime      84 non-null     int64  
dtypes: float64(1), int64(1)
memory usage: 2.0 KB


In [24]:
# Per-regime summary of SET_return: average, dispersion and number of observations
regime_stats = (
    df_regime_train
    .groupby('Regime')['SET_return']
    .agg(['mean', 'std', 'count'])
)

# Annualised figures, treating each observation as one month:
#   annual mean       = monthly mean * 12
#   annual volatility = monthly std  * sqrt(12)
regime_stats = regime_stats.assign(
    annual_mean=regime_stats['mean'] * 12,
    annual_std=regime_stats['std'] * np.sqrt(12),
)

print("Training Regime Statistics (2016-2022)")
regime_stats

Training Regime Statistics (2016-2022)


Unnamed: 0_level_0,mean,std,count,annual_mean,annual_std
Regime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,-0.005516,0.005469,14,-0.06619,0.018946
1,-0.000472,0.011551,17,-0.005666,0.040014
2,-7.3e-05,0.003374,53,-0.000878,0.011687


# Multivariate GMM-HMM

In [26]:
# Columns fed to the multivariate model
multivariate_features = [
    'SET_return',
    'SET_OH_lag_1',
    'SET_OL_lag_1',
    'LeadingEconomicIndex',
]

# Independent copy of the feature table restricted to those columns
df_regime_multi = df_feature_sup.loc[:, multivariate_features].copy()

# Training subset for 2016-2022: label-based slice, both end labels inclusive
df_regime_train_multi = df_regime_multi.loc[slice('2016-01-01', '2022-12-31'), :].copy()

In [41]:
# 1. Standardize the features (original note: crucial to fix null eigenvalues).
#    The scaler is fitted on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(df_regime_train_multi[multivariate_features])

# 2. Re-initialize with more stable covariance settings.
#    covariance_type='diag' is often more stable than 'full' for a small
#    dataset (84 points) when null eigenvalues are encountered.
model_multi = GMMHMM(
    n_components=3,
    n_mix=2,
    covariance_type="diag",
    n_iter=2000,
    random_state=42
)

# 3. Fit on the scaled data
model_multi.fit(X_train_scaled)

# 4. Check convergence.
#    hmmlearn's ConvergenceMonitor also reports `converged` when the iteration cap
#    is reached, so that flag alone would print "Success!" for a fit that merely ran
#    out of iterations. Require that EM stopped before n_iter as well; a fit that
#    only meets the tolerance on the very last iteration is conservatively treated
#    as not converged.
fit_monitor = model_multi.monitor_
if fit_monitor.converged and fit_monitor.iter < model_multi.n_iter:
    print(f"Success! Model converged in {fit_monitor.iter} iterations.")
else:
    print("Still not converging. We may need to reduce n_mix or n_components.")

Success! Model converged in 30 iterations.


In [42]:
# 0. Label every training row with the state returned by the fitted multivariate model.
#    No other cell creates the 'Regime' column, so without this line the groupby
#    below raises KeyError after Restart Kernel -> Run All. X_train_scaled was built
#    from df_regime_train_multi, so the predictions are in the same row order.
df_regime_train_multi['Regime'] = model_multi.predict(X_train_scaled)

# 1. Mean, standard deviation and observation count of SET_return per regime
set_regime_stats = df_regime_train_multi.groupby('Regime')['SET_return'].agg(['mean', 'std', 'count'])

# 2. Annualize for interpretation (assuming monthly data):
#    annual mean = mean * 12, annual volatility = std * sqrt(12)
set_regime_stats['annual_mean'] = set_regime_stats['mean'] * 12
set_regime_stats['annual_std'] = set_regime_stats['std'] * (12 ** 0.5)

print("Training Regime SET_return Statistics (2016-2022)")
# Bare expression gives the rich table display, as in the univariate statistics cell
set_regime_stats

Training Regime SET_return Statistics (2016-2022)
            mean       std  count  annual_mean  annual_std
Regime                                                    
0      -0.000782  0.004093     25    -0.009380    0.014180
1       0.000563  0.004466     24     0.006756    0.015471
2      -0.002374  0.008514     35    -0.028490    0.029493


In [43]:
# Transition probabilities of the fitted model:
# rows are the 'From' state, columns are the 'To' state
raw_transmat = model_multi.transmat_

# Readable names for the integer state codes, per the identification
# 1=Bull, 0=Sideway, 2=Bear; listed in code order 0, 1, 2 to match the matrix axes
state_names = {1: 'Bull (1)', 0: 'Sideway (0)', 2: 'Bear (2)'}
index_names = [state_names[code] for code in (0, 1, 2)]

# Labelled table for reporting
df_transmat = pd.DataFrame(
    raw_transmat,
    index=["From " + name for name in index_names],
    columns=["To " + name for name in index_names],
)

print("Multivariate Transition Matrix (2016-2022)")
print(df_transmat.round(4))

Multivariate Transition Matrix (2016-2022)
                  To Sideway (0)  To Bull (1)  To Bear (2)
From Sideway (0)            0.48       0.5200       0.0000
From Bull (1)               1.00       0.0000       0.0000
From Bear (2)               0.00       0.0196       0.9804


In [44]:
# Short text tag for each integer state code
regime_map = {0: 'sw', 1: 'bull', 2: 'bear'}

# Add the tag as the 'Regime_Label' column; a code absent from the map would become NaN
df_regime_train_multi = df_regime_train_multi.assign(
    Regime_Label=df_regime_train_multi['Regime'].map(regime_map)
)

In [45]:
# Display the labelled training frame (feature columns plus Regime and Regime_Label)
df_regime_train_multi

Unnamed: 0_level_0,SET_return,SET_OH_lag_1,SET_OL_lag_1,LeadingEconomicIndex,Regime,Regime_Label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-01,0.006950,0.007224,-0.000179,-0.501977,0,sw
2016-02-01,-0.007522,0.006973,-0.003676,-0.225405,1,bull
2016-03-01,0.000604,0.001124,-0.008123,-0.220596,0,sw
2016-04-01,0.003895,0.004305,-0.003446,-0.278315,1,bull
2016-05-01,-0.002111,0.003895,-0.003508,-0.636655,0,sw
...,...,...,...,...,...,...
2022-08-01,0.003674,0.012553,-0.000944,0.363811,2,bear
2022-09-01,-0.000296,0.006817,-0.001784,0.481655,2,bear
2022-10-01,-0.003661,0.004681,-0.003629,0.474440,2,bear
2022-11-01,0.005654,0.002734,-0.005527,0.368621,2,bear


# Export Data

In [46]:
# Only the regime code and its text label are written out
export_cols = ['Regime', 'Regime_Label']

# index=True keeps the Date index in the CSV so the labels can be aligned later
df_regime_train_multi.to_csv(
    'training_regime_labels.csv',
    columns=export_cols,
    index=True,
)