# Market Regime Detection: HMM vs Wasserstein Clustering



## Install Required Libraries

In [None]:
!pip install yfinance hmmlearn numpy pandas matplotlib scipy scikit-learn

## Part 1: Hidden Markov Models (HMM) for Regime Detection

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
from hmmlearn.hmm import GaussianHMM
import matplotlib.pyplot as plt

# Pull SPY prices and turn the closes into daily log returns
data = yf.download('SPY', start='2015-3-05', end='2025-03-20')
close_ratio = data['Close'] / data['Close'].shift(1)
returns = np.log(close_ratio).dropna()

# Column-vector view of the returns (one observation per row, one feature)
observations = returns.values.reshape(-1, 1)

# Two-state Gaussian HMM, then decode the most likely state for every day
model = GaussianHMM(n_components=2, covariance_type="full", n_iter=200, random_state=42)
model.fit(observations)
states = model.predict(observations)

# Price chart with one marker series per decoded regime
plt.figure(figsize=(12, 6))
for regime in (0, 1):
    in_regime = states == regime
    regime_dates = returns.index[in_regime]
    plt.plot(regime_dates, data['Close'].loc[regime_dates],
             '.', markersize=4, label=f'Regime {regime}')
plt.legend()
plt.title('HMM-Based Market Regime Detection')
plt.ylabel('SPY Price')
plt.show()

# Per-regime mean and standard deviation of the daily log returns
for regime in (0, 1):
    regime_returns = returns.values[states == regime]
    print(f"Regime {regime}: Mean={regime_returns.mean():.4f}, Std={regime_returns.std():.4f}")

## Additional Analysis: HMM Transition Probabilities

In [None]:
# Report the fitted state-transition probabilities
# (row = current regime, column = next regime)
transmat = model.transmat_
print("\nTransition Matrix:")
print(transmat)
print("\nInterpretation:")
print(f"Probability of staying in Regime 0: {transmat[0, 0]:.4f}")
print(f"Probability of switching from Regime 0 to 1: {transmat[0, 1]:.4f}")
print(f"Probability of staying in Regime 1: {transmat[1, 1]:.4f}")
print(f"Probability of switching from Regime 1 to 0: {transmat[1, 0]:.4f}")

# Report each regime's emission parameters; the standard deviation is the
# square root of the single entry of that regime's covariance matrix
print("\nRegime Parameters:")
for state, (mean_vec, cov_mat) in enumerate(zip(model.means_, model.covars_)):
    print(f"Regime {state}:")
    print(f"  Mean (μ): {mean_vec[0]:.6f}")
    print(f"  Std Dev (σ): {np.sqrt(cov_mat[0][0]):.6f}")

## Part 2: Wasserstein Distance-Based Regime Detection

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
from scipy.stats import wasserstein_distance
from sklearn.manifold import MDS
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Download data.
# NOTE: this rebinds `data` and `returns` from Part 1. After this cell `returns`
# is a plain NumPy array (no date index) covering a different date range.
data = yf.download('SPY', start='2015-01-01', end='2025-01-01')

# Depending on the yfinance version, data['Close'] can be a one-column DataFrame
# rather than a Series. Reduce it to a Series so every window below is 1-D,
# which is the input shape wasserstein_distance is documented for.
close = data['Close']
if isinstance(close, pd.DataFrame):
    close = close.iloc[:, 0]

# Keep the indexed Series around so window dates come from the returns
# themselves instead of a positional offset into `data.index`.
log_returns = np.log(close / close.shift(1)).dropna()
returns = log_returns.values

# Create rolling windows
window_size = 20
step_size = 10
segments = []
segment_dates = []

# The `+ 1` keeps the last full window when it ends exactly on the final return.
for i in range(0, len(returns) - window_size + 1, step_size):
    segments.append(returns[i:i + window_size])
    # Store the end date of each window for plotting (date of its last return)
    segment_dates.append(log_returns.index[i + window_size - 1])

if not segments:
    raise ValueError(
        f"Need at least {window_size} returns to build a window, got {len(returns)}"
    )

# Compute Wasserstein distance matrix (symmetric, zero diagonal)
n_segments = len(segments)
distance_matrix = np.zeros((n_segments, n_segments))

for i in range(n_segments):
    for j in range(i + 1, n_segments):
        dist = wasserstein_distance(segments[i], segments[j])
        distance_matrix[i, j] = dist
        distance_matrix[j, i] = dist

# Embed into 2D space using MDS (to visualize)
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
embedding = mds.fit_transform(distance_matrix)

# Cluster in embedded space
n_clusters = 2
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(embedding)

# Visualize clusters: each window is drawn at its end date and closing price
plt.figure(figsize=(12, 6))
colors = ['blue', 'red', 'green', 'orange']

for cluster in range(n_clusters):
    cluster_dates = [date for date, label in zip(segment_dates, labels) if label == cluster]
    cluster_prices = close.loc[cluster_dates]
    plt.scatter(cluster_dates, cluster_prices,
                c=colors[cluster], s=10, alpha=0.6, label=f'Regime {cluster}')

plt.legend()
plt.title('Market Regime Detection using Wasserstein Distance')
plt.ylabel('SPY Price')
plt.xlabel('Date')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Print cluster statistics.
# Windows overlap (step_size < window_size), so a return can be counted in
# more than one window when the windows of a cluster are pooled.
for cluster in range(n_clusters):
    cluster_segments = [seg for seg, label in zip(segments, labels) if label == cluster]
    all_returns = np.concatenate(cluster_segments)
    print(f"Regime {cluster}: Mean={all_returns.mean():.4f}, Std={all_returns.std():.4f}, "
          f"Count={len(cluster_segments)} windows")

## Additional Visualization: 2D Embedding of Regime Clusters

In [None]:
# Scatter the 2-D MDS coordinates of every window, coloured by its cluster label
fig_mds, ax_mds = plt.subplots(figsize=(10, 6))
points = ax_mds.scatter(
    embedding[:, 0],
    embedding[:, 1],
    c=labels,
    cmap='viridis',
    s=50,
    alpha=0.6,
)
fig_mds.colorbar(points, ax=ax_mds, label='Cluster')
ax_mds.set_title('2D MDS Embedding of Market Regimes (Wasserstein Distance)')
ax_mds.set_xlabel('MDS Dimension 1')
ax_mds.set_ylabel('MDS Dimension 2')
ax_mds.grid(True, alpha=0.3)
fig_mds.tight_layout()
plt.show()

## Comparison: Distribution Shapes by Regime

In [None]:
# Pool the windowed returns of each regime once; both the histograms and the
# summary table below read from these two lookups.
pooled_returns = {}
window_counts = {}
for regime in range(n_clusters):
    member_windows = [seg for seg, lab in zip(segments, labels) if lab == regime]
    window_counts[regime] = len(member_windows)
    pooled_returns[regime] = np.concatenate(member_windows)

# One histogram panel per regime, with the pooled mean marked
fig, axes = plt.subplots(1, n_clusters, figsize=(14, 5))

for regime, panel in zip(range(n_clusters), axes):
    sample = pooled_returns[regime]
    panel.hist(sample, bins=50, alpha=0.7, color=colors[regime], edgecolor='black')
    panel.axvline(sample.mean(), color='red', linestyle='--', linewidth=2, label='Mean')
    panel.set_title(f'Regime {regime} Return Distribution')
    panel.set_xlabel('Log Returns')
    panel.set_ylabel('Frequency')
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary table: moments, extremes and window count for each regime
print("\nDetailed Regime Statistics:")
for regime in range(n_clusters):
    sample = pooled_returns[regime]
    sample_series = pd.Series(sample)
    print(f"\nRegime {regime}:")
    print(f"  Mean: {sample.mean():.6f}")
    print(f"  Std Dev: {sample.std():.6f}")
    print(f"  Skewness: {sample_series.skew():.6f}")
    print(f"  Kurtosis: {sample_series.kurtosis():.6f}")
    print(f"  Min: {sample.min():.6f}")
    print(f"  Max: {sample.max():.6f}")
    print(f"  Windows: {window_counts[regime]}")

## Experiment: Different Numbers of Regimes

In [None]:
# Sweep the number of clusters and score each K-Means fit on the MDS embedding
from sklearn.metrics import silhouette_score

k_range = range(2, 6)
silhouette_scores, inertias = [], []

for k in k_range:
    candidate = KMeans(n_clusters=k, random_state=42)
    candidate_labels = candidate.fit_predict(embedding)
    silhouette_scores.append(silhouette_score(embedding, candidate_labels))
    inertias.append(candidate.inertia_)

# Left panel: inertia (elbow method); right panel: silhouette score
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
elbow_ax, silhouette_ax = axes

elbow_ax.plot(k_range, inertias, marker='o', linewidth=2)
elbow_ax.set_xlabel('Number of Clusters (k)')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method')
elbow_ax.grid(True, alpha=0.3)

silhouette_ax.plot(k_range, silhouette_scores, marker='o', linewidth=2, color='orange')
silhouette_ax.set_xlabel('Number of Clusters (k)')
silhouette_ax.set_ylabel('Silhouette Score')
silhouette_ax.set_title('Silhouette Analysis')
silhouette_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# The k with the highest silhouette score is reported as the best choice
print("\nOptimal number of clusters:")
best_position = np.argmax(silhouette_scores)
optimal_k = k_range[best_position]
print(f"Based on silhouette score: k = {optimal_k}")

## Side-by-Side Comparison: HMM vs Wasserstein

In [None]:
# Create a comparison plot
#
# By the time this cell runs, Part 2 has rebound `returns` to a NumPy array
# (which has no `.index`) and `data` to a different date range, so the HMM
# panel cannot be built from those names. Rebuild the Part 1 price and return
# series under separate names and decode the regimes with the fitted `model`.
hmm_data = yf.download('SPY', start='2015-3-05', end='2025-03-20')
hmm_close = hmm_data['Close']
hmm_returns = np.log(hmm_close / hmm_close.shift(1)).dropna()
hmm_states = model.predict(hmm_returns.values.reshape(-1, 1))

fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# HMM results: one marker series per decoded regime
for state in range(2):
    mask = (hmm_states == state)
    state_dates = hmm_returns.index[mask]
    axes[0].plot(state_dates, hmm_close.loc[state_dates],
                 '.', markersize=3, label=f'HMM Regime {state}', alpha=0.6)
axes[0].set_title('HMM-Based Market Regime Detection')
axes[0].set_ylabel('SPY Price')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Wasserstein results: each window is drawn at its end date and closing price,
# using the Part 2 `data`, `segment_dates`, `labels`, `n_clusters` and `colors`
for cluster in range(n_clusters):
    cluster_dates = [date for date, label in zip(segment_dates, labels) if label == cluster]
    cluster_prices = data['Close'].loc[cluster_dates]
    axes[1].scatter(cluster_dates, cluster_prices,
                    c=colors[cluster], s=10, alpha=0.6, label=f'Wasserstein Regime {cluster}')
axes[1].set_title('Wasserstein Distance-Based Market Regime Detection')
axes[1].set_ylabel('SPY Price')
axes[1].set_xlabel('Date')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()