In [1]:
import numpy as np
import pandas as pd
from scipy.linalg import cholesky

In [5]:
# Size of the synthetic panel
T = 2_000  # number of simulated time steps
N = 10     # number of assets
K = 3      # number of latent factors (Market, SMB, HML)

# Seed NumPy's global random state so every run draws the same numbers
np.random.seed(42)

# === Generate Realistic Fama-French Factors ===

def generate_ff_factors(T):
    """Simulate T periods of three correlated Fama-French style factors.

    Column 0 is the market factor (Rm-Rf): an AR(1) conditional mean with a
    GARCH(1,1)-style conditional variance. Columns 1 and 2 are SMB and HML,
    each a discrete mean-reverting process around zero. All three are driven
    by standard-normal innovations that are correlated through the Cholesky
    factor of ``factor_corr``; the correlation is imposed on the innovations,
    not on the factor levels.

    Draws come from NumPy's global legacy random state, so call
    ``np.random.seed`` beforehand for reproducible output.

    Parameters
    ----------
    T : int
        Number of time steps to simulate.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(T, 3)`` with columns (Market, SMB, HML). Row 0 is
        the all-zero initial state.
    """
    # Target correlation of the factor innovations
    factor_corr = np.array([
        [1.0,  0.2, -0.4],  # Market vs SMB, Market vs HML
        [0.2,  1.0, -0.2],  # SMB vs Market, SMB vs HML
        [-0.4, -0.2, 1.0],  # HML vs Market, HML vs SMB
    ])
    # The factor count follows from the matrix itself, so the function does
    # not depend on a module-level constant being defined (or being 3).
    n_factors = factor_corr.shape[0]

    # Lower-triangular factor: z @ chol_lower.T has correlation factor_corr
    chol_lower = cholesky(factor_corr, lower=True)

    # Market factor: AR(1) mean parameters
    market_phi = 0.08   # slight persistence
    market_mu = 0.0008  # small positive per-step drift

    # Market factor: GARCH(1,1) variance parameters
    omega_mkt = 0.00001
    alpha_mkt = 0.1
    beta_mkt = 0.85

    # SMB: mean reversion speed, long-term mean, innovation volatility
    smb_kappa = 0.4
    smb_theta = 0.0
    smb_sigma = 0.003

    # HML: slower mean reversion, slightly higher innovation volatility
    hml_kappa = 0.3
    hml_theta = 0.0
    hml_sigma = 0.004

    # An earlier version simulated each factor once with independent shocks
    # and then overwrote every value in the loop below. That dead pass is gone,
    # but the T + 2*(T-1) normal draws it consumed are still discarded here so
    # the random stream -- and therefore the seeded dataset -- is unchanged.
    np.random.randn(T + 2 * max(T - 1, 0))

    # Independent shocks, then impose the cross-factor correlation
    independent_innovations = np.random.randn(T, n_factors)
    correlated_innovations = independent_innovations @ chol_lower.T

    factors = np.zeros((T, n_factors))
    # Start the conditional variance at its unconditional level
    sigma2_mkt = np.ones(T) * omega_mkt / (1 - alpha_mkt - beta_mkt)

    for t in range(1, T):
        # Market factor with GARCH variance.
        # NOTE: the ARCH term uses the squared demeaned factor level from the
        # previous step rather than the previous pure innovation.
        sigma2_mkt[t] = omega_mkt + alpha_mkt * (factors[t-1, 0] - market_mu)**2 + beta_mkt * sigma2_mkt[t-1]
        factors[t, 0] = market_mu + market_phi * factors[t-1, 0] + np.sqrt(sigma2_mkt[t]) * correlated_innovations[t, 0]

        # SMB factor (mean-reverting)
        factors[t, 1] = factors[t-1, 1] + smb_kappa * (smb_theta - factors[t-1, 1]) + smb_sigma * correlated_innovations[t, 1]

        # HML factor (mean-reverting)
        factors[t, 2] = factors[t-1, 2] + hml_kappa * (hml_theta - factors[t-1, 2]) + hml_sigma * correlated_innovations[t, 2]

    return factors

# Simulate the factor paths: one row per time step, one column per factor
# (Market, SMB, HML). Uses the global NumPy random state seeded above.
F = generate_ff_factors(T)

# === Generate Realistic Factor Loadings ===
def generate_ff_loadings(N, K):
    """Draw a random (N x K) matrix of three-factor loadings.

    Column 0 holds market betas, column 1 SMB loadings and column 2 HML
    loadings; any columns beyond the third stay zero. Values are drawn from
    NumPy's global random state.

    Parameters
    ----------
    N : int
        Number of assets (rows).
    K : int
        Number of factor columns; columns 0-2 are written, so K must be >= 3.

    Returns
    -------
    numpy.ndarray
        Loading matrix of shape (N, K).
    """
    loadings = np.zeros((N, K))

    # Market beta: centred on 1.0, then confined to [0.3, 2.0]
    loadings[:, 0] = np.clip(np.random.normal(1.0, 0.3, N), 0.3, 2.0)

    half = N // 2
    third = N // 3

    # (column, row range, mean, std) for each group of assets, in draw order
    groups = (
        (1, slice(0, half), 0.5, 0.2),             # SMB: small-cap tilt
        (1, slice(half, N), -0.3, 0.2),            # SMB: large-cap tilt
        (2, slice(0, third), 0.4, 0.2),            # HML: value assets
        (2, slice(third, 2 * third), -0.2, 0.1),   # HML: growth assets
        (2, slice(2 * third, N), 0.0, 0.1),        # HML: neutral (the remainder)
    )
    for column, rows, mean, spread in groups:
        loadings[rows, column] = np.random.normal(mean, spread, rows.stop - rows.start)

    return loadings

# Draw each asset's exposure to the three factors
B = generate_ff_loadings(N, K)

# Systematic (factor-driven) part of the returns:
# one row per time step, one column per asset
linear_signal = np.matmul(F, B.T)  # (T x N)

# === GARCH(1,1) noise generator (idiosyncratic risk) ===
def garch_noise(T, N, omega=0.00005, alpha=0.05, beta=0.90):
    """Simulate N independent GARCH(1,1) noise series of length T.

    Each series follows
    ``sigma2[t] = omega + alpha * eps[t-1]**2 + beta * sigma2[t-1]`` and
    ``eps[t] = z[t] * sqrt(sigma2[t])`` with standard-normal ``z`` taken from
    NumPy's global random state. The variance starts at
    ``omega / (1 - alpha - beta)`` and the first row of the result is zero.

    Parameters
    ----------
    T : int
        Number of time steps (rows).
    N : int
        Number of series (columns).
    omega, alpha, beta : float
        GARCH(1,1) constant, ARCH and GARCH coefficients.

    Returns
    -------
    numpy.ndarray
        Noise array of shape (T, N).
    """
    shocks = np.random.randn(T, N)
    noise = np.zeros((T, N))

    # Only the previous step's variance is needed, so carry a single row
    variance = np.full(N, omega, dtype=float) / (1 - alpha - beta)

    for step in range(1, T):
        variance = omega + alpha * noise[step - 1] ** 2 + beta * variance
        noise[step] = shocks[step] * np.sqrt(variance)

    return noise

# Idiosyncratic (asset-specific) GARCH shocks, one column per asset
epsilon = garch_noise(T, N)

# Returns = factor-driven part + idiosyncratic part, shrunk by a constant 0.15
# to bring them down to a daily-return magnitude (per-asset std is roughly
# 0.45%-0.59% in the recorded run).
# NOTE: only X is rescaled; F and B are saved unscaled below, so the saved
# returns equal 0.15 * (F @ B.T + epsilon).
X = (linear_signal + epsilon) * 0.15

# Label everything and wrap in DataFrames
asset_labels = [f"Asset_{i+1}" for i in range(N)]
X_df = pd.DataFrame(X, columns=asset_labels)
F_df = pd.DataFrame(F, columns=["Market_Factor", "SMB_Factor", "HML_Factor"])
B_df = pd.DataFrame(
    B,
    index=asset_labels,
    columns=["Market_Beta", "SMB_Loading", "HML_Loading"],
)

# Persist to CSV; only the loadings file keeps its index (the asset names)
X_df.to_csv("assetReturns_ff3factor.csv", index=False)
F_df.to_csv("latentFactors_ff3factor.csv", index=False)
B_df.to_csv("factorLoadings_ff3factor.csv")

# Report summary statistics of what was generated
print("=== Fama-French 3-Factor Synthetic Dataset Generated ===")
print("\nFactor Statistics:")
print(f"Market Factor - Mean: {F[:, 0].mean():.6f}, Std: {F[:, 0].std():.6f}")
print(f"SMB Factor - Mean: {F[:, 1].mean():.6f}, Std: {F[:, 1].std():.6f}")
print(f"HML Factor - Mean: {F[:, 2].mean():.6f}, Std: {F[:, 2].std():.6f}")

print("\nFactor Correlations:")
print(np.corrcoef(F.T))

# Per-asset mean and volatility over time, reported as [min, max] across assets
asset_means = X.mean(axis=0)
asset_vols = X.std(axis=0)
print("\nAsset Return Statistics:")
print(f"Mean Return Range: [{asset_means.min():.6f}, {asset_means.max():.6f}]")
print(f"Volatility Range: [{asset_vols.min():.6f}, {asset_vols.max():.6f}]")

print("\nFiles saved:")
print("- assetReturns_ff3factor.csv")
print("- latentFactors_ff3factor.csv")
print("- factorLoadings_ff3factor.csv")

=== Fama-French 3-Factor Synthetic Dataset Generated ===

Factor Statistics:
Market Factor - Mean: 0.000838, Std: 0.014867
SMB Factor - Mean: -0.000027, Std: 0.003672
HML Factor - Mean: -0.000269, Std: 0.005780

Factor Correlations:
[[ 1.          0.19164769 -0.26027148]
 [ 0.19164769  1.         -0.29147426]
 [-0.26027148 -0.29147426  1.        ]]

Asset Return Statistics:
Mean Return Range: [-0.000005, 0.000327]
Volatility Range: [0.004514, 0.005856]

Files saved:
- assetReturns_ff3factor.csv
- latentFactors_ff3factor.csv
- factorLoadings_ff3factor.csv
