In [1]:
# imports 
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from scipy.linalg import orthogonal_procrustes
from sklearn.decomposition import PCA
import os
import random
from torch.utils.data import DataLoader

In [9]:
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


In [10]:
# theorem computation
def compute_optimal_A_b_mu(X_np, r):
    mu = np.mean(X_np, axis=0)
    cov = (X_np - mu).T @ (X_np - mu)
    U, S, _ = np.linalg.svd(cov)
    Ur = U[:, :r]  
    
    A = Ur @ Ur.T  
    b = np.zeros(r)  
    
    return A, b, mu, Ur

In [11]:
# plain linear autoencoder 
class ClassicAffineAutoencoder(nn.Module):
    def __init__(self, input_dim, r):
        super().__init__()
        self.input_dim = input_dim
        self.r = r
        
        self.projection = nn.Linear(input_dim, input_dim, bias=True)
        
        self.decoder = nn.Linear(r, input_dim, bias=True)
        
    def encoder(self, x):
        projected = self.projection(x)  
        return projected[:, :self.r]  

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat

In [12]:
# optimal autoencoder 
class OptimalAffineAutoencoder(nn.Module):
    def __init__(self, input_dim, r, Ur, mu):
        super().__init__()
        self.Ur = torch.tensor(Ur, dtype=torch.float32)  
        self.mu = torch.tensor(mu, dtype=torch.float32)

    def encoder(self, x):
        x_centered = x - self.mu
        return x_centered @ self.Ur  

    def decoder(self, z):
        return z @ self.Ur.T + self.mu  

    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z)


In [56]:
# nonlinear autoencoder
class NonlinearAutoencoder(nn.Module):
    def __init__(self, input_dim, bottleneck_dim, hidden_dim=5):
        super().__init__()
        
        # encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Linear(hidden_dim, bottleneck_dim)
        )
        
        # decoder:
        self.decoder = nn.Sequential(
            nn.Linear(bottleneck_dim, hidden_dim),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Linear(hidden_dim, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat
    

In [14]:
# autoencoder training loop
def train_autoencoder(model, train_loader, val_loader, num_epochs, lr=1e-3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            recon = model(batch)
            loss = criterion(recon, batch)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item() * batch.size(0)

        avg_train_loss = total_train_loss / len(train_loader.dataset)
        train_losses.append(avg_train_loss)

        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch.to(device)
                recon = model(batch)
                loss = criterion(recon, batch)
                total_val_loss += loss.item() * batch.size(0)

        avg_val_loss = total_val_loss / len(val_loader.dataset)
        val_losses.append(avg_val_loss)


    return model, train_losses, val_losses

In [15]:
# validation function only code for the optimal affine autoencoder
def valOnlyOptimalAffineAutoencoder(model, val_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    model.eval()
    total_loss = 0
    criterion = nn.MSELoss()

    with torch.no_grad():
        for batch in val_loader:
            batch = batch.to(device)
            recon = model(batch)
            loss = criterion(recon, batch)
            total_loss += loss.item() * batch.size(0)

    avg_loss = total_loss / len(val_loader.dataset)
    return avg_loss 

In [None]:
# optimal and PCA and nonlinear models + OPA + MSE and factor analysis 
X_df = pd.read_csv("assetReturns_garch.csv")
X_np = X_df.to_numpy().astype(np.float32)
X_tensor = torch.tensor(X_np)

print(f"Data shape: {X_np.shape}")

# set dims and latent space size 
input_dim = X_np.shape[1]
r = 3  

# compute the optimal params 
A, b, mu, Ur = compute_optimal_A_b_mu(X_np, r)


results = {
    'optimal_mse': [],    
    'optimal_factors': [],  
}

#split 
n_samples = X_tensor.shape[0]
train_size = int(0.8 * n_samples)
val_size = n_samples - train_size

#slice
train_data = X_tensor[:train_size]
train_indices = train_data.indices
val_data = X_tensor[train_size:]
val_indices = np.arange(train_size, len(X_np))
val_dates = X_df.index[val_indices]

# create the data loaders
g = torch.Generator()
g.manual_seed(seed)
batch_size = 64
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    

model_optimal = OptimalAffineAutoencoder(input_dim, 3, Ur, mu)
losses_optimal_val = valOnlyOptimalAffineAutoencoder(model_optimal, val_loader) 

results['optimal_mse'].append(losses_optimal_val)

X_train_np = train_data.numpy()
X_val_np = val_data.numpy()

# load ground truth latent factors and slice val rows
F_true_full = pd.read_csv("latentFactors_garch.csv").to_numpy().astype(np.float32)
F_true_tensor = torch.tensor(F_true_full[val_indices])

 # PCA
pca = PCA(n_components=r)
pca.fit(X_train_np)


X_val_recon = pca.inverse_transform(pca.transform(X_val_np))

pca_mse = np.mean((X_val_np - X_val_recon) ** 2)
    
Z_pca = pca.transform(X_val_np)

F = F_true_tensor.numpy()
R_pca, _ = orthogonal_procrustes(Z_pca, F)
Z_pca_aligned = Z_pca @ R_pca


corr_pca = np.abs([
        np.corrcoef(Z_pca_aligned[:, i], F[:, i])[0, 1]
        for i in range(F.shape[1])
])
    
X_val_reconstructed = pca.inverse_transform(Z_pca)
mse_pca = np.mean((X_val_np - X_val_reconstructed) ** 2)

results.setdefault('pca_mse', []).append(mse_pca)
results.setdefault('pca_factors', []).append(corr_pca)


val_tensor = X_tensor[val_indices]

def aligned_corr(model, X_val, F_true_val):
    with torch.no_grad():
        Z = model.encoder(X_val).cpu().numpy()
        F = F_true_val.cpu().numpy()
    R, _ = orthogonal_procrustes(Z, F)
    Z_aligned = Z @ R
    return np.abs([
        np.corrcoef(Z_aligned[:, i], F[:, i])[0, 1]
        for i in range(F.shape[1])
    ])

# compute and store factor correlations
results['optimal_factors'].append(
    aligned_corr(model_optimal, val_tensor, F_true_tensor)
)


def print_results_summary(results):

    optimal_mse = np.array(results['optimal_mse'])
    pca_mse = np.array(results['pca_mse'])  

    print(f"\nOptimal Autoencoder:")
    print(f" MSE: {optimal_mse.mean():.8f} ")

    print(f"\nPCA Baseline:")
    print(f"MSE: {pca_mse.mean():.8f} ")

    
# factor analysis results
    print(f"\n FACTOR RECOVERY ANALYSIS")

    optimal_factors = np.array(results['optimal_factors'])
    pca_factors = np.array(results['pca_factors'])  
    

    print(f"\nPer-Factor Correlations:")
    print(f"{'Factor':<8} {'Optimal':<10}{'PCA':<10}")
    
    for i in range(optimal_factors.shape[1]):
        factor_optimal = optimal_factors[:, i].mean()
        factor_pca = pca_factors[:, i].mean()
        print(f"{i+1:<8} {factor_optimal:<10.4f} {factor_pca:<10.4f}")

print_results_summary(results)  

Data shape: (2000, 10)

Optimal Autoencoder:
 MSE: 0.00001494 

PCA Baseline:
MSE: 0.00001562 

 FACTOR RECOVERY ANALYSIS

Per-Factor Correlations:
Factor   Optimal   PCA       
1        0.8052     0.8057    
2        0.1584     0.0886    
3        0.1030     0.0613    


In [None]:
# classic and nonlinear models in 100 run loop + OPA + MSE and factor analysis 
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

results = {
    'linear_train_mse': [],
    'linear_val_mse': [],
    'linear_factors': [],
    'linear_analysis': [],
    'nonlinear_train_mse': [],
    'nonlinear_val_mse': [],
    'nonlinear_factors': [],
    'nonlinear_analysis': []
}

val_tensor = X_tensor[val_indices]
for run in range(100):
   # progress meter
    if (run + 1) % 10 == 0:
        print(f"Completed run {run + 1}/100")
    
    # init/train model
    modellinear = ClassicAffineAutoencoder(input_dim, r).to(device)
    modelNonlinear = NonlinearAutoencoder(input_dim, r).to(device)
    modellinear, train_losslinear, val_losslinear = train_autoencoder(
        modellinear, train_loader, val_loader, num_epochs=150, lr=0.001)
    results['linear_train_mse'].append(train_losslinear[-1])  
    results['linear_val_mse'].append(val_losslinear[-1])    
    modelNonlinear, train_lossNonlinear, val_lossNonlinear = train_autoencoder(
        modelNonlinear, train_loader, val_loader, num_epochs=100, lr=0.001) 
    results['nonlinear_train_mse'].append(train_lossNonlinear[-1])  
    results['nonlinear_val_mse'].append(val_lossNonlinear[-1]) 
    

    linear_factors = modellinear.encoder(val_data).detach().cpu().numpy()
    results['linear_factors'].append(linear_factors)

    nonlinear_factors = modelNonlinear.encoder(val_data).detach().cpu().numpy()
    results['nonlinear_factors'].append(nonlinear_factors)
    
    
    # compute and store factor correlations 
    results['linear_analysis'].append(
        aligned_corr(modellinear, val_tensor, F_true_tensor)
    )

    results['nonlinear_analysis'].append(
        aligned_corr(modelNonlinear, val_tensor, F_true_tensor)
    )

print(f"  Linear Train MSE:    {np.mean(results['linear_train_mse']):.8f} ± {np.std(results['linear_train_mse']):.8f}")
print(f"  Linear Val MSE:      {np.mean(results['linear_val_mse']):.8f} ± {np.std(results['linear_val_mse']):.8f}")
print(f"  Nonlinear Train MSE:    {np.mean(results['nonlinear_train_mse']):.8f} ± {np.std(results['nonlinear_train_mse']):.8f}")
print(f"  Nonlinear Val MSE:      {np.mean(results['nonlinear_val_mse']):.8f} ± {np.std(results['nonlinear_val_mse']):.8f}")


def print_results_summary(results):
    # factor analysis results
    linear_factors = np.array(results['linear_analysis'])  
    nonlinear_factors = np.array(results['nonlinear_analysis'])
     
    
    print(f"\nPer-Factor Correlations (mean across runs):")
    print(f"{'Factor':<8} {'Linear':<12} {'Nonlinear':<12}")
    
    for i in range(linear_factors.shape[1]):
        factor_linear_mean = linear_factors[:, i].mean()
        factor_linear_std = linear_factors[:, i].std()
        factor_nonlinear_mean = nonlinear_factors[:,i].mean()
        factor_nonlinear_std = nonlinear_factors[:,i].std()

        print(f"{i+1:<8} {factor_linear_mean:.4f}±{factor_linear_std:.4f} {factor_nonlinear_mean:.4f}+{factor_nonlinear_std:.4f}")

print_results_summary(results)

Completed run 10/100
Completed run 20/100
Completed run 30/100
Completed run 40/100
Completed run 50/100
Completed run 60/100
Completed run 70/100
Completed run 80/100
Completed run 90/100
Completed run 100/100
  Linear Train MSE:    0.00001873 ± 0.00000091
  Linear Val MSE:      0.00001948 ± 0.00000110
  Nonlinear Train MSE:    0.00002518 ± 0.00000130
  Nonlinear Val MSE:      0.00002623 ± 0.00000144

Per-Factor Correlations (mean across runs):
Factor   Linear       Nonlinear   
1        0.4118±0.2126 0.4152+0.2298
2        0.1214±0.0569 0.1118+0.0602
3        0.0875±0.0551 0.1156+0.0650
