In [9]:
import os 
os.environ['PYTHONHASHSEED'] = '42'
import yfinance as yf
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from sklearn.decomposition import PCA
import requests, io
from zipfile import ZipFile
import random

In [10]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True)

In [11]:
class EnhancedStockDataFramework:
    def __init__(self, start_date="2015-01-01", end_date="2024-12-31"):
        self.start_date = start_date
        self.end_date = end_date
        self.raw_data = None
        self.processed_data = None
        self.failed_downloads = []
        self.successful_tickers = []
        
    def get_stock_universe(self):
        """Define stock universes with current, active tickers"""
         # Comprehensive universe (200 stocks)
        tickers = [
                # tech
                'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META', 'TSLA', 'NVDA', 'NFLX', 'ADBE', 'CRM',
                'ORCL', 'INTC', 'AMD', 'QCOM', 'AVGO', 'TXN', 'AMAT', 'LRCX', 'KLAC', 'MRVL',
                
                # finance
                'JPM', 'BAC', 'WFC', 'C', 'GS', 'MS', 'BK', 'USB', 'PNC', 'TFC',
                'COF', 'AXP', 'V', 'MA', 'PYPL', 'BRK-B', 'BLK', 'SPGI', 'ICE', 'CME',
                
                # health and pharma 
                'JNJ', 'PFE', 'UNH', 'ABT', 'TMO', 'DHR', 'BMY', 'MRK', 'ABBV', 'AMGN',
                'GILD', 'BIIB', 'REGN', 'VRTX', 'ILMN', 'ISRG', 'SYK', 'BSX', 'MDT', 'EW',
                
                # consumer discretionary 
                'AMZN', 'TSLA', 'HD', 'MCD', 'NKE', 'SBUX', 'TJX', 'LOW', 'BKNG', 'ABNB',
                'DIS', 'NFLX', 'CMCSA', 'VZ', 'T', 'CHTR', 'TMUS', 'ROKU', 'SPOT', 'UBER',
                
                # consumer staples
                'PG', 'KO', 'PEP', 'WMT', 'COST', 'CL', 'KMB', 'GIS', 'K', 'CPB',
                'TSN', 'HRL', 'SJM', 'CAG', 'MKC', 'CLX', 'CHD', 'EL', 'ULTA', 'TGT',
                
                # industrial and manufacturing
                'BA', 'CAT', 'DE', 'GE', 'HON', 'UPS', 'FDX', 'LMT', 'RTX', 'NOC',
                'MMM', 'EMR', 'ETN', 'ITW', 'PH', 'CMI', 'ROK', 'DOV', 'FTV', 'XYL',
                
                # energy and utilities
                'XOM', 'CVX', 'COP', 'EOG', 'PXD', 'VLO', 'MPC', 'PSX', 'KMI', 'OKE',
                'NEE', 'DUK', 'SO', 'AEP', 'EXC', 'XEL', 'PEG', 'ED', 'AWK', 'ATO',
                
                # materials and basic industries 
                'LIN', 'APD', 'ECL', 'SHW', 'FCX', 'NEM', 'FMC', 'LYB', 'DD', 'DOW',
                'PPG', 'NUE', 'STLD', 'MLM', 'VMC', 'PKG', 'IP', 'WRK', 'SON', 'AVY',
                
                # real estate and REITs
                'AMT', 'PLD', 'CCI', 'EQIX', 'SPG', 'O', 'WELL', 'AVB', 'EQR', 'DLR',
                'BXP', 'VTR', 'ARE', 'MAA', 'UDR', 'CPT', 'KIM', 'REG', 'FRT', 'BRX',
                
                # communication services
                'GOOGL', 'META', 'NFLX', 'DIS', 'CMCSA', 'VZ', 'T', 'CHTR', 'TMUS', 'TWTR',
                'SNAP', 'PINS', 'MTCH', 'IAC', 'FOXA', 'PARA', 'WBD', 'LUMN', 'SIRI', 'DISH',
                
                # emerging growth and others
                'SHOP', 'ZM', 'DOCU', 'OKTA', 'SNOW', 'PLTR', 'RBLX', 'HOOD', 'COIN', 'RIVN'
            ]
        # Remove duplicates and return unique tickers
        return list(set(tickers))
    
    def download_data(self, tickers, max_retries=3):
        print(f"Downloading data for {len(tickers)} stocks...")
        print(f"Date range: {self.start_date} to {self.end_date}")
        
        successful_data = {}
        failed_downloads = []
        
        batch_size = 10
        for i in range(0, len(tickers), batch_size):
            batch = tickers[i:i+batch_size]
            
            for ticker in batch:
                success = False
                for attempt in range(max_retries):
                    try:
                        # download each ticker 
                        stock = yf.Ticker(ticker)
                        hist = stock.history(start=self.start_date, end=self.end_date)
                        
                        if not hist.empty and len(hist) > 252: 
                            successful_data[ticker] = hist['Close']
                            success = True
                            break
                        else:
                            print(f"{ticker} has insufficient data ({len(hist)} days)")
                            
                    except Exception as e:
                        print(f"Attempt {attempt + 1} failed for {ticker}: {str(e)}")
                        continue
                
                if not success:
                    failed_downloads.append(ticker)
            
            # Progress indicator
            print(f"Processed {min(i + batch_size, len(tickers))} of {len(tickers)} stocks")
        
        self.failed_downloads = failed_downloads
        self.successful_tickers = list(successful_data.keys())
        
        if successful_data:
            # Convert to DataFrame
            self.raw_data = pd.DataFrame(successful_data)
            print(f"\nSuccessfully downloaded {len(successful_data)} stocks")
            print(f"Failed downloads: {len(failed_downloads)}")
            if failed_downloads:
                print(f"Failed tickers: {failed_downloads[:10]}...")  # Show first 10
        else:
            print("no data downloaded")
            return False
        
        return True
    
    def clean_data(self, min_trading_days=252, max_zero_days=5):
        # Clean and filter the data
        if self.raw_data is None or self.raw_data.empty:
            print("no raw data")
            return False
        
        print(f"Initial data shape: {self.raw_data.shape}")
        
        # remove stocks with insufficient data
       #initial_count = len(self.raw_data.columns)
        self.raw_data = self.raw_data.dropna(thresh=min_trading_days, axis=1)
        print(f"After removing stocks with insufficient data: {self.raw_data.shape}")
        
        # remove stocks with too many zero-price days
        zero_counts = (self.raw_data <= 0).sum()
        valid_stocks = zero_counts[zero_counts <= max_zero_days].index
        self.raw_data = self.raw_data[valid_stocks]
        print(f"After removing stocks with excessive zero prices: {self.raw_data.shape}")
        
        # forward fill missing values (up to 5 days)
        self.raw_data = self.raw_data.fillna(method='ffill', limit=5)
        
        # remove any remaining NaN values
        self.raw_data = self.raw_data.dropna()
        
        if self.raw_data.empty:
            print("dataset is empty")
            return False
        
        print(f"cleaned data shape: {self.raw_data.shape}")
        return True
    
    def calculate_returns(self):
        # calculate daily returns and basic statistics
        if self.raw_data is None or self.raw_data.empty:
            print("no data available")
            return False
        
        # calculate daily returns
        returns = self.raw_data.pct_change().dropna()
        
        # remove extreme outliers 
        extreme_threshold = 0.5
        returns_clean = returns.copy()
        
        # cap extreme returns
        returns_clean = returns_clean.clip(lower=-extreme_threshold, upper=extreme_threshold)
        
        # calculate basic statistics
        mean_returns = returns_clean.mean()
        volatility = returns_clean.std() * np.sqrt(252)  
        
        # create summary statistics
        self.processed_data = {
            'returns': returns_clean,
            'prices': self.raw_data,
            'mean_daily_return': mean_returns.mean(),
            'mean_daily_volatility': volatility.mean(),
            'num_stocks': len(returns_clean.columns),
            'num_days': len(returns_clean),
            'date_range': f"{returns_clean.index[0].strftime('%Y-%m-%d')} to {returns_clean.index[-1].strftime('%Y-%m-%d')}"
        }
        
        return True
    
    def generate_summary(self):
        # generate a comprehensive summary of the dataset
        if self.processed_data is None:
            print("no processed data available")
            return
        
        print(f"downloads: {self.processed_data['num_stocks']} stocks")
        print(f"failed downloads: {len(self.failed_downloads)} stocks")
        print(f"date range: {self.processed_data['date_range']}")
        print(f"trading days: {self.processed_data['num_days']}")
        
        print(f"\n metrics:")
        print(f"Mean daily return: {self.processed_data['mean_daily_return']:.6f}")
        print(f"Mean daily volatility: {self.processed_data['mean_daily_volatility']:.4f}")
        
        if self.failed_downloads:
            print(f"\n Failed tickers: {self.failed_downloads[:10]}")
    
    def run_complete_pipeline(self):
        tickers = self.get_stock_universe()
        
        if not self.download_data(tickers):
            print("no data downloaded")
            return False
        
        if not self.clean_data():
            print("no data remaining after cleaning")
            return False
        
        if not self.calculate_returns():
            print("no data available")
            return False
        
        # Generate summary
        self.generate_summary()
        
        return True

if __name__ == "__main__":
    framework = EnhancedStockDataFramework(
        start_date="2015-01-01",
        end_date="2024-12-31"
    )
    
    # run the complete pipeline
    success = framework.run_complete_pipeline()

Downloading data for 198 stocks...
Date range: 2015-01-01 to 2024-12-31
Processed 10 of 198 stocks
Processed 20 of 198 stocks
Processed 30 of 198 stocks
Processed 40 of 198 stocks
Processed 50 of 198 stocks
Processed 60 of 198 stocks
Processed 70 of 198 stocks
Processed 80 of 198 stocks
Processed 90 of 198 stocks
Processed 100 of 198 stocks
Processed 110 of 198 stocks
Processed 120 of 198 stocks
Processed 130 of 198 stocks
Processed 140 of 198 stocks


$WRK: possibly delisted; no timezone found
$WRK: possibly delisted; no timezone found
$WRK: possibly delisted; no timezone found


WRK has insufficient data (0 days)
WRK has insufficient data (0 days)
WRK has insufficient data (0 days)
Processed 150 of 198 stocks
Processed 160 of 198 stocks


$PXD: possibly delisted; no timezone found
$PXD: possibly delisted; no timezone found
$PXD: possibly delisted; no timezone found


PXD has insufficient data (0 days)
PXD has insufficient data (0 days)
PXD has insufficient data (0 days)


$TWTR: possibly delisted; no price data found  (1d 2015-01-01 -> 2024-12-31)
$TWTR: possibly delisted; no price data found  (1d 2015-01-01 -> 2024-12-31)
$TWTR: possibly delisted; no price data found  (1d 2015-01-01 -> 2024-12-31)


Processed 170 of 198 stocks
TWTR has insufficient data (0 days)
TWTR has insufficient data (0 days)
TWTR has insufficient data (0 days)
Processed 180 of 198 stocks
Processed 190 of 198 stocks


$DISH: possibly delisted; no timezone found
$DISH: possibly delisted; no timezone found
$DISH: possibly delisted; no timezone found


DISH has insufficient data (0 days)
DISH has insufficient data (0 days)
DISH has insufficient data (0 days)
Processed 198 of 198 stocks

Successfully downloaded 194 stocks
Failed downloads: 4
Failed tickers: ['WRK', 'PXD', 'TWTR', 'DISH']...
Initial data shape: (2515, 194)
After removing stocks with insufficient data: (2515, 194)
After removing stocks with excessive zero prices: (2515, 194)
cleaned data shape: (788, 194)
downloads: 194 stocks
failed downloads: 4 stocks
date range: 2021-11-11 to 2024-12-30
trading days: 787

 metrics:
Mean daily return: 0.000292
Mean daily volatility: 0.3221

 Failed tickers: ['WRK', 'PXD', 'TWTR', 'DISH']


In [12]:
# gets the fama french factors from the site for the date range 

def get_daily_fama_french_factors(start_date='2021-11-11', end_date='2024-12-31'):
    """
    Downloads and processes the Fama-French 3-factor daily data from Ken French's website.
    """
    url = "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_daily_CSV.zip"
    r = requests.get(url)
    z = ZipFile(io.BytesIO(r.content))
    
    # filename from ZIP 
    with z.open("F-F_Research_Data_Factors_daily.csv") as f:
        df_raw = pd.read_csv(f, skiprows=3)

    # Drop footer rows that contain non-date strings
    df_raw = df_raw[df_raw.iloc[:, 0].str.match(r'^\d{6,8}$')].copy()

    # Rename columns
    df_raw.columns = ['Date', 'Mkt-RF', 'SMB', 'HML', 'RF']
    df_raw['Date'] = pd.to_datetime(df_raw['Date'], format='%Y%m%d')
    df_raw.set_index('Date', inplace=True)

    # Convert percent to decimal
    df_raw = df_raw.astype(float) / 100.0

    # Filter by date range
    mask = (df_raw.index >= pd.to_datetime(start_date)) & (df_raw.index <= pd.to_datetime(end_date))
    return df_raw.loc[mask]

# Example usage
ff_factors = get_daily_fama_french_factors('2021-11-11', '2024-12-31')

In [14]:
# autoencoder frameworks
def compute_optimal_A_b_mu(X_np, r):
    mu = np.mean(X_np, axis=0)
    cov = (X_np - mu).T @ (X_np - mu)
    U, S, _ = np.linalg.svd(cov)
    Ur = U[:, :r]  
    
    A = Ur @ Ur.T  
    b = np.zeros(r)  
    
    return A, b, mu, Ur


# plain linear autoencoder 
class ClassicAffineAutoencoder(nn.Module):
    def __init__(self, input_dim, r):
        super().__init__()
        self.input_dim = input_dim
        self.r = r
        
        self.projection = nn.Linear(input_dim, input_dim, bias=True)
        
        self.decoder = nn.Linear(r, input_dim, bias=True)
        
    def encoder(self, x):
        projected = self.projection(x)  
        return projected[:, :self.r]  

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat
    
# optimal autoencoder 
class OptimalAffineAutoencoder(nn.Module):
    def __init__(self, input_dim, r, Ur, mu):
        super().__init__()
        self.Ur = torch.tensor(Ur, dtype=torch.float32)  
        self.mu = torch.tensor(mu, dtype=torch.float32)

    def encoder(self, x):
        x_centered = x - self.mu
        return x_centered @ self.Ur  

    def decoder(self, z):
        return z @ self.Ur.T + self.mu  

    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z)
    
# nonlinear autoencoder
class NonlinearAutoencoder(nn.Module):
    def __init__(self, input_dim, bottleneck_dim, hidden_dim=5):
        super().__init__()
        
        # encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Linear(hidden_dim, bottleneck_dim)
        )
        
        # decoder:
        self.decoder = nn.Sequential(
            nn.Linear(bottleneck_dim, hidden_dim),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Linear(hidden_dim, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat

# train/val function 
def train_autoencoder(model, train_loader, val_loader, num_epochs=300, lr=1e-3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            recon = model(batch)
            loss = criterion(recon, batch)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item() * batch.size(0)

        avg_train_loss = total_train_loss / len(train_loader.dataset)
        train_losses.append(avg_train_loss)

        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch.to(device)
                recon = model(batch)
                loss = criterion(recon, batch)
                total_val_loss += loss.item() * batch.size(0)

        avg_val_loss = total_val_loss / len(val_loader.dataset)
        val_losses.append(avg_val_loss)
    return model, train_losses, val_losses
#results['']

# val only 
def valOnlyOptimalAffineAutoencoder(model, val_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    model.eval()
    total_loss = 0
    criterion = nn.MSELoss()

    with torch.no_grad():
        for batch in val_loader:
            batch = batch.to(device)
            recon = model(batch)
            loss = criterion(recon, batch)
            total_loss += loss.item() * batch.size(0)

    avg_loss = total_loss / len(val_loader.dataset)
    return avg_loss 

In [15]:
# varimax and explained variance defs 
def varimax_rotation(loadings, gamma=1.0, q=20, tol=1e-6):
    loadings = np.array(loadings)
    n_features, n_factors = loadings.shape
    
    T = np.eye(n_factors)
    
    for iteration in range(q):
        T_old = T.copy()
        L = loadings @ T
        # Varimax criterion is to maximize variance of squared loadings
        # compute gradient of varimax objective
        u = n_features * L**3 - gamma * L @ np.diag(np.sum(L**2, axis=0))
        
        # SVD to find optimal rotation
        A = loadings.T @ u
        U, s, Vt = np.linalg.svd(A)
        T = U @ Vt

        # Check convergence
        if np.sum((T - T_old)**2) < tol:
            break
    
    rotated_loadings = loadings @ T
    return rotated_loadings, T

#  cumulative explained variance  
def compute_explained_variance(factors, original_data):
    loadings = np.linalg.lstsq(factors, original_data, rcond=None)[0]  
    
    # Reconstruct data using each factor individually
    total_var = np.var(original_data, axis=0, ddof=1).sum()
    explained_vars = []
    
    for i in range(factors.shape[1]):
        # Reconstruct using only factor i
        reconstructed = np.outer(factors[:, i], loadings[i, :])
        explained_var = np.var(reconstructed, axis=0, ddof=1).sum()
        explained_vars.append(explained_var)
    
    explained_var_ratio = np.array(explained_vars) / total_var
    cumulative_var = np.cumsum(explained_var_ratio)
    
    return explained_var_ratio, cumulative_var

def analyze_factor_loadings(factors, original_data, method_name=""):
    # Compute factor loadings via regression: loadings = (factors^T * factors)^-1 * factors^T * data
    factors = np.array(factors)
    original_data = np.array(original_data)
    
    # Compute loadings: each column is the loading for one factor
    loadings = np.linalg.lstsq(factors, original_data, rcond=None)[0].T  # shape: (n_features, n_factors)
    
    # Perform varimax rotation on loadings
    rotated_loadings, rotation_matrix = varimax_rotation(loadings)
    
    # Rotate the factors accordingly
    rotated_factors = factors @ rotation_matrix
    
    # Compute explained variance
    explained_var_ratio, cumulative_var = compute_explained_variance(factors, original_data)
    rotated_explained_var_ratio, rotated_cumulative_var = compute_explained_variance(rotated_factors, original_data)
    
    results = {
        'loadings': loadings,
        'rotated_loadings': rotated_loadings,
        'rotation_matrix': rotation_matrix,
        'rotated_factors': rotated_factors,
        'explained_var_ratio': explained_var_ratio,
        'cumulative_var': cumulative_var,
        'rotated_explained_var_ratio': rotated_explained_var_ratio,
        'rotated_cumulative_var': rotated_cumulative_var
    }
    return results

def add_factor_analysis_to_pipeline():
    # Get validation data for this run
    val_tensor = X_tensor[val_data.indices]
    X_val_np = val_tensor.numpy()
    
    # Analyze each method
    classic_analysis = analyze_factor_loadings(
        results['classic_factors'][-1], X_val_np, "Classic AE"
    )
    optimal_analysis = analyze_factor_loadings(
        results['optimal_factors'][-1], X_val_np, "Optimal AE"
    )
    pca_analysis = analyze_factor_loadings(
        results['pca_factors'][-1], X_val_np, "PCA"
    )
    nonlinear_analysis = analyze_factor_loadings(
        results['nonlinear_factors'][-1], X_val_np, "Nonlinear AE"
    )   
    
    # Store results
    results.setdefault('classic_analysis', []).append(classic_analysis)
    results.setdefault('optimal_analysis', []).append(optimal_analysis)
    results.setdefault('pca_analysis', []).append(pca_analysis)
    results.setdefault('nonlinear_analysis', []).append(nonlinear_analysis)



# After all runs complete, analyze average results
def analyze_average_factor_results(results):
   
    methods = ['classic', 'optimal', 'pca', 'nonlinear']
    
    print("AVERAGE FACTOR ANALYSIS RESULTS ACROSS ALL RUNS")
    
    for method in methods:
        analysis_key = f'{method}_analysis'
        if analysis_key in results:
            analyses = results[analysis_key]
            
            # Average explained variance ratios
            avg_explained_var = np.mean([a['explained_var_ratio'] for a in analyses], axis=0)
            avg_cumulative_var = np.mean([a['cumulative_var'] for a in analyses], axis=0)
            avg_rotated_explained_var = np.mean([a['rotated_explained_var_ratio'] for a in analyses], axis=0)
            avg_rotated_cumulative_var = np.mean([a['rotated_cumulative_var'] for a in analyses], axis=0)
            
            print(f"\n{method.upper()} METHOD:")
            print(f"  Average Explained Variance Ratio: {avg_explained_var}")
            print(f"  Average Cumulative Variance: {avg_cumulative_var}")
            print(f"  Average Rotated Explained Variance Ratio: {avg_rotated_explained_var}")
            print(f"  Average Rotated Cumulative Variance: {avg_rotated_cumulative_var}")
            
            # Show factor interpretability (how much variance is concentrated)
            loading_concentration = np.mean([np.std(np.abs(a['rotated_loadings']), axis=0) for a in analyses], axis=0)
            print(f"  Loading Concentration (higher = more interpretable): {loading_concentration}")

# THIS IS WHERE THE ACTUAL PIPELINE STARTS !!! 
X_np = framework.processed_data['returns']  
X_tensor = torch.from_numpy(X_np.values).float()    

n_samples = X_tensor.shape[0]
train_size = int(0.8 * n_samples)
val_size = n_samples - train_size

g = torch.Generator()
g.manual_seed(seed)

# slice
train_data = X_tensor[:train_size]
val_data = X_tensor[train_size:]
val_indices = np.arange(train_size, len(X_np))  
val_dates = X_np.index[val_indices]

# dataloaders
batch_size = 64
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False)
val_loader   = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# inputs and dims 
input_dim = X_np.shape[1]
r = 3
A, b, mu, Ur = compute_optimal_A_b_mu(X_np, r)

# results dict 
results = {
    'optimal_mse': [],
    'optimal_factors': [],
}
    # init models
model_optimal = OptimalAffineAutoencoder(input_dim, r, Ur, mu)


loss_optimal_val = valOnlyOptimalAffineAutoencoder(model_optimal, val_loader)


X_train_np = train_data.numpy()
X_val_np = val_data.numpy()

pca = PCA(n_components=r)
pca.fit(X_train_np)

Z_pca_val = pca.transform(X_val_np)  # latent
X_pca_reconstructed = pca.inverse_transform(Z_pca_val)  # reconstruction

mse_pca = np.mean((X_val_np - X_pca_reconstructed) ** 2)
results.setdefault('pca_mse', []).append(mse_pca)
results.setdefault('pca_factors', []).append(Z_pca_val)

results['optimal_mse'].append(loss_optimal_val)

with torch.no_grad():
    X_val_np = val_data.numpy()
        
        # Store latent representations
    optimal_factors = model_optimal.encoder(val_data).cpu().numpy()
   

    results['optimal_factors'].append(optimal_factors)

# Perform factor analysis with rotation
    optimal_analysis = analyze_factor_loadings(optimal_factors, X_val_np, "")
    pca_analysis = analyze_factor_loadings(results['pca_factors'][-1], X_val_np, "")

# Store analysis results
    results.setdefault('optimal_analysis', []).append(optimal_analysis)
    results.setdefault('pca_analysis', []).append(pca_analysis)

# fix ff3 data to match validation dates 
val_dates_naive = val_dates.tz_localize(None)
ff_val = ff_factors.loc[val_dates_naive]
ff_factors_latent = ff_val[['Mkt-RF', 'SMB', 'HML']].values
ff_factors_latent = ff_factors_latent[:, :3]
X_dates_naive = X_np.index.tz_localize(None) if X_np.index.tz else X_np.index
ff_dates_naive = ff_factors.index.tz_localize(None) if ff_factors.index.tz else ff_factors.index
common_dates = X_dates_naive.intersection(ff_dates_naive)

results.setdefault('ff_factors', []).append(ff_factors_latent)
ff_analysis = analyze_factor_loadings(ff_factors_latent, X_val_np, "Fama-French (GT)")
results.setdefault('ff_analysis', []).append(ff_analysis)
results.setdefault('pca_factors', []).append(Z_pca_val)


print("Results Summary:")

print("Validation MSEs:")
for key in ['optimal_mse', 'pca_mse']:
    values = results[key]
    print(f"{key}: {np.mean(values):.8f}")
print("No training MSEs available for these models.")

# Compare factor stability across runs
def compare_factor_stability(results):
    
    print("Cumulative Explained Variance and Factor Stability:")

    methods = ['optimal', 'pca', 'ff']
    
    for method in methods:
        analysis_key = f'{method}_analysis'
        if analysis_key in results:
            analyses = results[analysis_key]
            
            # what does this do? 
            explained_vars = [a['rotated_explained_var_ratio'] for a in analyses]
            mean_explained = np.mean(explained_vars, axis=0)

            print(f"\n{method.upper()} - Cumulative Explained Variance Breakdown:")
            print(f"  Factor 1 : {mean_explained[0]:.3f}")
            print(f"  Factor 2 :   {mean_explained[1]:.3f} ")
            print(f"  Factor 3 :  {mean_explained[2]:.3f} ")
            
            total_explained = np.sum(explained_vars)
            print(f" Total Explained Variance: {total_explained:.3f}")
            
            # 
            max_factor = np.max(explained_vars)
            min_factor = np.min(explained_vars)
            balance_ratio = min_factor / max_factor
            print(f"  factor balance: {balance_ratio:.3f}")

compare_factor_stability(results)


Results Summary:
Validation MSEs:
optimal_mse: 0.00028766
pca_mse: 0.00029557
No training MSEs available for these models.
Cumulative Explained Variance and Factor Stability:

OPTIMAL - Cumulative Explained Variance Breakdown:
  Factor 1 : 0.125
  Factor 2 :   0.137 
  Factor 3 :  0.080 
 Total Explained Variance: 0.342
  factor balance: 0.586

PCA - Cumulative Explained Variance Breakdown:
  Factor 1 : 0.109
  Factor 2 :   0.144 
  Factor 3 :  0.080 
 Total Explained Variance: 0.333
  factor balance: 0.555

FF - Cumulative Explained Variance Breakdown:
  Factor 1 : 0.126
  Factor 2 :   0.018 
  Factor 3 :  0.103 
 Total Explained Variance: 0.248
  factor balance: 0.146


In [19]:
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_np = framework.processed_data['returns']  
X_tensor = torch.from_numpy(X_np.values).float()  
train_data = X_tensor[:train_size]
val_data = X_tensor[train_size:]
val_indices = np.arange(train_size, len(X_np))  
val_dates = X_np.index[val_indices]
val_tensor = X_tensor[val_indices]
X_val_np = val_tensor.numpy()
    
results = {
    'linear_analysis': [],
    'nonlinear_analysis': [],
    'linear_train_mse': [],
    'linear_val_mse': [],
    'nonlinear_train_mse': [],
    'nonlinear_val_mse': []
}

# Run 100 iterations
for run in range(100):

    # Print progress every 10 runs
    if (run + 1) % 10 == 0:
        print(f"Completed run {run + 1}/100")
    
    # Init model and train model
    modellinear = ClassicAffineAutoencoder(input_dim, r).to(device)
    modelnonlinear = NonlinearAutoencoder(input_dim, r).to(device)
    
    modellinear, train_losslinear, val_losslinear = train_autoencoder(
        modellinear, train_loader, val_loader, num_epochs=150, lr=0.001)
    modelnonlinear, train_lossnonlinear, val_lossnonlinear = train_autoencoder(
        modelnonlinear, train_loader, val_loader, num_epochs=100, lr=0.001)
    
    # Store final MSE values (assuming losses are MSE)
    results['linear_train_mse'].append(train_losslinear[-1])  # Final training MSE
    results['linear_val_mse'].append(val_losslinear[-1])      # Final validation MSE
    results['nonlinear_train_mse'].append(train_lossnonlinear[-1])  # Final training MSE
    results['nonlinear_val_mse'].append(val_lossnonlinear[-1])      # Final validation MSE
    
    # Get factors
    classic_factors = modellinear.encoder(val_data).detach().cpu().numpy()
    nonlinear_factors = modelnonlinear.encoder(val_data).detach().cpu().numpy()
    
    # Analyze factor loadings (without printing)
    classic_analysis = analyze_factor_loadings(classic_factors, X_val_np, "")
    nonlinear_analysis = analyze_factor_loadings(nonlinear_factors, X_val_np, "")
    
    # Store analysis results
    results['linear_analysis'].append(classic_analysis)
    results['nonlinear_analysis'].append(nonlinear_analysis)

# Extract explained variance ratios from each analysis
linear_explained_vars = [analysis['rotated_explained_var_ratio'] for analysis in results['linear_analysis']]
nonlinear_explained_vars = [analysis['rotated_explained_var_ratio'] for analysis in results['nonlinear_analysis']]

# Convert to numpy arrays
linear_explained_vars = np.array(linear_explained_vars)
nonlinear_explained_vars = np.array(nonlinear_explained_vars)

# Convert MSE lists to numpy arrays
linear_train_mse = np.array(results['linear_train_mse'])
linear_val_mse = np.array(results['linear_val_mse'])
nonlinear_train_mse = np.array(results['nonlinear_train_mse'])
nonlinear_val_mse = np.array(results['nonlinear_val_mse'])

# Compute statistics
linear_mean = np.mean(linear_explained_vars, axis=0)
linear_std = np.std(linear_explained_vars, axis=0)
nonlinear_mean = np.mean(nonlinear_explained_vars, axis=0)
nonlinear_std = np.std(nonlinear_explained_vars, axis=0)

# Compute MSE statistics
linear_train_mse_mean = np.mean(linear_train_mse)
linear_train_mse_std = np.std(linear_train_mse)
linear_val_mse_mean = np.mean(linear_val_mse)
linear_val_mse_std = np.std(linear_val_mse)
nonlinear_train_mse_mean = np.mean(nonlinear_train_mse)
nonlinear_train_mse_std = np.std(nonlinear_train_mse)
nonlinear_val_mse_mean = np.mean(nonlinear_val_mse)
nonlinear_val_mse_std = np.std(nonlinear_val_mse)

# Convert to arrays for more statistics
linear_cumulative = [analysis['rotated_cumulative_var'] for analysis in results['linear_analysis']]
nonlinear_cumulative = [analysis['rotated_cumulative_var'] for analysis in results['nonlinear_analysis']]

linear_cumulative = np.array(linear_cumulative)
nonlinear_cumulative = np.array(nonlinear_cumulative)

linear_cumulative_mean = np.mean(linear_cumulative, axis=0)
linear_cumulative_std = np.std(linear_cumulative, axis=0)
nonlinear_cumulative_mean = np.mean(nonlinear_cumulative, axis=0)
nonlinear_cumulative_std = np.std(nonlinear_cumulative, axis=0)

# results summary printing
print("\n Results summary across 100 runs")

# MSE results
print("\n MSE results:")
print(f"\n Training MSEs:")
print(f"  linear model:    {linear_train_mse_mean:.8f} ± {linear_train_mse_std:.8f}")
print(f"  nonlinear model: {nonlinear_train_mse_mean:.8f} ± {nonlinear_train_mse_std:.8f}")

print(f"\n Validaton MSEs:")
print(f"  Linear Model:    {linear_val_mse_mean:.8f} ± {linear_val_mse_std:.8f}")
print(f"  Nonlinear Model: {nonlinear_val_mse_mean:.8f} ± {nonlinear_val_mse_std:.8f}")

# Factor stability results
print("\n Cumulative Explained Variance and Factor Stability:")

methods = [('Linear', linear_mean, linear_std, linear_cumulative_std), ('Nonlinear', nonlinear_mean, nonlinear_std, nonlinear_cumulative_std)]

for method_name, mean_explained, std_explained, cumulative_std in methods:
    print(f"\n{method_name} - Rotated Factor Stability:")
    print(f"  Factor 1: {mean_explained[0]:.4f} ± {std_explained[0]:.4f}")
    print(f"  Factor 2: {mean_explained[1]:.4f} ± {std_explained[1]:.4f}")
    print(f"  Factor 3: {mean_explained[2]:.4f} ± {std_explained[2]:.4f}")
    
    # Standard deviation calculations
    total_explained = np.sum(mean_explained)
    std_explained = np.std(total_explained)
    print(f" Total Explained Variance: {total_explained:.4f} ± {cumulative_std[2]:.4f}")

    # Factor balance calculations
    max_factor = np.max(mean_explained)
    min_factor = np.min(mean_explained)
    balance_ratio = min_factor / max_factor
    balance_std = np.std(balance_ratio)
    print(f"  Factor Balance Ratio: {balance_ratio:.4f}")

Completed run 10/100
Completed run 20/100
Completed run 30/100
Completed run 40/100
Completed run 50/100
Completed run 60/100
Completed run 70/100
Completed run 80/100
Completed run 90/100
Completed run 100/100

 Results summary across 100 runs

 MSE results:

 Training MSEs:
  linear model:    0.00049307 ± 0.00000990
  nonlinear model: 0.00050453 ± 0.00000664

 Validaton MSEs:
  Linear Model:    0.00040028 ± 0.00000420
  Nonlinear Model: 0.00040572 ± 0.00000308

 Cumulative Explained Variance and Factor Stability:

Linear - Rotated Factor Stability:
  Factor 1: 0.0513 ± 0.0474
  Factor 2: 0.0487 ± 0.0456
  Factor 3: 0.0555 ± 0.0567
 Total Explained Variance: 0.1556 ± 0.0453
  Factor Balance Ratio: 0.8777

Nonlinear - Rotated Factor Stability:
  Factor 1: 0.0648 ± 0.0657
  Factor 2: 0.0840 ± 0.0595
  Factor 3: 0.0485 ± 0.0601
 Total Explained Variance: 0.1974 ± 0.0611
  Factor Balance Ratio: 0.5775
