In [2]:
import os
import pickle
from datetime import datetime
import numpy as np
import torch
import matplotlib.pyplot as plt
from scipy import optimize
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D

# Import your custom libraries
import EnsRFTheory  # Ensure EnsRFTheory.py is in the same directory or Python path
import EnsembleRFs
import auxFuncs
import DatasetMaker
import LearningCurveExperiments

# Select the compute device: CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')

Using device: cuda


In [3]:
import importlib
# Re-import the project modules so that edits to their .py files take effect
# without restarting the kernel.
# NOTE(review): reload order can matter if these modules import one another --
# confirm that dependencies are reloaded before the modules that use them.
importlib.reload(EnsRFTheory)
importlib.reload(EnsembleRFs)
importlib.reload(auxFuncs)
importlib.reload(DatasetMaker)
importlib.reload(LearningCurveExperiments)

<module 'LearningCurveExperiments' from '/n/home07/bruben/Simulations/Ensemble_DeepLearning/RandomFeatures/LearningCurveExperiments.py'>

# Figure 1: Sample and Size Monotonicity in Ensembles

## CIFAR Experiments Sweep P with N, K fixed

In [178]:
# Configuration: sweep the training-set size P with the feature count N held fixed.
num_trials = 50
KVals = [2**k for k in range(4)]  # 1, 2, 4, 8
N = 256
lamVals = np.logspace(-3, 1, 50)  # ridge values scanned

# Sample sizes are log-spaced between 10**Plower and 10**Pupper.
Plower, Pupper = 1, 4
# Distinct integer sizes for the simulations.
P_list = np.unique(np.round(np.logspace(Plower, Pupper, 20))).astype(int).tolist()
# Denser, real-valued grid for the theory curves.
P_list_theory = np.logspace(Plower, Pupper, 200)
PTest = 10000

nonlinearity = torch.relu  # Relu random features.

# Presumably (ensemble aggregator, error metric) pairs -- see auxFuncs for the definitions.
ensErrFuncs = [
    (auxFuncs.mean, auxFuncs.SquareError),
    (auxFuncs.mean, auxFuncs.SgnErrorRate),
    (auxFuncs.majorityVote, auxFuncs.SgnErrorRate),
    (auxFuncs.median, auxFuncs.SgnErrorRate),
]

In [179]:
# Location of the cifar 10 dataset.
data_root = '/n/holystore01/LABS/pehlevan_lab/Everyone/cifar'
# Two groups of CIFAR-10 class indices used to binarize the labels.
class_groups = [
    [0, 1, 7, 8, 9],
    [2, 3, 4, 5, 6],
]

In [180]:
# Load CIFAR-10 binarized by `class_groups`, requesting flattened, normalized arrays.
X_train_np, y_train_np, X_test_np, y_test_np = DatasetMaker.get_binarized_CIFAR10(
    data_root,
    class_groups,
    flatten=True,
    normalize=True,
)

Files already downloaded and verified
Files already downloaded and verified


In [181]:
# D is the number of columns of the (flattened) training matrix, i.e. the input dimension.
D = X_train_np.shape[1]
# Value later passed as the `Fvar` keyword to the training routine.
Fvar = 2/D

In [182]:
# Convert the NumPy arrays to float64 torch tensors on the selected device.
# `device` (chosen in the first cell) replaces the hard-coded .cuda() so the cell
# also runs on CPU-only machines and matches the later CIFAR cells.
X_train = torch.tensor(X_train_np, dtype=torch.float64).to(device)
y_train = torch.tensor(y_train_np, dtype=torch.float64).to(device).reshape(-1, 1)  # column vector
X_test = torch.tensor(X_test_np, dtype=torch.float64).to(device)
y_test = torch.tensor(y_test_np, dtype=torch.float64).to(device).reshape(-1, 1)  # column vector

In [183]:
# Run the fixed-N simulations: num_trials trials over KVals, lamVals and P_list,
# scored with every entry of ensErrFuncs.
test_errors = LearningCurveExperiments.train_random_feature_models_fixN(
    X_train, y_train, X_test, y_test,
    num_trials, KVals, N, lamVals, P_list, ensErrFuncs,
    nonlinearity=nonlinearity, Fvar=Fvar,
)

Starting trial 1/50
Completed trial 1/50
Starting trial 2/50
Completed trial 2/50
Starting trial 3/50
Completed trial 3/50
Starting trial 4/50
Completed trial 4/50
Starting trial 5/50
Completed trial 5/50
Starting trial 6/50
Completed trial 6/50
Starting trial 7/50
Completed trial 7/50
Starting trial 8/50
Completed trial 8/50
Starting trial 9/50
Completed trial 9/50
Starting trial 10/50
Completed trial 10/50
Starting trial 11/50
Completed trial 11/50
Starting trial 12/50
Completed trial 12/50
Starting trial 13/50
Completed trial 13/50
Starting trial 14/50
Completed trial 14/50
Starting trial 15/50
Completed trial 15/50
Starting trial 16/50
Completed trial 16/50
Starting trial 17/50
Completed trial 17/50
Starting trial 18/50
Completed trial 18/50
Starting trial 19/50
Completed trial 19/50
Starting trial 20/50
Completed trial 20/50
Starting trial 21/50
Completed trial 21/50
Starting trial 22/50
Completed trial 22/50
Starting trial 23/50
Completed trial 23/50
Starting trial 24/50
Complete

In [184]:
#importlib.reload(LearningCurveExperiments)

In [185]:
# Locate the precomputed CIFAR kernel results for the chosen subset size.
subset_size = 30000
save_dir = "KernelSpectra"
kernel_file = os.path.join(save_dir, f'CIFAR_kernel_results_{subset_size}.pkl')

with open(kernel_file, 'rb') as f:
    results = pickle.load(f)

# Flatten the spectrum and target weights to 1-D; sigma_eps is used as stored.
eigvals, wbar = (results[key].reshape(-1) for key in ('eigvals', 'wbar'))
sigma_eps = results['sigma_eps']

# Theory curves on the dense P grid, with N held fixed.
test_errors_theory = LearningCurveExperiments.compute_theoretical_learning_curves_fixN(
    Sigma=eigvals,
    wbar=wbar,
    KVals=KVals,
    lamVals=lamVals,
    P_list=P_list_theory,
    N=N,
    sigma_eps=sigma_eps,
)

print("Theoretical learning curves computed successfully.")

Theoretical learning curves computed successfully.


In [186]:
# Bundle the experiment parameters with the numerical and theoretical results.
# ('ensErrFuncs' was previously listed twice in this literal; it is kept once.)
experiment_results = {
    'num_trials': num_trials,
    'KVals': KVals,
    'N': N,
    'lamVals': lamVals,
    'P_list': P_list,
    'P_list_theory': P_list_theory,
    'PTest': PTest,
    'nonlinearity': nonlinearity,
    'ensErrFuncs': ensErrFuncs,
    'class_groups': class_groups,  # Class groups for CIFAR-10 binarization
    'test_errors': test_errors,  # Numerical results from random feature models
    'test_errors_theory': test_errors_theory,  # Theoretical results
}

# Fixed, descriptive filename keyed on N. NOTE: re-running this cell overwrites the
# previous file; add a timestamp to the name if several runs must be kept.
filename = f'RF_CIFAR_N{N}_VSP.pkl'

# Define the save path
save_path = '/n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features'

# Create the directory if needed (no error when it already exists).
os.makedirs(save_path, exist_ok=True)

# Save the dictionary as a pickle file
output_file = os.path.join(save_path, filename)
with open(output_file, 'wb') as f:
    pickle.dump(experiment_results, f)

print(f"Experiment saved to {output_file}")

Experiment saved to /n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features/RF_CIFAR_N256_VSP.pkl


## CIFAR Experiments Sweeping N and K with P fixed, many ensErrFuncs

In [113]:
# Configuration: sweep the feature count N and ensemble size K with P held fixed.
num_trials = 50

# Feature counts are log-spaced between 10**Nlower and 10**Nupper.
Nlower, Nupper = 1, 4
# Distinct integer N values to sweep over in the simulations.
NVals = np.unique(np.round(np.logspace(Nlower, Nupper, 20))).astype(int).tolist()
# Denser, real-valued grid for the theory curves.
NVals_theory = np.logspace(Nlower, Nupper, 200)

KVals = [2**k for k in range(4)]  # List of K values: 1, 2, 4, 8
lamVals = np.logspace(-3, 1, 10)
P = 256  # Fixed training sample size
PTest = 10000
nonlinearity = torch.relu  # ReLU random features.

# Presumably (ensemble aggregator, error metric) pairs -- see auxFuncs for the definitions.
ensErrFuncs = [
    (auxFuncs.mean, auxFuncs.SquareError),
    (auxFuncs.mean, auxFuncs.SgnErrorRate),
    (auxFuncs.majorityVote, auxFuncs.SgnErrorRate),
    (auxFuncs.median, auxFuncs.SgnErrorRate),
]

In [114]:
# CIFAR-10 location and the two class groups used to binarize the labels.
data_root = '/n/holystore01/LABS/pehlevan_lab/Everyone/cifar'
class_groups = [[0, 1, 7, 8, 9], [2, 3, 4, 5, 6]]

# Load the binarized dataset, requesting flattened, normalized NumPy arrays.
X_train_np, y_train_np, X_test_np, y_test_np = DatasetMaker.get_binarized_CIFAR10(
    data_root, class_groups, flatten=True, normalize=True)

D = X_train_np.shape[1]  # input dimension
Fvar = 2 / D

# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Inputs become float64 tensors; targets are additionally reshaped to column vectors.
X_train, X_test = (
    torch.tensor(arr, dtype=torch.float64).to(device)
    for arr in (X_train_np, X_test_np)
)
y_train, y_test = (
    torch.tensor(arr, dtype=torch.float64).to(device).reshape(-1, 1)
    for arr in (y_train_np, y_test_np)
)

# Refuse to run unless the training set holds at least P * num_trials samples.
total_train_samples = P * num_trials
if total_train_samples > X_train.shape[0]:
    raise ValueError("Not enough training samples for the specified number of trials.")

# Numerical experiments over NVals and KVals at fixed P.
test_errors = LearningCurveExperiments.train_random_feature_models_N_K(
    X_train, y_train, X_test, y_test,
    num_trials, NVals, KVals, lamVals, P, ensErrFuncs,
    nonlinearity=nonlinearity, Fvar=Fvar,
)

# Statistics along axis 0 (presumably the trial axis -- confirm against the training routine).
mean_test_errors = np.mean(test_errors, axis=0)
std_test_errors = np.std(test_errors, axis=0)

Files already downloaded and verified
Files already downloaded and verified
Starting trial 1/50
Completed trial 1/50
Starting trial 2/50
Completed trial 2/50
Starting trial 3/50
Completed trial 3/50
Starting trial 4/50
Completed trial 4/50
Starting trial 5/50
Completed trial 5/50
Starting trial 6/50
Completed trial 6/50
Starting trial 7/50
Completed trial 7/50
Starting trial 8/50
Completed trial 8/50
Starting trial 9/50
Completed trial 9/50
Starting trial 10/50
Completed trial 10/50
Starting trial 11/50
Completed trial 11/50
Starting trial 12/50
Completed trial 12/50
Starting trial 13/50
Completed trial 13/50
Starting trial 14/50
Completed trial 14/50
Starting trial 15/50
Completed trial 15/50
Starting trial 16/50
Completed trial 16/50
Starting trial 17/50
Completed trial 17/50
Starting trial 18/50
Completed trial 18/50
Starting trial 19/50
Completed trial 19/50
Starting trial 20/50
Completed trial 20/50
Starting trial 21/50
Completed trial 21/50
Starting trial 22/50
Completed trial 22

In [121]:
# Pick up edits to LearningCurveExperiments.py without restarting the kernel.
importlib.reload(LearningCurveExperiments)

<module 'LearningCurveExperiments' from '/n/home07/bruben/Simulations/Ensemble_DeepLearning/RandomFeatures/LearningCurveExperiments.py'>

In [125]:
# Locate the precomputed CIFAR kernel results for the chosen subset size.
subset_size = 30000
save_dir = "KernelSpectra"
kernel_file = os.path.join(save_dir, f'CIFAR_kernel_results_{subset_size}.pkl')

with open(kernel_file, 'rb') as f:
    results = pickle.load(f)

# Flatten the spectrum and target weights to 1-D.
eigvals, wbar = (results[key].reshape(-1) for key in ('eigvals', 'wbar'))
sigma_eps = results['sigma_eps']

# Theory curves on the dense N grid, with P held fixed.
# NOTE(review): sigma_eps is loaded above but the call passes sigma_eps=0, unlike the
# other CIFAR theory cells, which forward the loaded value -- confirm this is intended.
test_errors_theory = LearningCurveExperiments.compute_theoretical_learning_curves_N_K(
    Sigma=eigvals,
    wbar=wbar,
    NVals=NVals_theory,
    KVals=KVals,
    lamVals=lamVals,
    P=P,
    sigma_eps=0,
)

print("Theoretical learning curves computed successfully.")

Theoretical learning curves computed successfully.


In [128]:
# Bundle the experiment parameters with the numerical and theoretical results.
experiment_results = {
    'num_trials': num_trials,
    'NVals': NVals,
    'NVals_theory': NVals_theory,
    'KVals': KVals,
    'lamVals': lamVals,
    'P': P,
    'PTest': PTest,
    'nonlinearity': nonlinearity,
    'class_groups': class_groups,  # Class groups for CIFAR-10 binarization
    'test_errors': test_errors,          # Numerical results from random feature models
    'test_errors_theory': test_errors_theory,  # Theoretical results
    'mean_test_errors': mean_test_errors,
    'std_test_errors': std_test_errors,
    # Stored as in the other save cells, so the error-function axis of test_errors
    # can be interpreted when the file is loaded later.
    'ensErrFuncs': ensErrFuncs,
}

# Fixed, descriptive filename keyed on P. NOTE: re-running this cell overwrites the
# previous file; add a timestamp to the name if several runs must be kept.
filename = f'RF_CIFAR_P{P}_VsN.pkl'

# Define the save path
save_path = '/n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features'

# Create the directory if needed (no error when it already exists).
os.makedirs(save_path, exist_ok=True)

# Save the dictionary as a pickle file
output_file = os.path.join(save_path, filename)
with open(output_file, 'wb') as f:
    pickle.dump(experiment_results, f)

print(f"Experiment saved to {output_file}")

Experiment saved to /n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features/RF_CIFAR_P256_VsN.pkl


# Figure 2: $E_g$ increases monotonically with $K$ when $P$ and $M$ are fixed.

TODO: Modify to determine how bias and variance depend on $K$ at the optimal ridge (using `returnBiasVariance=True`).

## Gaussian Data Experiment Sweeping K with fixed M and P (Multiple P values)

In [46]:
# Configuration for the Gaussian-data sweep over ensemble size K at fixed M.
num_trials = 10
KVals = [2**k for k in range(8)]  # 1, 2, 4, ..., 128
M = 2 ** 10
lamVals = np.logspace(-4, 2, 50)
P_list = [2**p for p in (7, 10, 13)]
PTest = 10000
nonlinearity = None  # Assuming linear model
# A single (aggregator, metric) pair: auxFuncs.mean with auxFuncs.SquareError.
ensErrFuncs = [(auxFuncs.mean, auxFuncs.SquareError)]

In [47]:
# Parameters of the synthetic Gaussian task.
alpha, r = 1.2, 0.4  # forwarded, together with D, to makeGaussianParams
D = 10000  # Dimensionality of data
sigma_eps = 0  # forwarded as `sigma_eps` to the dataset maker and the theory call

# Build the parameters returned by makeGaussianParams for this (D, alpha, r).
sigma_s, w_star = LearningCurveExperiments.makeGaussianParams(D, alpha, r)

In [48]:
# Training pool of max(P_list) * num_trials samples, and a separate test set of PTest samples.
X_train, y_train = DatasetMaker.makeGaussianDataset_lin(
    max(P_list) * num_trials, w_star, sigma_s, sigma_eps=sigma_eps
)
X_test, y_test = DatasetMaker.makeGaussianDataset_lin(
    PTest, w_star, sigma_s, sigma_eps=sigma_eps
)

In [49]:
# Run the fixed-M simulations on the Gaussian data (Fvar fixed to 1 here).
test_errors = LearningCurveExperiments.train_random_feature_models_fixM(
    X_train, y_train, X_test, y_test,
    num_trials, KVals, M, lamVals, P_list, ensErrFuncs,
    nonlinearity=nonlinearity, Fvar=1,
)

Starting trial 1/10
Completed trial 1/10
Starting trial 2/10
Completed trial 2/10
Starting trial 3/10
Completed trial 3/10
Starting trial 4/10
Completed trial 4/10
Starting trial 5/10
Completed trial 5/10
Starting trial 6/10
Completed trial 6/10
Starting trial 7/10
Completed trial 7/10
Starting trial 8/10
Completed trial 8/10
Starting trial 9/10
Completed trial 9/10
Starting trial 10/10
Completed trial 10/10


In [50]:
# Theory curves for the Gaussian task; returnBiasVariance=True yields three outputs
# (total error, bias, variance).
# NOTE(review): the CIFAR/MNIST cells call compute_theoretical_learning_curves_fixM;
# confirm that the un-suffixed name used here still exists in LearningCurveExperiments.
test_errors_theory, bias_theory, var_theory = LearningCurveExperiments.compute_theoretical_learning_curves(
    sigma_s.cpu().numpy(),
    w_star.cpu().numpy(),
    KVals,
    lamVals,
    P_list,
    M,
    sigma_eps=sigma_eps,
    returnBiasVariance=True,
)

In [51]:
# Bundle the experiment parameters with the numerical and theoretical results.
# bias_theory / var_theory (computed in the theory cell above), sigma_eps and
# ensErrFuncs are now stored too, matching the CIFAR and MNIST save cells.
experiment_results = {
    'num_trials': num_trials,
    'KVals': KVals,
    'M': M,
    'lamVals': lamVals,
    'P_list': P_list,
    'PTest': PTest,
    'nonlinearity': nonlinearity,
    'alpha': alpha,
    'r': r,
    'D': D,
    'sigma_eps': sigma_eps,
    'test_errors': test_errors,  # Output of train_random_feature_models_fixM
    'test_errors_theory': test_errors_theory,
    'bias_theory': bias_theory,
    'var_theory': var_theory,
    'ensErrFuncs': ensErrFuncs,
}

# Fixed, descriptive filename keyed on alpha, r and M. NOTE: re-running this cell
# overwrites the previous file; add a timestamp if several runs must be kept.
filename = f'RF_Gaussian_alpha{alpha}_r{r}_M_{M}.pkl'

# Define the save path
save_path = '/n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features'

# Create the directory if needed (no error when it already exists).
os.makedirs(save_path, exist_ok=True)

# Save the dictionary as a pickle file
output_file = os.path.join(save_path, filename)
with open(output_file, 'wb') as f:
    pickle.dump(experiment_results, f)

print(f"Experiment saved to {output_file}")

Experiment saved to /n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features/RF_Gaussian_alpha1.2_r0.4_M_1024.pkl


# CIFAR 10 Ensemble Regression: Sweep K with P and M fixed (multiple P values)

In [4]:
# Configuration for the CIFAR-10 sweep over ensemble size K at fixed M.
num_trials = 50
KVals = [2**k for k in range(8)]  # 1, 2, 4, ..., 128
M = 2 ** 10
lamVals = np.logspace(-4, 2, 50)
P_list = [2**p for p in (7, 10, 13)]
PTest = 10000
nonlinearity = torch.relu  # Relu random features.

# Presumably (ensemble aggregator, error metric) pairs -- see auxFuncs for the definitions.
ensErrFuncs = [
    (auxFuncs.mean, auxFuncs.SquareError),
    (auxFuncs.mean, auxFuncs.SgnErrorRate),
    (auxFuncs.majorityVote, auxFuncs.SgnErrorRate),
]

In [5]:
# Location of the cifar 10 dataset.
data_root = '/n/holystore01/LABS/pehlevan_lab/Everyone/cifar'
# Two groups of CIFAR-10 class indices used to binarize the labels.
class_groups = [
    [0, 1, 7, 8, 9],
    [2, 3, 4, 5, 6],
]

In [6]:
# Load CIFAR-10 binarized by `class_groups`, requesting flattened, normalized arrays.
X_train_np, y_train_np, X_test_np, y_test_np = DatasetMaker.get_binarized_CIFAR10(
    data_root,
    class_groups,
    flatten=True,
    normalize=True,
)

Files already downloaded and verified
Files already downloaded and verified


In [7]:
# D is the number of columns of the (flattened) training matrix, i.e. the input dimension.
D = X_train_np.shape[1]
# Value later passed as the `Fvar` keyword to the training routine.
Fvar = 2/D

In [8]:
# Convert the NumPy arrays to float64 torch tensors on the selected device.
# `device` (chosen in the first cell) replaces the hard-coded .cuda() so the cell
# also runs on CPU-only machines and matches the other CIFAR cells.
X_train = torch.tensor(X_train_np, dtype=torch.float64).to(device)
y_train = torch.tensor(y_train_np, dtype=torch.float64).to(device).reshape(-1, 1)  # column vector
X_test = torch.tensor(X_test_np, dtype=torch.float64).to(device)
y_test = torch.tensor(y_test_np, dtype=torch.float64).to(device).reshape(-1, 1)  # column vector

In [9]:
# Run the fixed-M simulations on CIFAR-10 over KVals, lamVals and P_list.
test_errors = LearningCurveExperiments.train_random_feature_models_fixM(
    X_train, y_train, X_test, y_test,
    num_trials, KVals, M, lamVals, P_list, ensErrFuncs,
    nonlinearity=nonlinearity, Fvar=Fvar,
)

Starting trial 1/50
Completed trial 1/50
Starting trial 2/50
Completed trial 2/50
Starting trial 3/50
Completed trial 3/50
Starting trial 4/50
Completed trial 4/50
Starting trial 5/50
Completed trial 5/50
Starting trial 6/50
Completed trial 6/50
Starting trial 7/50
Completed trial 7/50
Starting trial 8/50
Completed trial 8/50
Starting trial 9/50
Completed trial 9/50
Starting trial 10/50
Completed trial 10/50
Starting trial 11/50
Completed trial 11/50
Starting trial 12/50
Completed trial 12/50
Starting trial 13/50
Completed trial 13/50
Starting trial 14/50
Completed trial 14/50
Starting trial 15/50
Completed trial 15/50
Starting trial 16/50
Completed trial 16/50
Starting trial 17/50
Completed trial 17/50
Starting trial 18/50
Completed trial 18/50
Starting trial 19/50
Completed trial 19/50
Starting trial 20/50
Completed trial 20/50
Starting trial 21/50
Completed trial 21/50
Starting trial 22/50
Completed trial 22/50
Starting trial 23/50
Completed trial 23/50
Starting trial 24/50
Complete

In [10]:
# Locate the precomputed CIFAR kernel results for the chosen subset size.
subset_size = 30000
save_dir = "KernelSpectra"
kernel_file = os.path.join(save_dir, f'CIFAR_kernel_results_{subset_size}.pkl')

with open(kernel_file, 'rb') as f:
    results = pickle.load(f)

# Flatten the spectrum and target weights to 1-D; sigma_eps is used as stored.
eigvals, wbar = (results[key].reshape(-1) for key in ('eigvals', 'wbar'))
sigma_eps = results['sigma_eps']

# Theory curves at fixed M; returnBiasVariance=True also returns bias and variance.
test_errors_theory, bias_theory, var_theory = LearningCurveExperiments.compute_theoretical_learning_curves_fixM(
    Sigma=eigvals,
    wbar=wbar,
    KVals=KVals,
    lamVals=lamVals,
    P_list=P_list,
    M=M,
    sigma_eps=sigma_eps,
    returnBiasVariance=True,
)

print("Theoretical learning curves computed successfully.")

Theoretical learning curves computed successfully.


In [11]:
# Bundle the experiment parameters with the numerical and theoretical results.
experiment_results = {
    'num_trials': num_trials,
    'KVals': KVals,
    'M': M,
    'lamVals': lamVals,
    'P_list': P_list,
    'PTest': PTest,
    'nonlinearity': nonlinearity,
    'class_groups': class_groups,  # Class groups for CIFAR-10 binarization
    'test_errors': test_errors,  # Numerical results from random feature models
    'test_errors_theory': test_errors_theory,  # Theoretical results
    'bias_theory': bias_theory,
    'var_theory': var_theory,
    'ensErrFuncs': ensErrFuncs,
}

# Fixed filename. NOTE: it encodes no parameters, so re-running this cell with
# different settings overwrites the previous file.
filename = 'RF_CIFAR.pkl'

# Define the save path
save_path = '/n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features'

# Create the directory if needed (no error when it already exists).
os.makedirs(save_path, exist_ok=True)

# Save the dictionary as a pickle file
output_file = os.path.join(save_path, filename)
with open(output_file, 'wb') as f:
    pickle.dump(experiment_results, f)

print(f"Experiment saved to {output_file}")

Experiment saved to /n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features/RF_CIFAR.pkl


## MNIST Ensemble Experiments sweeping K with M fixed (multiple P values)

In [12]:
# Configuration for the MNIST sweep over ensemble size K at fixed M.
num_trials = 50
KVals = [2**k for k in range(8)]  # 1, 2, 4, ..., 128
M = 2 ** 10
lamVals = np.logspace(-4, 2, 50)
P_list = [2**p for p in (7, 10, 13)]
PTest = 10000
nonlinearity = torch.relu  # Relu random features.

# Presumably (ensemble aggregator, error metric) pairs -- see auxFuncs for the definitions.
ensErrFuncs = [
    (auxFuncs.mean, auxFuncs.SquareError),
    (auxFuncs.mean, auxFuncs.SgnErrorRate),
    (auxFuncs.majorityVote, auxFuncs.SgnErrorRate),
]

In [13]:
# Location to store MNIST dataset
data_root = '/n/holystore01/LABS/pehlevan_lab/Lab/bruben/Datasets/MNIST'
# Group digits 0-4 and 5-9
class_groups = [list(range(0, 5)), list(range(5, 10))]

In [14]:
# Load MNIST binarized by `class_groups`, requesting flattened, normalized arrays.
X_train_np, y_train_np, X_test_np, y_test_np = DatasetMaker.get_binarized_MNIST(
    data_root,
    class_groups,
    flatten=True,
    normalize=True,
)

In [15]:
# D is the number of columns of the (flattened) training matrix, i.e. the input dimension.
D = X_train_np.shape[1]
# Value later passed as the `Fvar` keyword to the training routine.
Fvar = 2/D

In [16]:
# Convert the NumPy arrays to float64 torch tensors on the selected device.
# `device` (chosen in the first cell) replaces the hard-coded .cuda() so the cell
# also runs on CPU-only machines and matches the CIFAR cells that use .to(device).
X_train = torch.tensor(X_train_np, dtype=torch.float64).to(device)
y_train = torch.tensor(y_train_np, dtype=torch.float64).to(device).reshape(-1, 1)  # column vector
X_test = torch.tensor(X_test_np, dtype=torch.float64).to(device)
y_test = torch.tensor(y_test_np, dtype=torch.float64).to(device).reshape(-1, 1)  # column vector

In [17]:
# Run the fixed-M simulations on MNIST over KVals, lamVals and P_list.
test_errors = LearningCurveExperiments.train_random_feature_models_fixM(
    X_train, y_train, X_test, y_test,
    num_trials, KVals, M, lamVals, P_list, ensErrFuncs,
    nonlinearity=nonlinearity, Fvar=Fvar,
)

Starting trial 1/50
Completed trial 1/50
Starting trial 2/50
Completed trial 2/50
Starting trial 3/50
Completed trial 3/50
Starting trial 4/50
Completed trial 4/50
Starting trial 5/50
Completed trial 5/50
Starting trial 6/50
Completed trial 6/50
Starting trial 7/50
Completed trial 7/50
Starting trial 8/50
Completed trial 8/50
Starting trial 9/50
Completed trial 9/50
Starting trial 10/50
Completed trial 10/50
Starting trial 11/50
Completed trial 11/50
Starting trial 12/50
Completed trial 12/50
Starting trial 13/50
Completed trial 13/50
Starting trial 14/50
Completed trial 14/50
Starting trial 15/50
Completed trial 15/50
Starting trial 16/50
Completed trial 16/50
Starting trial 17/50
Completed trial 17/50
Starting trial 18/50
Completed trial 18/50
Starting trial 19/50
Completed trial 19/50
Starting trial 20/50
Completed trial 20/50
Starting trial 21/50
Completed trial 21/50
Starting trial 22/50
Completed trial 22/50
Starting trial 23/50
Completed trial 23/50
Starting trial 24/50
Complete

In [18]:
# Define the subset size and file directory
subset_size = 30000  # Set your specific subset size
save_dir = "KernelSpectra"

# Load the saved MNIST kernel results for the given subset size
with open(os.path.join(save_dir, f'MNIST_kernel_results_{subset_size}.pkl'), 'rb') as f:
    results = pickle.load(f)

# Retrieve eigenvalues, wbar, and sigma_eps from the dictionary
eigvals = results['eigvals'].reshape(-1)
wbar = results['wbar'].reshape(-1)
sigma_eps = results['sigma_eps']

# Compute theoretical learning curves using the retrieved eigenvalues, wbar, and sigma_eps.
# The third output must be bound to `var_theory`: the save cell stores that name, and a
# misspelled target here would leave the value from the earlier CIFAR section in place.
test_errors_theory, bias_theory, var_theory = LearningCurveExperiments.compute_theoretical_learning_curves_fixM(
    Sigma=eigvals,       # Sigma corresponds to the kernel eigenvalues
    wbar=wbar,           # Ground truth weights from the MNIST task
    KVals=KVals,         # Number of ensemble models
    lamVals=lamVals,     # Ridge regularization values
    P_list=P_list,       # Different training sample sizes
    M=M,                 # Total number of random features
    sigma_eps = sigma_eps,        #noise level in the task.
    returnBiasVariance = True
)

# Print or save the theoretical learning curves as needed
print("Theoretical learning curves computed successfully.")

Theoretical learning curves computed successfully.


In [19]:
# Bundle the MNIST experiment parameters with the numerical and theoretical results.
# NOTE(review): make sure bias_theory / var_theory were produced by the MNIST theory
# cell and are not left over from an earlier section of the notebook.
experiment_results = {
    'num_trials': num_trials,
    'KVals': KVals,
    'M': M,
    'lamVals': lamVals,
    'P_list': P_list,
    'PTest': PTest,
    'nonlinearity': nonlinearity,
    'class_groups': class_groups,  # Class groups for MNIST binarization
    'test_errors': test_errors,  # Numerical results from random feature models
    'test_errors_theory': test_errors_theory,  # Theoretical results
    'bias_theory': bias_theory,
    'var_theory': var_theory,
    'ensErrFuncs': ensErrFuncs,
}

# Fixed filename. NOTE: it encodes no parameters, so re-running this cell with
# different settings overwrites the previous file.
filename = 'RF_MNIST.pkl'

# Define the save path
save_path = '/n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features'

# Create the directory if needed (no error when it already exists).
os.makedirs(save_path, exist_ok=True)

# Save the dictionary as a pickle file
output_file = os.path.join(save_path, filename)
with open(output_file, 'wb') as f:
    pickle.dump(experiment_results, f)

print(f"Experiment saved to {output_file}")

Experiment saved to /n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features/RF_MNIST.pkl


# Figure 3: Scaling Laws for Random Feature Ensembles

## Gaussian Data Experiment Sweeping $\ell$ and $M$.  Adding a sweep over $r$

In [83]:
# Pick up edits to the project modules without restarting the kernel.
# NOTE(review): reload order can matter if these modules import one another --
# confirm that dependencies are reloaded before the modules that use them.
importlib.reload(LearningCurveExperiments)
importlib.reload(EnsembleRFs)
importlib.reload(EnsRFTheory)

<module 'EnsRFTheory' from '/n/home07/bruben/Simulations/Ensemble_DeepLearning/RandomFeatures/EnsRFTheory.py'>

In [84]:
# Configuration for the synthetic sweep over ell and M.
num_trials = 5
# Distinct rounded values of a 6-point log grid from 1e2 to 1e3 (kept as floats).
M_list = np.unique(np.logspace(2, 3, 6).round())
ell_list = np.linspace(0.2, 1, 9)  # 9 evenly spaced values from 0.2 to 1
lamVals = np.logspace(-3, 3, 100)
P = 30000
nonlinearity = None  # Assuming linear model

In [85]:
# Parameters of the synthetic Gaussian task; one experiment is run per entry of r_list.
alpha = 1.5
r_list = [0.4, 0.8, 1.2]
D, sigma_eps = 30000, 0  # D: Dimensionality of data

In [86]:
# Run the ell/M sweep once per value of r and save each run to its own pickle file.
# Everything that does not depend on r (output directory, its creation, and the
# M range used in the filename) is prepared once, before the loop.
save_path = '/n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features'
os.makedirs(save_path, exist_ok=True)  # no error when the directory already exists

# Include key parameters in the filename for clarity
M_min = int(M_list[0])
M_max = int(M_list[-1])

for r in r_list:
    print('Starting r = ' + str(r))

    # Numerical experiments on synthetic data for this r
    test_errors = LearningCurveExperiments.train_random_feature_models_ell_synthetic(
        num_trials,
        ell_list,
        M_list,
        P,
        lamVals,
        D,
        alpha,
        r,
        Fvar=1,
        sigma_eps=sigma_eps
    )

    # Generate sigma_s and w_star for theoretical computations
    sigma_s, w_star = LearningCurveExperiments.makeGaussianParams(D, alpha, r)

    # Perform theoretical computations
    # NOTE: During theory computations, you must use fractional values for N and K to get
    # the true exponents. This is ok because only the ratios matter.
    test_errors_theory = LearningCurveExperiments.compute_theoretical_learning_curves_ell(
        sigma_s.cpu().numpy(),
        w_star.cpu().numpy(),
        ell_list,
        lamVals,
        P,
        M_list,
        sigma_eps=sigma_eps
    )

    # Define the experiment output and parameters as a dictionary
    experiment_results = {
        'num_trials': num_trials,
        'ell_list': ell_list,
        'M_list': M_list,
        'lamVals': lamVals,
        'P': P,
        'nonlinearity': nonlinearity,
        'alpha': alpha,
        'r': r,
        'D': D,
        'sigma_eps': sigma_eps,
        'test_errors': test_errors,
        'test_errors_theory': test_errors_theory
    }

    # One file per r; the name encodes alpha, r, the M range, P and the trial count.
    filename = f'RF_Gaussian_alpha{alpha}_r{r}_M_{M_min}-{M_max}_P_{P}_trials_{num_trials}.pkl'
    output_file = os.path.join(save_path, filename)

    # Save the dictionary as a pickle file
    with open(output_file, 'wb') as f:
        pickle.dump(experiment_results, f)

    print(f"Experiment saved to {output_file}")

Starting r = 0.4
Starting trial 1/5
Starting M = 100.0
Starting M = 158.0
Starting M = 251.0
Starting M = 398.0
Starting M = 631.0
Starting M = 1000.0
Completed trial 1/5
Starting trial 2/5
Starting M = 100.0
Starting M = 158.0
Starting M = 251.0
Starting M = 398.0
Starting M = 631.0
Starting M = 1000.0
Completed trial 2/5
Starting trial 3/5
Starting M = 100.0
Starting M = 158.0
Starting M = 251.0
Starting M = 398.0
Starting M = 631.0
Starting M = 1000.0
Completed trial 3/5
Starting trial 4/5
Starting M = 100.0
Starting M = 158.0
Starting M = 251.0
Starting M = 398.0
Starting M = 631.0
Starting M = 1000.0
Completed trial 4/5
Starting trial 5/5
Starting M = 100.0
Starting M = 158.0
Starting M = 251.0
Starting M = 398.0
Starting M = 631.0
Starting M = 1000.0
Completed trial 5/5
Experiment saved to /n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features/RF_Gaussian_alpha1.5_r0.4_M_100-1000_P_30000_trials_5.pkl
Starting r = 0.8
Starting trial 1/5
Starting M = 100.0
Starting M 

# CIFAR 10 Experiment Sweeping $M$, $\ell$

In [46]:
# Configuration for the CIFAR-10 sweep over M and ell.
num_trials = 10
# Distinct integer M values from a 15-point log grid between 1e2 and 1e3.
M_list = np.unique(np.logspace(2, 3, 15).round()).astype(int)
ell_list = np.linspace(0.1, 1, 10)  # 10 evenly spaced values from 0.1 to 1
lamVals = np.logspace(-4, 2, 50)
P = 50000  # Fixed training sample size -- use the whole training set
PTest = 10000  # Use the whole test set.
nonlinearity = torch.relu  # ReLU random features

# CIFAR-10 location and the two class groups used to binarize the labels.
data_root = '/n/holystore01/LABS/pehlevan_lab/Everyone/cifar'
class_groups = [[0, 1, 7, 8, 9], [2, 3, 4, 5, 6]]

# Load the binarized dataset, requesting flattened, normalized NumPy arrays.
X_train_np, y_train_np, X_test_np, y_test_np = DatasetMaker.get_binarized_CIFAR10(
    data_root, class_groups, flatten=True, normalize=True)

D = X_train_np.shape[1]  # input dimension
Fvar = 2 / D  # Variance for random features

# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Inputs become float64 tensors; targets are additionally reshaped to column vectors.
X_train, X_test = (
    torch.tensor(arr, dtype=torch.float64).to(device)
    for arr in (X_train_np, X_test_np)
)
y_train, y_test = (
    torch.tensor(arr, dtype=torch.float64).to(device).reshape(-1, 1)
    for arr in (y_train_np, y_test_np)
)

# The "P * num_trials <= number of training samples" check used in the N/K sweep is
# deliberately not applied in this cell (it was disabled here). With P set to the whole
# training set it would presumably always fail -- TODO confirm how trials reuse samples.

Files already downloaded and verified
Files already downloaded and verified


In [47]:
# Numerical experiments sweeping ell_list and M_list at fixed P on CIFAR-10.
test_errors = LearningCurveExperiments.train_random_feature_models_ell(
    X_train,
    y_train,
    X_test,
    y_test,
    num_trials,
    ell_list,
    M_list,
    P,
    lamVals,
    nonlinearity=nonlinearity,
    Fvar=Fvar,
)

Starting trial 1/10
Starting M = 100
Starting M = 118
Starting M = 139
Starting M = 164
Starting M = 193
Starting M = 228
Starting M = 268
Starting M = 316
Starting M = 373
Starting M = 439
Starting M = 518
Starting M = 611
Starting M = 720
Starting M = 848
Starting M = 1000
Completed trial 1/10
Starting trial 2/10
Starting M = 100
Starting M = 118
Starting M = 139
Starting M = 164
Starting M = 193
Starting M = 228
Starting M = 268
Starting M = 316
Starting M = 373
Starting M = 439
Starting M = 518
Starting M = 611
Starting M = 720
Starting M = 848
Starting M = 1000
Completed trial 2/10
Starting trial 3/10
Starting M = 100
Starting M = 118
Starting M = 139
Starting M = 164
Starting M = 193
Starting M = 228
Starting M = 268
Starting M = 316
Starting M = 373
Starting M = 439
Starting M = 518
Starting M = 611
Starting M = 720
Starting M = 848
Starting M = 1000
Completed trial 3/10
Starting trial 4/10
Starting M = 100
Starting M = 118
Starting M = 139
Starting M = 164
Starting M = 193
Star

In [48]:
# Pick up edits to the theory and experiment modules without restarting the kernel.
# NOTE(review): reload order can matter if LearningCurveExperiments imports EnsRFTheory -- confirm.
importlib.reload(EnsRFTheory)
importlib.reload(LearningCurveExperiments)

<module 'LearningCurveExperiments' from '/n/home07/bruben/Simulations/Ensemble_DeepLearning/RandomFeatures/LearningCurveExperiments.py'>

In [49]:
# Locate the precomputed CIFAR kernel results for the chosen subset size.
subset_size = 30000
save_dir = "KernelSpectra"
kernel_file = os.path.join(save_dir, f'CIFAR_kernel_results_{subset_size}.pkl')

with open(kernel_file, 'rb') as f:
    results = pickle.load(f)

# Flatten the spectrum and target weights to 1-D; sigma_eps is used as stored.
eigvals, wbar = (results[key].reshape(-1) for key in ('eigvals', 'wbar'))
sigma_eps = results['sigma_eps']

# Theory curves over ell_list and M_list at fixed P.
test_errors_theory = LearningCurveExperiments.compute_theoretical_learning_curves_ell(
    Sigma=eigvals,
    wbar=wbar,
    ell_list=ell_list,
    lamVals=lamVals,
    P=P,
    M_list=M_list,
    sigma_eps=sigma_eps,
)

print("Theoretical learning curves computed successfully.")

Theoretical learning curves computed successfully.


In [50]:
# Bundle the sweep parameters with the numerical and theoretical results so
# the saved file carries everything needed to interpret it.
experiment_results = {
    'num_trials': num_trials,
    'ell_list': ell_list,
    'M_list': M_list,
    'lamVals': lamVals,
    'P': P,
    'PTest': PTest,
    'nonlinearity': nonlinearity,
    'class_groups': class_groups,  # Class groups for CIFAR-10 binarization
    'test_errors': test_errors,          # Numerical results from random feature models
    'test_errors_theory': test_errors_theory  # Theoretical results
}

# Descriptive filename built from the M range, P and the trial count.
# NOTE: the name contains no timestamp, so re-running with the same settings
# overwrites the previously saved file.
M_min = int(M_list[0])
M_max = int(M_list[-1])
filename = f'RF_CIFAR_M_{M_min}-{M_max}_P_{P}_trials_{num_trials}.pkl'

# Define the save path
save_path = '/n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features'

# Create the directory if it is missing; exist_ok=True avoids the racy
# "check, then create" pattern and is a no-op when the directory exists.
os.makedirs(save_path, exist_ok=True)

# Save the dictionary as a pickle file
with open(os.path.join(save_path, filename), 'wb') as f:
    pickle.dump(experiment_results, f)

print(f"Experiment saved to {os.path.join(save_path, filename)}")


Experiment saved to /n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features/RF_CIFAR_M_100-1000_P_50000_trials_10.pkl


# MNIST Experiment Sweeping $M$, $\ell$

In [51]:
# Directory where the MNIST dataset is stored.
data_root = '/n/holystore01/LABS/pehlevan_lab/Lab/bruben/Datasets/MNIST'
# Binary task: digits 0-4 form one group, digits 5-9 the other.
class_groups = [list(range(0, 5)), list(range(5, 10))]

In [52]:
# Fetch the binarized MNIST train/test arrays, requesting flattened and
# normalized inputs; labels are grouped according to class_groups.
X_train_np, y_train_np, X_test_np, y_test_np = DatasetMaker.get_binarized_MNIST(
    data_root,
    class_groups,
    flatten=True,
    normalize=True,
)

In [53]:
# Input dimension: number of columns of the (flattened) training matrix.
D = X_train_np.shape[1]
# Random-feature variance, later passed as `Fvar` to the training routine.
Fvar = 2/D

In [54]:
# Convert data to float64 torch tensors created directly on `device`.
# `device` is chosen in the setup cell ('cuda' when available, else 'cpu'),
# so this no longer fails on CPU-only machines the way an unconditional
# .cuda() call does. Labels are reshaped to column vectors of shape (N, 1).
X_train = torch.tensor(X_train_np, dtype=torch.float64, device=device)
y_train = torch.tensor(y_train_np, dtype=torch.float64, device=device).reshape(-1, 1)
X_test = torch.tensor(X_test_np, dtype=torch.float64, device=device)
y_test = torch.tensor(y_test_np, dtype=torch.float64, device=device).reshape(-1, 1)

In [55]:
# --- Sweep configuration ---------------------------------------------------
num_trials = 10
# Log-spaced feature counts from 1e2 to 1e3, rounded to unique integers.
M_list = np.unique(np.round(np.logspace(2, 3, num=15))).astype(int)
# 10 evenly spaced ell values from 0.1 to 1 (inclusive).
ell_list = np.linspace(0.1, 1.0, num=10)
# Ridge regularization grid: 50 log-spaced values from 1e-4 to 1e2.
lamVals = np.logspace(-4, 2, num=50)
# Fixed training sample size.
# NOTE(review): the standard MNIST training split has 60,000 images, so this
# may not be the whole training set -- confirm what get_binarized_MNIST returns.
P = 50000
# Test-set size recorded alongside the results (the full test set).
PTest = 10000
# ReLU random features.
nonlinearity = torch.relu

# --- Data ------------------------------------------------------------------
# MNIST location and the two digit groups (0-4 vs 5-9) defining the binary task.
data_root = '/n/holystore01/LABS/pehlevan_lab/Lab/bruben/Datasets/MNIST'
class_groups = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]

# Load and preprocess MNIST (flattened, normalized).
X_train_np, y_train_np, X_test_np, y_test_np = DatasetMaker.get_binarized_MNIST(
    data_root,
    class_groups,
    flatten=True,
    normalize=True,
)

# Input dimension and the random-feature variance derived from it.
D = X_train_np.shape[1]
Fvar = 2 / D

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# float64 tensors on `device`; labels become column vectors of shape (N, 1).
X_train = torch.tensor(X_train_np, dtype=torch.float64, device=device)
y_train = torch.tensor(y_train_np, dtype=torch.float64, device=device).reshape(-1, 1)
X_test = torch.tensor(X_test_np, dtype=torch.float64, device=device)
y_test = torch.tensor(y_test_np, dtype=torch.float64, device=device).reshape(-1, 1)

# Disabled check: it would require P * num_trials training samples.
# total_train_samples = P * num_trials
# if X_train.shape[0] < total_train_samples:
#     # Optionally, you can augment the training data here
#     raise ValueError("Not enough training samples for the specified number of trials.")

In [56]:
# Numerical experiment on MNIST: train random-feature models with the
# ell-sweep routine. Positional inputs are the train/test tensors, the trial
# count, the ell and M grids, the fixed sample size P and the ridge grid lamVals.
test_errors = LearningCurveExperiments.train_random_feature_models_ell(
    X_train,
    y_train,
    X_test,
    y_test,
    num_trials,
    ell_list,
    M_list,
    P,
    lamVals,
    nonlinearity=nonlinearity,
    Fvar=Fvar,
)

Starting trial 1/10
Starting M = 100
Starting M = 118
Starting M = 139
Starting M = 164
Starting M = 193
Starting M = 228
Starting M = 268
Starting M = 316
Starting M = 373
Starting M = 439
Starting M = 518
Starting M = 611
Starting M = 720
Starting M = 848
Starting M = 1000
Completed trial 1/10
Starting trial 2/10
Starting M = 100
Starting M = 118
Starting M = 139
Starting M = 164
Starting M = 193
Starting M = 228
Starting M = 268
Starting M = 316
Starting M = 373
Starting M = 439
Starting M = 518
Starting M = 611
Starting M = 720
Starting M = 848
Starting M = 1000
Completed trial 2/10
Starting trial 3/10
Starting M = 100
Starting M = 118
Starting M = 139
Starting M = 164
Starting M = 193
Starting M = 228
Starting M = 268
Starting M = 316
Starting M = 373
Starting M = 439
Starting M = 518
Starting M = 611
Starting M = 720
Starting M = 848
Starting M = 1000
Completed trial 3/10
Starting trial 4/10
Starting M = 100
Starting M = 118
Starting M = 139
Starting M = 164
Starting M = 193
Star

In [57]:
# Re-import the theory and experiment modules inside the running kernel
# (importlib.reload re-executes each module's source).
# NOTE(review): EnsRFTheory is reloaded first, presumably because
# LearningCurveExperiments depends on it -- confirm before reordering.
# The last expression is left bare, so the cell displays the reloaded module.
importlib.reload(EnsRFTheory)
importlib.reload(LearningCurveExperiments)

<module 'LearningCurveExperiments' from '/n/home07/bruben/Simulations/Ensemble_DeepLearning/RandomFeatures/LearningCurveExperiments.py'>

In [58]:
# Directory holding the stored kernel results, and the subset size that
# selects which results file to read.
save_dir = "KernelSpectra"
subset_size = 30000

# Read the pickled MNIST kernel results dictionary for this subset size.
with open(os.path.join(save_dir, f'MNIST_kernel_results_{subset_size}.pkl'), 'rb') as f:
    results = pickle.load(f)

# Unpack what the theory needs; eigvals and wbar are flattened to 1-D.
sigma_eps = results['sigma_eps']
wbar = results['wbar'].reshape(-1)
eigvals = results['eigvals'].reshape(-1)

# Theoretical learning curves on the same ell / M / ridge grids and the same
# fixed sample size P as the numerical sweep.
#   Sigma     - kernel eigenvalues
#   wbar      - ground-truth weights of the task (from the MNIST results file)
#   sigma_eps - noise level of the task
test_errors_theory = LearningCurveExperiments.compute_theoretical_learning_curves_ell(
    Sigma=eigvals,
    wbar=wbar,
    sigma_eps=sigma_eps,
    P=P,
    M_list=M_list,
    ell_list=ell_list,
    lamVals=lamVals,
)

# Completion message.
print("Theoretical learning curves computed successfully.")

Theoretical learning curves computed successfully.


In [59]:
# Bundle the sweep parameters with the numerical and theoretical results so
# the saved file carries everything needed to interpret it.
experiment_results = {
    'num_trials': num_trials,
    'ell_list': ell_list,
    'M_list': M_list,
    'lamVals': lamVals,
    'P': P,
    'PTest': PTest,
    'nonlinearity': nonlinearity,
    'class_groups': class_groups,  # Class groups for MNIST binarization
    'test_errors': test_errors,          # Numerical results from random feature models
    'test_errors_theory': test_errors_theory  # Theoretical results
}

# Descriptive filename built from the M range, P and the trial count.
# NOTE: the name contains no timestamp, so re-running with the same settings
# overwrites the previously saved file.
M_min = int(M_list[0])
M_max = int(M_list[-1])
filename = f'RF_MNIST_M_{M_min}-{M_max}_P_{P}_trials_{num_trials}.pkl'

# Define the save path
save_path = '/n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features'

# Create the directory if it is missing; exist_ok=True avoids the racy
# "check, then create" pattern and is a no-op when the directory exists.
os.makedirs(save_path, exist_ok=True)

# Save the dictionary as a pickle file
with open(os.path.join(save_path, filename), 'wb') as f:
    pickle.dump(experiment_results, f)

print(f"Experiment saved to {os.path.join(save_path, filename)}")

Experiment saved to /n/holystore01/LABS/pehlevan_lab/Lab/bruben/Ensemble_Random_Features/RF_MNIST_M_100-1000_P_50000_trials_10.pkl
