# Week 9: Scaling Laws & Emergent Behaviors

## Strategy: Scaling the Ensemble
Module 20 focuses on Scaling Laws. In BBO with sparse data (18 points), we cannot scale data, but we can scale **Model Capacity** and **Compute**.
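1. **Massive Ensembling:** We scale from 3 models to **20 Neural Networks** (Bagging). For approximately independent ensemble members, the variance of the ensemble-mean prediction shrinks roughly in proportion to 1/N, so the larger ensemble should give a more stable mean and a less noisy uncertainty estimate. Because bagged members trained on the same 18 points are correlated, the real reduction will be smaller than this ideal.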
2. **Emergent Robustness:** We look for emergent stability in high-dimensional functions (Func 8) where smaller ensembles failed to converge.
3. **Repulsion & Trust Regions:** We maintain the Week 7 fixes to prevent boundary saturation.

In [1]:
import numpy as np
import warnings
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
from sklearn.preprocessing import StandardScaler
from scipy.optimize import minimize
import sys
import os

# Ensure we can import from src: add the parent of the current working
# directory to the module search path so the project-local `src` package resolves.
sys.path.append(os.path.abspath('..'))
from src.utils import load_data

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Seed NumPy's global RNG so draws made through `np.random` (e.g. the random
# start perturbation in the optimizer below) are repeatable between runs.
np.random.seed(49) # Week 9 Seed

print("Ready for Scaled Optimization")

Ready for Scaled Optimization


In [2]:
def suggest_next_point_scaled(func_id, X_train, y_train, *, n_estimators=20,
                              kappa=1.96, nn_weight=0.6, radius=0.2,
                              repulsion_strength=10.0, repulsion_width=0.1):
    """Suggest the next query point by maximizing a blended NN-ensemble/GP UCB.

    A bagged ensemble of MLP regressors and a Gaussian process are fitted on
    standardized inputs and targets. Their means and standard deviations are
    blended into an upper-confidence-bound acquisition, which is maximized
    with L-BFGS-B inside a box around the best observed point. A Gaussian
    penalty centred on that best point pushes the optimizer away from simply
    re-proposing it.

    Args:
        func_id: Identifier of the function; used in the progress message and
            to pick the MLP architecture (7 and 8 get the larger network).
        X_train: Array-like of shape (n_samples, n_dims) with observed inputs.
            The trust region is clipped to [0, 1] per dimension, so inputs
            are presumably expected to live in the unit hypercube.
        y_train: Array-like with n_samples observed outputs (maximized).
        n_estimators: Number of bagged MLP regressors.
        kappa: Exploration multiplier on the blended standard deviation.
        nn_weight: Weight of the ensemble in the blend; the GP receives
            ``1 - nn_weight``.
        radius: Half-width of the trust region, in original input units.
        repulsion_strength: Height of the penalty at the best observed point.
        repulsion_width: Standard deviation of the penalty, in scaled units.

    Returns:
        1-D ``numpy.ndarray`` of length n_dims in original units, clipped to
        [0, 1].

    Raises:
        ValueError: If ``X_train`` is not 2-D or its row count differs from
            the number of targets.
    """
    print(f"--- Optimizing Function {func_id} (Scaled Ensemble N={n_estimators}) ---")

    # Accept lists as well as arrays, and fail early with a readable message.
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).ravel()
    if X_train.ndim != 2:
        raise ValueError("X_train must be a 2-D array of shape (n_samples, n_dims)")
    if X_train.shape[0] != y_train.shape[0]:
        raise ValueError("X_train and y_train must contain the same number of samples")

    # 1. Preprocessing: standardize inputs and targets.
    scaler_x = StandardScaler()
    X_scaled = scaler_x.fit_transform(X_train)
    scaler_y = StandardScaler()
    y_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).flatten()

    # 2. Base model (architecture from Week 7 tuning). Only the layer sizes
    # depend on the function; the L2 penalty is shared.
    hidden_layers = (128, 64) if func_id in (7, 8) else (64, 32)
    alpha = 0.01
    base_mlp = MLPRegressor(hidden_layer_sizes=hidden_layers, alpha=alpha,
                            activation='tanh', solver='lbfgs', max_iter=2000)

    # 3. SCALING: bagged ensemble of MLPs.
    regr = BaggingRegressor(estimator=base_mlp, n_estimators=n_estimators,
                            random_state=42, n_jobs=-1)
    regr.fit(X_scaled, y_scaled)

    # GP anchor fitted on the same standardized data.
    kernel = (ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5)
              + WhiteKernel(noise_level=0.1))
    gp_model = GaussianProcessRegressor(kernel=kernel, normalize_y=False)
    gp_model.fit(X_scaled, y_scaled)

    # 4. Objective: centre everything on the best observation so far.
    best_idx = np.argmax(y_train)
    x_best = X_train[best_idx]
    x_best_scaled = scaler_x.transform(x_best.reshape(1, -1)).flatten()
    gp_weight = 1.0 - nn_weight

    def objective_function(x):
        x_row = x.reshape(1, -1)

        # Query each member separately: the spread across members is the
        # ensemble's uncertainty estimate.
        nn_preds = np.array([est.predict(x_row)[0] for est in regr.estimators_])
        gp_mean, gp_std = gp_model.predict(x_row, return_std=True)

        # Combined UCB.
        comb_mean = nn_weight * np.mean(nn_preds) + gp_weight * gp_mean[0]
        comb_std = nn_weight * np.std(nn_preds) + gp_weight * gp_std[0]
        ucb = comb_mean + kappa * comb_std

        # Repulsion penalty: Gaussian bump centred on the incumbent.
        dist_sq = np.sum((x_row - x_best_scaled) ** 2)
        penalty = repulsion_strength * np.exp(-dist_sq / (2 * repulsion_width ** 2))

        # minimize() minimizes, so negate the acquisition.
        return -ucb + penalty

    # 5. Trust region: a box of +/- radius around the incumbent in original
    # units, intersected with [0, 1], then mapped into scaled space.
    lower = (np.maximum(0.0, x_best - radius) - scaler_x.mean_) / scaler_x.scale_
    upper = (np.minimum(1.0, x_best + radius) - scaler_x.mean_) / scaler_x.scale_
    bounds_scaled = list(zip(lower, upper))

    # Perturbed start, kept inside the trust region so the optimizer begins
    # from a feasible point.
    x_init = x_best_scaled + np.random.uniform(-0.1, 0.1, size=x_best_scaled.shape)
    x_init = np.clip(x_init, lower, upper)

    res = minimize(fun=objective_function, x0=x_init, method='L-BFGS-B',
                   bounds=bounds_scaled, options={'maxiter': 100})

    # Map back to original units; the clip guards against round-off overshoot.
    x_next = scaler_x.inverse_transform(res.x.reshape(1, -1)).flatten()
    return np.clip(x_next, 0.0, 1.0)

In [3]:
# Maps each function id to the next query point proposed for it.
submission_queries = {}
print(f"{'Func':<5} | {'Optimizing...'}")
print("-" * 30)

# NOTE: each dataset should already hold 18 points (10+8) before this runs.
for func_id in range(1, 9):
    X_known, y_known = load_data(func_id)
    next_x = suggest_next_point_scaled(func_id, X_known, y_known)
    submission_queries[func_id] = next_x

# Banner framing the block of submission strings.
banner = "=" * 30
print("\n" + banner)
print("FORMATTED SUBMISSION OUTPUT")
print(banner)

# One line per function: coordinates to six decimals, joined by hyphens.
for func_id, x_val in submission_queries.items():
    formatted_str = "-".join(f"{val:.6f}" for val in x_val)
    print(f"function_number: {func_id}: {formatted_str}")


FORMATTED SUBMISSION OUTPUT
function_number: 1: 0.531024-0.533000
function_number: 2: 0.868886-1.000000
function_number: 3: 0.342138-0.774352-0.617345
function_number: 4: 0.220631-0.401128-0.452145-0.431712
function_number: 5: 1.000000-0.892988-1.000000-1.000000
function_number: 6: 0.288748-0.378009-0.608719-0.969796-0.152636
function_number: 7: 0.000000-0.243627-0.458460-0.050169-0.156875-0.888096
function_number: 8: 0.000000-0.000000-0.000000-0.013207-0.274043-0.202089-0.000000-0.467955
