# Depuración de activaciones para L1 / L2
Este notebook ejecuta experimentos controlados (un paso de entrenamiento) para comparar cómo afectan L1 y L2 con pesos extremos a las activaciones ocultas y al número de estados únicos.
Usará utilidades existentes en `src.analysis` para contar estados únicos (rounding + np.unique).

In [3]:
# Imports y helpers
import os, sys
ROOT = os.path.abspath(os.path.join(os.getcwd()))
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)
PARENT_ROOT = os.path.abspath(os.path.join(ROOT, '..'))
if PARENT_ROOT not in sys.path:
    sys.path.insert(0, PARENT_ROOT)
import torch
import numpy as np
import pandas as pd
from src.utils.seed import set_seed
from src.models import build_model
from src.regularizers.l1 import L1Regularizer
from src.regularizers.l2 import L2Regularizer
from src.datasets.fashion_mnist import build_fashion_mnist
from src.analysis.analysis import count_unique_states_rounding

device = torch.device('cuda')

def collect_hidden_activations(model, loader, max_samples=2000, device=torch.device('cpu')):
    model.eval()
    acts = []
    with torch.no_grad():
        seen = 0
        for xb, yb in loader:
            xb = xb.to(device)
            # model may expect (N,1,28,28) for FashionMNIST; SimpleFFW flattens
            if hasattr(model, 'get_hidden'):
                h = model.get_hidden(xb)
            else:
                out = model(xb, return_features=True)
                if isinstance(out, tuple):
                    _, h = out
                else:
                    # fallback: take output as features
                    h = out
            h = h.detach().cpu().numpy()
            acts.append(h)
            seen += h.shape[0]
            if seen >= max_samples:
                break
    if not acts:
        return np.empty((0,0))
    X = np.vstack(acts)[:max_samples]
    return X

def run_one(reg_type=None, weight=0.0, seed=42, hidden_dim=64, sample_limit=2000):
    set_seed(seed)
    # build small model for speed
    m = build_model('simple_ffw', dataset='fashion_mnist', input_dim=28*28, hidden_dim=hidden_dim, num_classes=10, dropout=0.0)
    m.to(device)

    # prepare data (small subset)
    data_dir = os.path.join(ROOT, 'data')
    train_loader, test_loader = build_fashion_mnist(data_dir=data_dir, batch_size=128, train_split=1.0, subset_ratio=0.05, test_subset_ratio=0.05, seed=seed)
    # pick a deterministic small batch for the single training step
    xb, yb = next(iter(train_loader))
    xb, yb = xb.to(device), yb.to(device)

    # compute parameter norms before step
    with torch.no_grad():
        param_l1_before = sum(p.abs().sum().item() for p in m.parameters())
        param_l2_before = sum((p**2).sum().item() for p in m.parameters())

    opt = torch.optim.SGD(m.parameters(), lr=0.01)
    loss_fn = torch.nn.CrossEntropyLoss()

    # regularizer penalty
    penalty = torch.tensor(0.0, dtype=torch.float32)
    if reg_type == 'l1':
        reg = L1Regularizer(weight=weight)
        penalty = reg.penalty(m)
    elif reg_type == 'l2':
        reg = L2Regularizer(weight=weight)
        penalty = reg.penalty(m)

    # one forward/backward/step
    m.train()
    out = m(xb)
    if isinstance(out, tuple):
        out = out[0]
    loss = loss_fn(out, yb)
    total = loss + penalty.to(loss.device)
    opt.zero_grad()
    total.backward()
    opt.step()

    # compute parameter norms after step
    with torch.no_grad():
        param_l1_after = sum(p.abs().sum().item() for p in m.parameters())
        param_l2_after = sum((p**2).sum().item() for p in m.parameters())

    # collect hidden activations from test set (or subset)
    X = collect_hidden_activations(m, test_loader, max_samples=sample_limit, device=device)
    if X.size == 0:
        n_states = 0
    else:
        n_states, reps = count_unique_states_rounding(X, decimals=5, normalize_l2=True)

    return {
        'reg_type': reg_type or 'baseline',
        'weight': weight,
        'n_states': int(n_states),
        'param_l1_before': float(param_l1_before),
        'param_l2_before': float(param_l2_before),
        'param_l1_after': float(param_l1_after),
        'param_l2_after': float(param_l2_after),
    }

# quick smoke run to verify everything loads
print('Imports OK, device=', device)

Imports OK, device= cuda


In [4]:
# Run experiments for selected configs
configs = [
    ('baseline', None, 0.0),
    ('l1', 1.0, 1.0),
    ('l1', 0.1, 0.1),
    ('l2', 1.0, 1.0),
    ('l2', 0.1, 0.1),
]
results = []
for name, rtype, w in configs:
    print('Running', name, rtype, w)
    if rtype is None:
        res = run_one(reg_type=None, weight=0.0, seed=1234, hidden_dim=64, sample_limit=1200)
    else:
        res = run_one(reg_type=rtype, weight=w, seed=1234, hidden_dim=64, sample_limit=1200)
    results.append(res)

df = pd.DataFrame(results)
print()
print(df)

Running baseline None 0.0
Running l1 1.0 1.0
Running l1 0.1 0.1
Running l2 1.0 1.0
Running l2 0.1 0.1

   reg_type  weight  n_states  param_l1_before  param_l2_before  \
0  baseline     0.0       500       937.440515         24.72053   
1       1.0     1.0       500       937.440515         24.72053   
2       0.1     0.1       500       937.440515         24.72053   
3       1.0     1.0       500       937.440515         24.72053   
4       0.1     0.1       500       937.440515         24.72053   

   param_l1_after  param_l2_after  
0      937.417711        24.71788  
1      937.417711        24.71788  
2      937.417711        24.71788  
3      937.417711        24.71788  
4      937.417711        24.71788  
