# LassoNet Tuning (ohne Wiederholungsfits, ρ=0.6)

**Änderungen gemäß Vorgabe:**  
- **Keine Wiederholungsfits** (nur ein Fit pro Konfiguration).  
- **ρ = 0.6** in der Datengenerierung (Simulation‑1‑Stil).  
- **Kein Stabilitätskriterium**.  
- **Auswahlregel:** *minimiere mBIC2*, dann *mBIC*, dann *Laufzeit*, dann *k*.  
- **Laufzeit-Filter:** Ein Fit wird **nur berücksichtigt**, wenn seine Laufzeit `< 20s` ist.

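> Hinweis: Dieses Notebook erwartet ein `model_selection.py` im Arbeitsverzeichnis, das `lassonet` und `ModelSelResult` bereitstellt. `lassonet` muss die Parameter (`model`, `M`, `path_multiplier`, `hidden_dims`, `n_iters`, `early_patience`, `random_state`) akzeptieren; das zurückgegebene Ergebnis muss die Attribute `mBIC`, `mBIC2`, `model1` und `model2` besitzen.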


In [9]:

# Imports & helpers
import os, json, time, random
from itertools import product
import numpy as np

# Import user module (assumes model_selection.py is on the Python path / cwd)
from model_selection import lassonet, ModelSelResult

# --- Search space ---
# Each key is forwarded by evaluate_config_once to `lassonet` as the keyword
# argument of the same name; each list holds the candidate values to try.
SPACE = dict(
    M=[10, 20, 30, 40, 60],
    path_multiplier=[1.04, 1.06, 1.08, 1.12, 1.20],
    hidden_dims=[(4,), (8,), (16,)],
    n_iters=[80, 100, 150],
    early_patience=[2, 3, 5],
)

def all_combinations(space):
    """Expand a search space into its full Cartesian grid.

    Parameters
    ----------
    space : dict
        Maps each parameter name to an iterable of candidate values.

    Returns
    -------
    tuple
        ``(configs, names)`` where ``configs`` is a list holding one
        ``{name: value}`` dict per grid point (the last parameter varies
        fastest) and ``names`` is the list of parameter names in the
        iteration order of ``space``.
    """
    names = list(space)
    grid = []
    for values in product(*(space[name] for name in names)):
        grid.append(dict(zip(names, values)))
    return grid, names


## Daten erzeugen (Simulation 1 Stil; ρ=0.6)

In [10]:

from scipy.linalg import toeplitz
from sklearn.preprocessing import scale

# Simulation-1 parameters (single data set, no repetitions)
n, p = 500, 1000     # number of observations and number of predictors
rho = 0.6            # base of the AR(1) correlation (as specified)
k_true = 10          # number of non-zero coefficients; may be changed manually
seed_data = 19091303

# One generator drives every random draw of the data-generation cell.
rng = np.random.default_rng(seed=seed_data)

def ar1_cor(p, rho):
    """Return the ``p`` x ``p`` matrix whose (i, j) entry is ``rho ** |i - j|``."""
    first_column = np.power(rho, np.arange(p))
    return toeplitz(first_column)

# True model: pick k_true distinct active predictors and give them N(0, 1)
# coefficients; every other coefficient stays zero.
p_idx = np.arange(p)
true_support = rng.choice(p_idx, k_true, replace=False) if k_true > 0 else np.array([], dtype=int)
beta = np.zeros(p)
if k_true > 0:
    beta[true_support] = rng.normal(0, 1, size=k_true)

# AR(1) covariance and data: n rows drawn from N(0, Sigma), plus N(0, 1) noise.
# NOTE: the order of the rng calls determines the simulated data; do not reorder.
Sigma = ar1_cor(p, rho)
X_raw = rng.multivariate_normal(mean=np.zeros(p), cov=Sigma, size=n)
y_raw = X_raw @ beta + rng.normal(0, 1, size=n)

# Scaling as in your simulation: X is passed through sklearn's `scale` and then
# divided by sqrt(n); y is passed through `scale` only.
X = np.ascontiguousarray(scale(X_raw) / np.sqrt(n), dtype=np.float64)
y = np.ascontiguousarray(scale(y_raw), dtype=np.float64)

print("X shape:", X.shape, " y shape:", y.shape, "rho:", rho, "k_true:", k_true)


X shape: (500, 1000)  y shape: (500,) rho: 0.6 k_true: 10


## Evaluationslogik (ohne Seeds; Laufzeit-Filter < 20s)

In [11]:

RUNTIME_LIMIT = 20.0  # seconds; only fits with time_sec strictly below this compete for "best"

def evaluate_config_once(X, y, cfg, model="linear"):
    """Fit ``lassonet`` once for one configuration and report scores and runtime.

    Parameters
    ----------
    X, y
        Predictors and response; passed on as ``lassonet(y, X, ...)``.
    cfg : dict
        Must contain the keys ``"M"``, ``"path_multiplier"``, ``"hidden_dims"``,
        ``"n_iters"`` and ``"early_patience"``; each is forwarded to
        ``lassonet`` under the same keyword name.
    model : str
        Forwarded to ``lassonet`` unchanged.

    Returns
    -------
    dict
        All entries of ``cfg`` plus

        - ``"mBIC2"``: ``res.mBIC2`` as float (primary selection score)
        - ``"mBIC"``: ``res.mBIC`` as float (secondary selection score)
        - ``"time_sec"``: elapsed seconds of the ``lassonet`` call
        - ``"k"``: number of entries in ``res.model2``, or in ``res.model1``
          when mBIC2 is not finite
    """
    # perf_counter is monotonic; time.time() follows the wall clock and can jump
    # when the system clock is adjusted, which would distort the runtime filter.
    started = time.perf_counter()
    res: ModelSelResult = lassonet(
        y, X, model=model,
        M=cfg["M"],
        path_multiplier=cfg["path_multiplier"],
        hidden_dims=cfg["hidden_dims"],
        n_iters=cfg["n_iters"],
        early_patience=cfg["early_patience"],
        random_state=0,  # same fixed seed for every configuration
    )
    elapsed = time.perf_counter() - started

    score2 = float(res.mBIC2)
    score1 = float(res.mBIC)

    # The support size follows the primary criterion (mBIC2); it falls back to
    # the mBIC model only when mBIC2 is not finite. Only the length is needed,
    # so the indices are not shifted.
    selected = res.model2 if np.isfinite(score2) else res.model1
    k = int(len(np.asarray(selected, dtype=int)))

    return {
        **cfg,
        "mBIC2": score2,
        "mBIC": score1,
        "time_sec": float(elapsed),
        "k": k,
    }



def better(a, b):
    """Return True if record ``a`` ranks strictly ahead of record ``b``.

    Criteria are applied in order: ``"mBIC2"``, ``"mBIC"``, ``"time_sec"``
    (each compared with an absolute tolerance of 1e-9) and finally ``"k"``
    (exact comparison). Smaller is better for all of them. A later criterion
    is consulted only when all earlier ones are tied.

    Non-finite mBIC2 / mBIC values (NaN, +inf, -inf) are treated as ``+inf``,
    i.e. as the worst possible score. Two non-finite scores count as a tie, so
    the next criterion decides.

    Parameters
    ----------
    a, b : dict
        Records with the keys ``"mBIC2"``, ``"mBIC"``, ``"time_sec"``, ``"k"``.

    Returns
    -------
    bool
    """
    tol = 1e-9

    def as_score(value):
        # Map NaN / +-inf to +inf so that a broken score never wins.
        return value if np.isfinite(value) else np.inf

    def tied(x, y):
        # Test equality first: inf - inf is NaN, so the tolerance test alone
        # would never recognise two non-finite scores as a tie.
        return x == y or abs(x - y) <= tol

    ranked_pairs = (
        (as_score(a["mBIC2"]), as_score(b["mBIC2"])),
        (as_score(a["mBIC"]), as_score(b["mBIC"])),
        (a["time_sec"], b["time_sec"]),
    )
    for left, right in ranked_pairs:
        if not tied(left, right):
            return bool(left < right)

    # Everything above is tied: prefer the smaller support.
    return bool(a["k"] < b["k"])



## Tuning ausführen (ein Datensatz; keine Wiederholungsfits)

In [12]:

# Run configuration
model = "linear"                  # "linear" or "logistic"
max_configs = 40                  # size of the random subset taken from the full grid
global_seed = 12345

# Candidates: shuffle the full grid reproducibly and keep the first max_configs
all_cands, keys = all_combinations(SPACE)
rnd = random.Random(global_seed)
rnd.shuffle(all_cands)
candidates = all_cands[:max_configs]

print(f"Geplante Konfigurationen: {len(candidates)} von {len(all_cands)} (voll = 675)")

# Evaluate every candidate exactly once
rows = []
best = None
under_limit = 0
for i, cfg in enumerate(candidates, start=1):
    rec = evaluate_config_once(X, y, cfg, model=model)
    rows.append(rec)
    # Every fit is logged in rows, but only fits faster than the runtime
    # limit may become the best configuration.
    if rec["time_sec"] < RUNTIME_LIMIT:
        under_limit += 1
        best = rec if (best is None or better(rec, best)) else best
    print(f"[{i:02d}] time={rec['time_sec']:.3f}s  mBIC2={rec['mBIC2']:.3f}  mBIC={rec['mBIC']:.3f}  k={rec['k']}")

print(f"Unter Laufzeitlimit (<{RUNTIME_LIMIT}s): {under_limit} von {len(candidates)} geprüften Konfigurationen.")
if best is not None:
    print("\nBeste Konfiguration (unter Laufzeitlimit):")
    print({name: best[name] for name in keys})
    print({name: best[name] for name in ("mBIC2", "mBIC", "time_sec", "k")})
else:
    print("Kein Fit unter Laufzeitlimit. Bitte max_configs reduzieren oder Parameterraum anpassen.")


Geplante Konfigurationen: 40 von 675 (voll = 675)


NameError: name 'BATCH_SIZE' is not defined

## Ergebnisse speichern

In [None]:

import pandas as pd
# Write the per-configuration table (CSV) and the winning configuration (JSON).
out_dir = "tune_results_rho_0_6"
os.makedirs(out_dir, exist_ok=True)
csv_path = os.path.join(out_dir, "lassonet_tuning_results.csv")
json_path = os.path.join(out_dir, "lassonet_best_config.json")

df = pd.DataFrame(rows)
df.to_csv(csv_path, index=False)

# Best result (if any). Best effort: any failure while assembling the summary
# is recorded as "no configuration under the time limit" instead of aborting.
best_json = None
try:
    if best is None:
        best_json = {"note": "no_config_under_time_limit"}
    else:
        best_json = {
            "best_cfg": {name: best[name] for name in keys},
            "mBIC2": float(best["mBIC2"]),
            "mBIC": float(best["mBIC"]),
            "time_sec": float(best["time_sec"]),
            "k": int(best["k"]),
            "rho": float(rho),
            "n": int(n),
            "p": int(p),
            "k_true": int(k_true),
        }
except Exception:
    best_json = {"note": "no_config_under_time_limit"}

with open(json_path, "w", encoding="utf-8") as f:
    json.dump(best_json, f, ensure_ascii=False, indent=2)
print("Gespeichert:", csv_path)
print("Gespeichert:", json_path)


## Optional: Batch über k ∈ {0,10,40} (je 1 Simulation, kein Seed-Repeat)

In [None]:

# Batch run: simulates a fresh data set for every k (one fit per configuration,
# no repeated fits) and applies the same selection logic as the single run.
# NOTE: the loop rebinds the module-level names rng, p_idx, true_support, beta,
# X_raw, y_raw and rnd; X and y are left untouched.
k_list = [0, 10, 40]
batch_rows = []
report_cols = [*keys, "mBIC2", "mBIC", "time_sec", "k"]

# The AR(1) covariance depends only on p and rho, so it is built once instead of
# once per k; ar1_cor keeps it identical to the single-data-set cell.
Sigma = ar1_cor(p, rho)

for k_val in k_list:
    # Regenerate the data; offsetting the seed by k keeps every k deterministic
    rng = np.random.default_rng(seed_data + k_val)
    p_idx = np.arange(p)
    true_support = rng.choice(p_idx, k_val, replace=False) if k_val > 0 else np.array([], dtype=int)
    beta = np.zeros(p)
    if k_val > 0:
        beta[true_support] = rng.normal(0, 1, size=k_val)
    X_raw = rng.multivariate_normal(mean=np.zeros(p), cov=Sigma, size=n)
    y_raw = X_raw @ beta + rng.normal(0, 1, size=n)
    Xk = np.ascontiguousarray(scale(X_raw) / np.sqrt(n), dtype=np.float64)
    yk = np.ascontiguousarray(scale(y_raw), dtype=np.float64)

    # Select candidates: an independent, reproducible shuffle per k
    rnd = random.Random(global_seed + k_val)
    cands = all_cands.copy()
    rnd.shuffle(cands)
    cands = cands[:min(max_configs, len(cands))]

    # Evaluate; only fits under the runtime limit compete for best_k
    best_k = None
    under_limit_k = 0
    for cfg in cands:
        rec = evaluate_config_once(Xk, yk, cfg, model=model)
        if rec["time_sec"] < RUNTIME_LIMIT:
            under_limit_k += 1
            if best_k is None or better(rec, best_k):
                best_k = rec

    # One summary row per k; every reported column is None when no fit qualified
    row = {"k_true": k_val, "under_limit": under_limit_k}
    for col in report_cols:
        row[col] = None if best_k is None else best_k[col]
    batch_rows.append(row)

import pandas as pd
df_batch = pd.DataFrame(batch_rows)
display(df_batch)
df_batch.to_csv(os.path.join(out_dir, "batch_k_0_10_40.csv"), index=False)
print("Gespeichert:", os.path.join(out_dir, "batch_k_0_10_40.csv"))
