# Prep_Results_Sim1
Dieses Notebook lädt die in `Results1/` (ersatzweise, falls dieser Ordner fehlt, in `CheckResults1/`) abgelegten Simulationsergebnisse
aus **Sim1** und bereitet Aggregat- und Long-Form-Tabellen analog zum bereitgestellten R‑Skript auf.

**Was entsteht:**
- `df` (aggregierte Metriken pro (k, rho, Methode))
- `df_mBIC`, `df_mBIC2`, `df_Runtime` (Long‑Format auf Instanzebene)
- Wide-Tabellen via `pivot_table` analog zu `dcast(...)`:
  - `tab_Better`, `tab_Better2`, `tab_Worse`, `tab_Worse2`
  - `tabFP`, `tabFDR2`, `tabPower`, `tabPower2`
  - `tab_m`, `tabsem`, `tab_m2`, `tabsem2`, `tab_runtime`
  
Alle Tabellen können optional als CSV gespeichert werden.


In [None]:

import os
import glob
import pickle
import numpy as np
import pandas as pd

# Display options: raise pandas' printing limits so wide/long tables are shown in full.
pd.set_option(
    'display.max_rows', 200,
    'display.max_columns', 200,
    'display.width', 200,
)


## Konfiguration & Helper

In [None]:

# Candidate result folders, listed in order of preference.
RESULTS_DIRS = ["Results1", "CheckResults1"]
# Pick the first candidate that exists as a directory; if none exists,
# fall back to the first entry so later messages still name a folder.
RESULTS_DIR = next(filter(os.path.isdir, RESULTS_DIRS), RESULTS_DIRS[0])
print("Nutze Ergebnisordner:", RESULTS_DIR)

# Scenario grid (only for output/pivots; the files themselves are discovered dynamically)
k_vec = [0, 5, 10, 20, 40]
rho_vec = [0.0, 0.5, 0.8]

# Threshold as in the R code (numerical tolerance)
EPS = 1e-8

def load_all_pickles(results_dir):
    """Load every 'Sim1.k_<k>.rho_<rho>.pkl' pickle found in results_dir.

    Each payload is treated as a dict and returned with three keys set:
    'k' and 'rho' (the payload's own values when present, otherwise the
    values parsed from the file name) and '__file__' (the source path).

    The returned list is ordered numerically by (k, rho). Sorting the file
    names alone would be lexicographic and place e.g. 'k_10' before 'k_5',
    which would scramble any numbering derived from the list order.

    Files that cannot be read, unpickled or whose name cannot be parsed are
    reported on stdout and skipped, so one broken file does not abort the run.

    NOTE: pickle.load can execute arbitrary code contained in the file.
    Only use this on result folders you trust.
    """
    pattern = os.path.join(results_dir, "Sim1.k_*.rho_*.pkl")
    # Sorted file names give a deterministic tie-break for the numeric sort below.
    files = sorted(glob.glob(pattern))
    if not files:
        print("Keine Dateien gefunden unter", pattern)
    items = []
    for fn in files:
        try:
            with open(fn, 'rb') as f:
                data = pickle.load(f)
            # Parse k and rho from the file name as a safety net.
            # Expected format: Sim1.k_<k>.rho_<rho>.pkl
            base = os.path.basename(fn)
            k_str = base.split('k_')[1].split('.rho_')[0]
            rho_str = base.split('rho_')[1].split('.pkl')[0]
            k = int(k_str)
            rho = float(rho_str)
            # Values stored in the payload take precedence over the file name.
            data['k'] = int(data.get('k', k))
            data['rho'] = float(data.get('rho', rho))
            data['__file__'] = fn
            items.append(data)
        except Exception as e:
            # Deliberately broad: loading is best-effort, failures are reported and skipped.
            print(f"Fehler beim Laden von {fn}: {e}")
    # Stable numeric ordering by scenario parameters.
    items.sort(key=lambda item: (item['k'], item['rho']))
    return items

def as_df_long_diff(diff_mat, method_names, instance_offset, scenario_id, value_name):
    """Reshape a 2-D (simulations x methods) matrix into long format.

    The result has one row per (simulation, method) pair with the columns
    'instance' (row index plus instance_offset), 'scenario' (scenario_id),
    'variable' (the method name) and value_name (the matrix entry).
    """
    n_sims, _ = diff_mat.shape
    wide = pd.DataFrame(diff_mat, columns=method_names).assign(
        instance=np.arange(n_sims) + instance_offset,
        scenario=scenario_id,
    )
    return wide.melt(
        id_vars=['instance', 'scenario'],
        var_name='variable',
        value_name=value_name,
    )

def sem_from_sd(sd, n):
    """Return the standard error sd / sqrt(n), treating any n below 1 as 1."""
    effective_n = max(n, 1)
    root_n = np.sqrt(effective_n)
    return sd / root_n


## Laden & Aufbereiten (analog R)

In [None]:

# Load every result block found in the selected results folder.
all_data = load_all_pickles(RESULTS_DIR)

# Containers for the aggregated rows and the per-block long-form frames
df_rows = []
df_mBIC_list = []
df_mBIC2_list = []
df_Runtime_list = []

# Running counters: instance_offset keeps instance ids unique across blocks,
# scenario_id numbers the blocks in the order they are processed.
instance_offset = 0
scenario_id = 0

for block in all_data:
    # Per-block result matrices; each is indexed below as (simulation, method).
    mBIC_results  = np.asarray(block['mBIC_results'])
    mBIC2_results = np.asarray(block['mBIC2_results'])
    mBIC_FP       = np.asarray(block['mBIC_FP'])
    mBIC2_FP      = np.asarray(block['mBIC2_FP'])
    mBIC_TP       = np.asarray(block['mBIC_TP'])
    mBIC2_TP      = np.asarray(block['mBIC2_TP'])
    runtime       = np.asarray(block['runtime'])
    method_names  = list(block['method_names'])
    k = int(block['k'])
    rho = float(block['rho'])
    
    sim_nr, nr_methods = mBIC_results.shape
    # Baseline = first method column (stepwise_plain); the [0] list index
    # keeps it 2-D so it broadcasts against all method columns.
    stepwise = mBIC_results[:, [0]]
    stepwise2 = mBIC2_results[:, [0]]
    
    res  = mBIC_results  - stepwise      # differences to the baseline (mBIC)
    res2 = mBIC2_results - stepwise2     # differences to the baseline (mBIC2)
    
    # Count and quality measures analogous to the R code.
    # A difference only counts as worse/better if it exceeds the tolerance EPS.
    Worse  = (res  >  EPS).sum(axis=0)
    Better = (res  < -EPS).sum(axis=0)
    Worse2 = (res2 >  EPS).sum(axis=0)
    Better2= (res2 < -EPS).sum(axis=0)
    
    # Per-method means over simulations; Power is TP / k and is set to 0 when k == 0.
    FP    = mBIC_FP.mean(axis=0)
    TP    = mBIC_TP.mean(axis=0)
    Power = (TP / k) if k > 0 else np.zeros_like(TP)
    
    FP2   = mBIC2_FP.mean(axis=0)
    TP2   = mBIC2_TP.mean(axis=0)
    Power2= (TP2 / k) if k > 0 else np.zeros_like(TP2)
    
    # FDR2 = mean over simulations of FP/(FP+TP): computed per simulation first, then averaged.
    # Entries where the ratio is NaN (0/0, i.e. FP + TP == 0) are counted as 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        fdr2_sim = mBIC2_FP / (mBIC2_FP + mBIC2_TP)
        fdr2_sim = np.where(np.isnan(fdr2_sim), 0.0, fdr2_sim)
    FDR2 = fdr2_sim.mean(axis=0)
    
    # Mean, sample standard deviation (ddof=1) and standard error of the baseline differences.
    mean_mBIC  = res.mean(axis=0)
    mean_mBIC2 = res2.mean(axis=0)
    sd_mBIC    = res.std(axis=0, ddof=1)
    sd_mBIC2   = res2.std(axis=0, ddof=1)
    sem_mBIC   = sd_mBIC  / np.sqrt(sim_nr)
    sem_mBIC2  = sd_mBIC2 / np.sqrt(sim_nr)
    
    Runtime = runtime.mean(axis=0)
    
    # One aggregated row per method
    for l, mname in enumerate(method_names):
        df_rows.append({
            'k': k, 'rho': rho, 'method': mname,
            'Better': int(Better[l]), 'Worse': int(Worse[l]),
            'Power': float(Power[l]), 'FP': float(FP[l]),
            'Better2': int(Better2[l]), 'Worse2': int(Worse2[l]),
            'Power2': float(Power2[l]), 'FDR2': float(FDR2[l]),
            'mean_mBIC': float(mean_mBIC[l]), 'sd_mBIC': float(sd_mBIC[l]), 'sem_mBIC': float(sem_mBIC[l]),
            'mean_mBIC2': float(mean_mBIC2[l]), 'sd_mBIC2': float(sd_mBIC2[l]), 'sem_mBIC2': float(sem_mBIC2[l]),
            'Runtime': float(Runtime[l]),
        })
    
    # Long form (differences relative to the baseline)
    df_mBIC_list.append(as_df_long_diff(res, method_names, instance_offset, scenario_id, 'mBIC'))
    df_mBIC2_list.append(as_df_long_diff(res2, method_names, instance_offset, scenario_id, 'mBIC2'))
    
    # Runtime in long form (raw runtimes, not baseline differences)
    rt_df = pd.DataFrame(runtime, columns=method_names)
    rt_df['instance'] = np.arange(sim_nr) + instance_offset
    rt_df['scenario'] = scenario_id
    runtime_long = rt_df.melt(id_vars=['instance','scenario'], var_name='variable', value_name='Runtime')
    df_Runtime_list.append(runtime_long)
    
    # Advance the running indices for the next block
    instance_offset += sim_nr
    scenario_id += 1

# Assemble the final data sets.
# The aggregate table gets a fixed column schema (same order as the row dicts),
# so that it has its columns even when no result block was loaded, just like
# the three long tables below.
df = pd.DataFrame(df_rows, columns=[
    'k', 'rho', 'method',
    'Better', 'Worse', 'Power', 'FP',
    'Better2', 'Worse2', 'Power2', 'FDR2',
    'mean_mBIC', 'sd_mBIC', 'sem_mBIC',
    'mean_mBIC2', 'sd_mBIC2', 'sem_mBIC2',
    'Runtime',
])
df_mBIC  = pd.concat(df_mBIC_list, ignore_index=True) if df_mBIC_list else pd.DataFrame(columns=['instance','scenario','variable','mBIC'])
df_mBIC2 = pd.concat(df_mBIC2_list, ignore_index=True) if df_mBIC2_list else pd.DataFrame(columns=['instance','scenario','variable','mBIC2'])
df_Runtime = pd.concat(df_Runtime_list, ignore_index=True) if df_Runtime_list else pd.DataFrame(columns=['instance','scenario','variable','Runtime'])

# Factors/ordering analogous to R (method order as encountered in the files)
if not df.empty:
    # Method ordering per scenario may vary; take the union in first-seen order.
    method_order = []
    for block in all_data:
        for m in block['method_names']:
            if m not in method_order:
                method_order.append(m)
    df['method'] = pd.Categorical(df['method'], categories=method_order, ordered=True)

# Align types: instance and scenario are identifiers, so store them as categories.
for col in ['instance','scenario']:
    if col in df_mBIC.columns:
        df_mBIC[col] = df_mBIC[col].astype('category')
    if col in df_mBIC2.columns:
        df_mBIC2[col] = df_mBIC2[col].astype('category')
    if col in df_Runtime.columns:
        df_Runtime[col] = df_Runtime[col].astype('category')

print("Fertig geladen. df shape:", df.shape)
df.head(10)


## Wide‑Tabellen (analog `dcast`)

In [None]:

# Helper for the pivots
def dcast_like(data, value_col, drop_methods=None):
    """Pivot one metric into a wide table with one column per method.

    Parameters:
        data: long aggregate table with the columns 'k', 'rho', 'method'
            and value_col.
        value_col: name of the column whose values fill the table.
        drop_methods: optional collection of method names to leave out.

    Returns:
        A DataFrame with the columns 'k', 'rho' followed by one column per
        remaining method, one row per (k, rho) combination. If there are no
        rows to pivot, an empty table with only 'k' and 'rho' is returned
        instead of raising on missing columns.
    """
    id_cols = ['k', 'rho']
    temp = data
    if drop_methods is not None and 'method' in temp.columns:
        temp = temp[~temp['method'].isin(drop_methods)]
    if temp.empty:
        # Nothing to pivot (no results loaded, or every row was filtered out).
        return pd.DataFrame(columns=id_cols)
    # observed=True: only methods that actually occur become columns, which also
    # keeps dropped categories of a categorical 'method' column out of the grouping.
    pt = temp.pivot_table(index=id_cols, columns='method', values=value_col,
                          aggfunc='first', observed=True)
    if isinstance(pt.columns, pd.CategoricalIndex):
        # A categorical 'method' column yields categorical column labels; some
        # pandas versions refuse to insert 'k'/'rho' into such an index on
        # reset_index, so convert to plain labels first (order is preserved).
        pt.columns = pt.columns.astype(object)
    pt = pt.reset_index()
    # Column order: k, rho, then the methods
    cols = id_cols + [c for c in pt.columns if c not in id_cols]
    return pt[cols]

drop_stepwise = ['stepwise_plain']  # as in the R code

# Comparison counts against the baseline: the baseline's own column is left out.
tab_Better, tab_Better2, tab_Worse, tab_Worse2 = (
    dcast_like(df, metric, drop_methods=drop_stepwise)
    for metric in ('Better', 'Better2', 'Worse', 'Worse2')
)
tabFP, tabFDR2 = (dcast_like(df, metric) for metric in ('FP', 'FDR2'))

# Power tables only for k > 0
df_posk = df[df['k'] > 0] if not df.empty else df
tabPower, tabPower2 = (dcast_like(df_posk, metric) for metric in ('Power', 'Power2'))

# Mean differences, their standard errors, and mean runtimes
tab_m, tabsem, tab_m2, tabsem2, tab_runtime = (
    dcast_like(df, metric)
    for metric in ('mean_mBIC', 'sem_mBIC', 'mean_mBIC2', 'sem_mBIC2', 'Runtime')
)

tab_Better.head()


## (Optional) CSV‑Export

In [None]:

# Export folder inside the results folder; created if it does not exist yet.
OUT_DIR = os.path.join(RESULTS_DIR, "Prepared")
os.makedirs(OUT_DIR, exist_ok=True)

def save_csv(df, name):
    """Write df to '<OUT_DIR>/<name>.csv' without the index and print the path."""
    target = os.path.join(OUT_DIR, "{}.csv".format(name))
    df.to_csv(target, index=False)
    print("Gespeichert:", target)

# Export every prepared table; the second entry of each pair is the CSV file stem.
for _table, _stem in (
    (df, "df_aggregate"),
    (df_mBIC, "df_mBIC_long"),
    (df_mBIC2, "df_mBIC2_long"),
    (df_Runtime, "df_Runtime_long"),
    (tab_Better, "tab_Better"),
    (tab_Better2, "tab_Better2"),
    (tab_Worse, "tab_Worse"),
    (tab_Worse2, "tab_Worse2"),
    (tabFP, "tabFP"),
    (tabFDR2, "tabFDR2"),
    (tabPower, "tabPower"),
    (tabPower2, "tabPower2"),
    (tab_m, "tab_m"),
    (tabsem, "tabsem"),
    (tab_m2, "tab_m2"),
    (tabsem2, "tabsem2"),
    (tab_runtime, "tab_runtime"),
):
    save_csv(_table, _stem)


## Vorschau einiger Tabellen

In [None]:

# Preview: the first 20 aggregate rows, then the first 10 rows of three wide tables.
for _preview in (
    df.head(20),
    tab_Better.head(10),
    tab_m.head(10),
    tab_runtime.head(10),
):
    display(_preview)
