# Real Data Analysis

In [8]:
import os, pickle
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import warnings
from typing import Any, Dict
from collections import Counter
warnings.filterwarnings("ignore")
%matplotlib inline

# Location of the pickled results bundle; adjust the path if needed.
pkl_path = "ResultsRealData/RealData_eQTL.pkl"

# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# files that come from a trusted source.
with open(pkl_path, "rb") as f:
    bundle = pickle.load(f)

# Unpack the bundle. Missing "results"/"runtimes" fall back to empty dicts,
# "method_names" falls back to the key order of `results`, and n / p fall
# back to None.
results = bundle.get("results", {})
runtimes = bundle.get("runtimes", {})
method_names = bundle.get("method_names", list(results.keys()))
n = bundle.get("n"); p = bundle.get("p")

# Short report of what was loaded.
print("Geladen:", pkl_path)
print("Methoden:", method_names)
print("n =", n, " p =", p)

Geladen: ResultsRealData/RealData_eQTL.pkl
Methoden: ['stepwise_plain', 'L0opt_CDPSI', 'L0opt_CD', 'lassonet', 'lassonet_plus', 'deep2stage', 'deep2stage_plus']
n = 210  p = 47292


In [9]:
def _to_idx_0based(a: Any):
    if a is None: return None
    arr = np.asarray(a).astype(int).ravel()
    return arr if arr.size == 0 else (arr - 1)  # 1-basiert -> 0-basiert

def _has(x, name: str) -> bool:
    return (isinstance(x, dict) and name in x) or hasattr(x, name)

def _get(x, name: str):
    if isinstance(x, dict): return x.get(name, None)
    return getattr(x, name, None)

def extract_modelselresult(res) -> Dict[str, Any]:
    """Flatten a model-selection result into a plain dictionary.

    ``res`` may be a dict or an object; the fields ``mBIC``, ``mBIC2``,
    ``model1`` and ``model2`` are read via ``_get``. An empty dict is returned
    when ``res`` is ``None`` or exposes none of those four fields.

    Returns a dict with the criterion values as floats (NaN when missing),
    the two supports converted by ``_to_idx_0based`` and the support sizes
    (``None`` when the corresponding support is missing).
    """
    recognised_fields = ("mBIC", "mBIC2", "model1", "model2")
    if res is None or not any(_has(res, field) for field in recognised_fields):
        return {}

    def _as_float(value):
        # Missing criterion values are reported as NaN.
        return np.nan if value is None else float(value)

    def _size(support):
        # Missing supports have no size.
        return None if support is None else int(len(support))

    crit_1 = _get(res, "mBIC")
    crit_2 = _get(res, "mBIC2")
    support_1 = _to_idx_0based(_get(res, "model1"))
    support_2 = _to_idx_0based(_get(res, "model2"))

    return {
        "mBIC": _as_float(crit_1),
        "mBIC2": _as_float(crit_2),
        "support_mBIC": support_1,
        "support_mBIC2": support_2,
        "k_mBIC": _size(support_1),
        "k_mBIC2": _size(support_2),
    }

# ---- Build the summary records ----
# One record per method, plus the extracted supports keyed by method name.
support_mbic_by_method, support_mbic2_by_method = {}, {}
rows = []

for name in method_names:
    res = results.get(name)
    rt = float(runtimes.get(name, np.nan))   # NaN when no runtime was stored
    ext = extract_modelselresult(res)

    support_mbic_by_method[name] = ext.get("support_mBIC")
    support_mbic2_by_method[name] = ext.get("support_mBIC2")

    record = {"method": name, "runtime (in sec)": rt}
    record.update(
        (field, ext.get(field)) for field in ("k_mBIC", "k_mBIC2", "mBIC", "mBIC2")
    )
    rows.append(record)

# 1) + 2) Build the summary frame and give the size columns readable names
summary_df = pd.DataFrame(rows).rename(columns={
    "k_mBIC":  "model size (mBIC)",
    "k_mBIC2": "model size (mBIC2)",
})

# 3) Column order: method, model sizes, criterion values, runtime last
desired = [
    "method",
    "model size (mBIC)",
    "model size (mBIC2)",
    "mBIC",
    "mBIC2",
    "runtime (in sec)"
]
summary_df = summary_df[[col for col in desired if col in summary_df.columns]]


# 4) Two-decimal rendering for the float columns (display only; missing -> blank)
def _fmt_two_decimals(v):
    """Render ``v`` with two decimals, or as an empty string when it is missing."""
    if pd.isna(v):
        return ""
    return f"{v:.2f}"


display(
    summary_df.style.format(
        {col: _fmt_two_decimals for col in ("mBIC", "mBIC2", "runtime (in sec)")}
    )
)

Unnamed: 0,method,model size (mBIC),model size (mBIC2),mBIC,mBIC2,runtime (in sec)
0,stepwise_plain,4,5,-270.9,-277.79,69.23
1,L0opt_CDPSI,5,5,-265.59,-275.16,63.34
2,L0opt_CD,5,5,-265.59,-275.16,2.21
3,lassonet,3,3,-257.6,-261.19,48.27
4,lassonet_plus,4,4,-269.07,-275.43,41.8
5,deep2stage,0,0,0.0,0.0,29.98
6,deep2stage_plus,4,5,-267.69,-266.59,41.55


In [29]:
# --- Supports aus ModelSelResult holen (model1/model2 sind 1-basiert) ---
def _to0(a):
    a = np.asarray(a, dtype=int).ravel()
    return a-1 if a.size else a

# Supports per method as 0-based index arrays (empty when nothing is available).
supports_mBIC, supports_mBIC2 = {}, {}
for m in method_names:
    res = results.get(m)
    # _get (defined with the summary helpers) reads dict-style and
    # attribute-style results alike; plain getattr silently returned [] for
    # dicts. A missing or None support is mapped to an empty selection.
    model1 = _get(res, "model1")
    model2 = _get(res, "model2")
    supports_mBIC[m]  = _to0([] if model1 is None else model1)
    supports_mBIC2[m] = _to0([] if model2 is None else model2)

# --- Filter methods: deep2stage is always excluded, as is any method that
# --- selected nothing under both criteria ---
drop_always = {"deep2stage",}  # methods to exclude unconditionally
method_names_tbl, dropped = [], []
for m in method_names:
    if m in drop_always or not (supports_mBIC[m].size or supports_mBIC2[m].size):
        dropped.append(m)
    else:
        method_names_tbl.append(m)

# --- Union of the selected features over the remaining methods (sorted list) ---
all_idx = set()
for m in method_names_tbl:
    # Updating with an empty list is a no-op, so empty supports need no guard.
    all_idx.update(supports_mBIC[m].tolist(), supports_mBIC2[m].tolist())
all_idx = sorted(all_idx)

# --- Map feature index -> (variable number, gene name) using the RData file ---
rdata_path = "Data/Sangerdata.Rdata"  # adjust if needed
gene_row   = 24266 - 1                # 0-based row position of the target gene (same as in the fit); adjust if needed

# NOTE(review): mid-notebook import; consider moving it to the import cell at the top.
import pyreadr
r = pyreadr.read_r(rdata_path)
df = r["data"]
total_rows = df.shape[0]
predictor_rows = np.delete(np.arange(total_rows), gene_row)      # every row position except the target gene
var_numbers = (predictor_rows + 1).astype(int)                   # 1-based row numbers (as in the paper)
# First column of df for the predictor rows, as strings; shown as "gene name" in the table below.
gene_names  = df.iloc[predictor_rows, 0].astype(str).values


# --- Build the selection table ---
# Cell coding per (feature, method):
#   0 = not selected
#   1 = selected under mBIC2 only
#   2 = selected under both mBIC and mBIC2
#   3 = selected under mBIC only
# The per-method sets are built once here instead of once per (feature, method).
support_sets = {
    method: (set(supports_mBIC[method].tolist()), set(supports_mBIC2[method].tolist()))
    for method in method_names_tbl
}

rows = []
for j in all_idx:
    row = {"variable": int(var_numbers[j]), "gene name": gene_names[j]}
    for m in method_names_tbl:
        S1, S2 = support_sets[m]
        in_mbic, in_mbic2 = (j in S1), (j in S2)
        if in_mbic and in_mbic2:
            code = 2
        elif in_mbic2:
            code = 1
        elif in_mbic:
            code = 3
        else:
            code = 0
        row[m] = code
    rows.append(row)

# Naming the columns explicitly keeps the frame well-formed (and sortable by
# "variable") even when no feature was selected and `rows` is empty.
table_columns = ["variable", "gene name", *method_names_tbl]
selection_table = (
    pd.DataFrame(rows, columns=table_columns)
    .sort_values("variable")
    .reset_index(drop=True)
)

# Display names for the method columns (only columns that exist are renamed).
rename_map = {
    "stepwise_plain": "stepwise",
    "L0opt_CD": "L0opt_CD",
    "L0opt_CDPSI": "L0opt_CDPSI",
    "lassonet": "LassoNet",
    "lassonet_plus": "LassoNet+",
    "deep2stage_plus": "Deep2Stage+",
}
present_renames = {
    old: new for old, new in rename_map.items() if old in selection_table.columns
}
selection_table = selection_table.rename(columns=present_renames)

# Identifier columns are left-aligned, every other column is centred.
left_cols = [col for col in ["variable", "gene name"] if col in selection_table.columns]
center_cols = [col for col in selection_table.columns if col not in left_cols]

# Left-align both the column headers and the row headers.
header_styles = [
    {"selector": "th.col_heading", "props": [("text-align", "left")]},
    {"selector": "th.row_heading", "props": [("text-align", "left")]},
]

styled_selection = selection_table.style
styled_selection = styled_selection.set_properties(
    subset=left_cols, **{"text-align": "left", "padding-left": "10px"}
)
styled_selection = styled_selection.set_properties(
    subset=center_cols, **{"text-align": "center"}
)
styled_selection = styled_selection.set_table_styles(header_styles, overwrite=False)
display(styled_selection)


Unnamed: 0,variable,gene name,stepwise,L0opt_CDPSI,L0opt_CD,LassoNet,LassoNet+,Deep2Stage+
0,6537,GI_22749104-S,0,2,2,0,0,0
1,7860,GI_25453471-A,2,2,2,2,2,2
2,7943,GI_25777737-S,0,0,0,2,2,0
3,7973,GI_25952136-S,0,0,0,0,0,1
4,16420,GI_37556035-S,2,0,0,0,0,0
5,20054,GI_42558257-S,0,0,0,0,0,3
6,21982,GI_4502890-S,0,0,0,0,0,1
7,30296,Hs.430274-S,0,0,0,0,0,1
8,33116,Hs.473980-S,1,0,0,0,0,0
9,37352,Hs.522669-S,0,2,2,0,0,0


In [22]:
# --- Correlation table: rows = stepwise_plain (mBIC2), columns = L0opt_CD (mBIC2) ---

# 0-based supports from the dicts built in the selection-table cell
step_idx0 = supports_mBIC2["stepwise_plain"]   # row variables
l0_idx0   = supports_mBIC2["L0opt_CD"]         # column variables

# Predictor matrix X (samples x features). Rows of df are picked by position
# via predictor_rows -- the same positions var_numbers/gene_names were built
# from -- instead of by index label (df.drop(index=...)), so the columns of X
# line up with the 0-based support indices whatever index df carries.
X = df.iloc[predictor_rows, 1:].astype(float).values.T
n = X.shape[0]

# Sub-matrices
A = X[:, step_idx0]      # n x r (row method)
B = X[:, l0_idx0]        # n x c (column method)

# Column-wise z-scores (population std, ddof=0); correlation = (ZA^T ZB) / n.
# The 1e-12 avoids a division by zero for a constant column.
ZA = (A - A.mean(axis=0)) / (A.std(axis=0, ddof=0) + 1e-12)
ZB = (B - B.mean(axis=0)) / (B.std(axis=0, ddof=0) + 1e-12)
corr = (ZA.T @ ZB) / float(n)   # r x c

# Labels: 1-based variable numbers from the mapping array
row_labels = [f"{int(var_numbers[j])}" for j in step_idx0]
col_labels = [f"{int(var_numbers[j])}" for j in l0_idx0]

corr_df = pd.DataFrame(corr, index=row_labels, columns=col_labels).round(3)

# Pretty display: more padding + caption as title
display(
    corr_df.style
        .format("{:.3f}")
        .set_caption("Correlation between stepwise and L0opt variables (mBIC2)")
        .set_table_styles([
            {"selector": "caption",
             "props": [("caption-side", "top"),
                       ("font-size", "14px"),
                       ("font-weight", "600"),
                       ("padding-bottom", "10px")]},
            {"selector": "th.col_heading", "props": [("padding", "10px 16px")]},
            {"selector": "th.row_heading", "props": [("padding", "10px 12px")]},
            {"selector": "td",             "props": [("padding", "12px 20px")]}
        ])
        .set_properties(**{"text-align": "center"})
)

Unnamed: 0,6537,7860,37352,37853,42863
42863,0.02,0.342,-0.189,0.218,1.0
7860,-0.247,1.0,-0.11,-0.228,0.342
37853,0.151,-0.228,-0.056,1.0,0.218
16420,-0.255,0.348,0.159,0.15,0.222
33116,-0.101,-0.04,0.068,-0.149,-0.104


In [23]:
# --- Correlation table: rows = stepwise_plain (mBIC2), columns = lassonet_plus (mBIC2) ---

# 0-based supports from the dicts built in the selection-table cell
step_idx0 = supports_mBIC2["stepwise_plain"]   # row variables
l0_idx0   = supports_mBIC2["lassonet_plus"]    # column variables (variable name kept from the first correlation cell)

# Predictor matrix X (samples x features). Rows of df are picked by position
# via predictor_rows -- the same positions var_numbers/gene_names were built
# from -- instead of by index label (df.drop(index=...)), so the columns of X
# line up with the 0-based support indices whatever index df carries.
X = df.iloc[predictor_rows, 1:].astype(float).values.T
n = X.shape[0]

# Sub-matrices
A = X[:, step_idx0]      # n x r (row method)
B = X[:, l0_idx0]        # n x c (column method)

# Column-wise z-scores (population std, ddof=0); correlation = (ZA^T ZB) / n.
# The 1e-12 avoids a division by zero for a constant column.
ZA = (A - A.mean(axis=0)) / (A.std(axis=0, ddof=0) + 1e-12)
ZB = (B - B.mean(axis=0)) / (B.std(axis=0, ddof=0) + 1e-12)
corr = (ZA.T @ ZB) / float(n)   # r x c

# Labels: 1-based variable numbers from the mapping array
row_labels = [f"{int(var_numbers[j])}" for j in step_idx0]
col_labels = [f"{int(var_numbers[j])}" for j in l0_idx0]

corr_df = pd.DataFrame(corr, index=row_labels, columns=col_labels).round(3)

# Pretty display: more padding + caption as title
display(
    corr_df.style
        .format("{:.3f}")
        .set_caption("Correlation between stepwise and LassoNet_plus variables (mBIC2)")
        .set_table_styles([
            {"selector": "caption",
             "props": [("caption-side", "top"),
                       ("font-size", "14px"),
                       ("font-weight", "600"),
                       ("padding-bottom", "10px")]},
            {"selector": "th.col_heading", "props": [("padding", "10px 16px")]},
            {"selector": "th.row_heading", "props": [("padding", "10px 12px")]},
            {"selector": "td",             "props": [("padding", "12px 20px")]}
        ])
        .set_properties(**{"text-align": "center"})
)

Unnamed: 0,7860,7943,42863,44914
42863,0.342,0.409,1.0,0.062
7860,1.0,0.43,0.342,-0.203
37853,-0.228,-0.312,0.218,0.353
16420,0.348,0.275,0.222,-0.314
33116,-0.04,0.012,-0.104,-0.213


In [24]:
# --- Correlation table: rows = stepwise_plain (mBIC2), columns = deep2stage_plus (mBIC2) ---

# 0-based supports from the dicts built in the selection-table cell
step_idx0 = supports_mBIC2["stepwise_plain"]    # row variables
l0_idx0   = supports_mBIC2["deep2stage_plus"]   # column variables (variable name kept from the first correlation cell)

# Predictor matrix X (samples x features). Rows of df are picked by position
# via predictor_rows -- the same positions var_numbers/gene_names were built
# from -- instead of by index label (df.drop(index=...)), so the columns of X
# line up with the 0-based support indices whatever index df carries.
X = df.iloc[predictor_rows, 1:].astype(float).values.T
n = X.shape[0]

# Sub-matrices
A = X[:, step_idx0]      # n x r (row method)
B = X[:, l0_idx0]        # n x c (column method)

# Column-wise z-scores (population std, ddof=0); correlation = (ZA^T ZB) / n.
# The 1e-12 avoids a division by zero for a constant column.
ZA = (A - A.mean(axis=0)) / (A.std(axis=0, ddof=0) + 1e-12)
ZB = (B - B.mean(axis=0)) / (B.std(axis=0, ddof=0) + 1e-12)
corr = (ZA.T @ ZB) / float(n)   # r x c

# Labels: 1-based variable numbers from the mapping array
row_labels = [f"{int(var_numbers[j])}" for j in step_idx0]
col_labels = [f"{int(var_numbers[j])}" for j in l0_idx0]

corr_df = pd.DataFrame(corr, index=row_labels, columns=col_labels).round(3)

# Pretty display: more padding + caption as title
display(
    corr_df.style
        .format("{:.3f}")
        .set_caption("Correlation between stepwise and deep2stage_plus variables (mBIC2)")
        .set_table_styles([
            {"selector": "caption",
             "props": [("caption-side", "top"),
                       ("font-size", "14px"),
                       ("font-weight", "600"),
                       ("padding-bottom", "10px")]},
            {"selector": "th.col_heading", "props": [("padding", "10px 16px")]},
            {"selector": "th.row_heading", "props": [("padding", "10px 12px")]},
            {"selector": "td",             "props": [("padding", "12px 20px")]}
        ])
        .set_properties(**{"text-align": "center"})
)

Unnamed: 0,7860,7973,21982,30296,42863
42863,0.342,-0.134,0.464,-0.116,1.0
7860,1.0,-0.041,0.577,0.054,0.342
37853,-0.228,-0.179,-0.224,-0.271,0.218
16420,0.348,0.178,0.395,0.152,0.222
33116,-0.04,0.166,0.071,0.26,-0.104


In [18]:
# --- Correlation table: rows = L0opt_CD (mBIC2), columns = lassonet_plus (mBIC2) ---

# 0-based supports from the dicts built in the selection-table cell.
# The variable names are kept from the first correlation cell; here they hold
# the L0opt_CD (rows) and lassonet_plus (columns) supports.
step_idx0 = supports_mBIC2["L0opt_CD"]        # row variables
l0_idx0   = supports_mBIC2["lassonet_plus"]   # column variables

# Predictor matrix X (samples x features). Rows of df are picked by position
# via predictor_rows -- the same positions var_numbers/gene_names were built
# from -- instead of by index label (df.drop(index=...)), so the columns of X
# line up with the 0-based support indices whatever index df carries.
X = df.iloc[predictor_rows, 1:].astype(float).values.T
n = X.shape[0]

# Sub-matrices
A = X[:, step_idx0]      # n x r (row method)
B = X[:, l0_idx0]        # n x c (column method)

# Column-wise z-scores (population std, ddof=0); correlation = (ZA^T ZB) / n.
# The 1e-12 avoids a division by zero for a constant column.
ZA = (A - A.mean(axis=0)) / (A.std(axis=0, ddof=0) + 1e-12)
ZB = (B - B.mean(axis=0)) / (B.std(axis=0, ddof=0) + 1e-12)
corr = (ZA.T @ ZB) / float(n)   # r x c

# Labels: 1-based variable numbers from the mapping array
row_labels = [f"{int(var_numbers[j])}" for j in step_idx0]
col_labels = [f"{int(var_numbers[j])}" for j in l0_idx0]

corr_df = pd.DataFrame(corr, index=row_labels, columns=col_labels).round(3)

# Pretty display: more padding + caption as title
display(
    corr_df.style
        .format("{:.3f}")
        .set_caption("Correlation between L0opt and LassoNet_plus variables (mBIC2)")
        .set_table_styles([
            {"selector": "caption",
             "props": [("caption-side", "top"),
                       ("font-size", "14px"),
                       ("font-weight", "600"),
                       ("padding-bottom", "10px")]},
            {"selector": "th.col_heading", "props": [("padding", "10px 16px")]},
            {"selector": "th.row_heading", "props": [("padding", "10px 12px")]},
            {"selector": "td",             "props": [("padding", "12px 20px")]}
        ])
        .set_properties(**{"text-align": "center"})
)

Unnamed: 0,7860,7943,42863,44914
6537,-0.247,-0.157,0.02,0.238
7860,1.0,0.43,0.342,-0.203
37352,-0.11,-0.123,-0.189,-0.346
37853,-0.228,-0.312,0.218,0.353
42863,0.342,0.409,1.0,0.062


In [25]:
# --- Correlation table: rows = L0opt_CD (mBIC2), columns = deep2stage_plus (mBIC2) ---

# 0-based supports from the dicts built in the selection-table cell.
# The variable names are kept from the first correlation cell; here they hold
# the L0opt_CD (rows) and deep2stage_plus (columns) supports.
step_idx0 = supports_mBIC2["L0opt_CD"]          # row variables
l0_idx0   = supports_mBIC2["deep2stage_plus"]   # column variables

# Predictor matrix X (samples x features). Rows of df are picked by position
# via predictor_rows -- the same positions var_numbers/gene_names were built
# from -- instead of by index label (df.drop(index=...)), so the columns of X
# line up with the 0-based support indices whatever index df carries.
X = df.iloc[predictor_rows, 1:].astype(float).values.T
n = X.shape[0]

# Sub-matrices
A = X[:, step_idx0]      # n x r (row method)
B = X[:, l0_idx0]        # n x c (column method)

# Column-wise z-scores (population std, ddof=0); correlation = (ZA^T ZB) / n.
# The 1e-12 avoids a division by zero for a constant column.
ZA = (A - A.mean(axis=0)) / (A.std(axis=0, ddof=0) + 1e-12)
ZB = (B - B.mean(axis=0)) / (B.std(axis=0, ddof=0) + 1e-12)
corr = (ZA.T @ ZB) / float(n)   # r x c

# Labels: 1-based variable numbers from the mapping array
row_labels = [f"{int(var_numbers[j])}" for j in step_idx0]
col_labels = [f"{int(var_numbers[j])}" for j in l0_idx0]

corr_df = pd.DataFrame(corr, index=row_labels, columns=col_labels).round(3)

# Pretty display: more padding + caption as title
display(
    corr_df.style
        .format("{:.3f}")
        .set_caption("Correlation between L0opt and deep2stage_plus variables (mBIC2)")
        .set_table_styles([
            {"selector": "caption",
             "props": [("caption-side", "top"),
                       ("font-size", "14px"),
                       ("font-weight", "600"),
                       ("padding-bottom", "10px")]},
            {"selector": "th.col_heading", "props": [("padding", "10px 16px")]},
            {"selector": "th.row_heading", "props": [("padding", "10px 12px")]},
            {"selector": "td",             "props": [("padding", "12px 20px")]}
        ])
        .set_properties(**{"text-align": "center"})
)

Unnamed: 0,7860,7973,21982,30296,42863
6537,-0.247,-0.074,-0.227,-0.075,0.02
7860,1.0,-0.041,0.577,0.054,0.342
37352,-0.11,0.142,-0.109,0.383,-0.189
37853,-0.228,-0.179,-0.224,-0.271,0.218
42863,0.342,-0.134,0.464,-0.116,1.0


In [26]:
# --- Correlation table: rows = lassonet_plus (mBIC2), columns = deep2stage_plus (mBIC2) ---

# 0-based supports from the dicts built in the selection-table cell.
# The variable names are kept from the first correlation cell; here they hold
# the lassonet_plus (rows) and deep2stage_plus (columns) supports.
step_idx0 = supports_mBIC2["lassonet_plus"]     # row variables
l0_idx0   = supports_mBIC2["deep2stage_plus"]   # column variables

# Predictor matrix X (samples x features). Rows of df are picked by position
# via predictor_rows -- the same positions var_numbers/gene_names were built
# from -- instead of by index label (df.drop(index=...)), so the columns of X
# line up with the 0-based support indices whatever index df carries.
X = df.iloc[predictor_rows, 1:].astype(float).values.T
n = X.shape[0]

# Sub-matrices
A = X[:, step_idx0]      # n x r (row method)
B = X[:, l0_idx0]        # n x c (column method)

# Column-wise z-scores (population std, ddof=0); correlation = (ZA^T ZB) / n.
# The 1e-12 avoids a division by zero for a constant column.
ZA = (A - A.mean(axis=0)) / (A.std(axis=0, ddof=0) + 1e-12)
ZB = (B - B.mean(axis=0)) / (B.std(axis=0, ddof=0) + 1e-12)
corr = (ZA.T @ ZB) / float(n)   # r x c

# Labels: 1-based variable numbers from the mapping array
row_labels = [f"{int(var_numbers[j])}" for j in step_idx0]
col_labels = [f"{int(var_numbers[j])}" for j in l0_idx0]

corr_df = pd.DataFrame(corr, index=row_labels, columns=col_labels).round(3)

# Pretty display: more padding + caption as title
display(
    corr_df.style
        .format("{:.3f}")
        .set_caption("Correlation between LassoNet_plus and deep2stage_plus variables (mBIC2)")
        .set_table_styles([
            {"selector": "caption",
             "props": [("caption-side", "top"),
                       ("font-size", "14px"),
                       ("font-weight", "600"),
                       ("padding-bottom", "10px")]},
            {"selector": "th.col_heading", "props": [("padding", "10px 16px")]},
            {"selector": "th.row_heading", "props": [("padding", "10px 12px")]},
            {"selector": "td",             "props": [("padding", "12px 20px")]}
        ])
        .set_properties(**{"text-align": "center"})
)

Unnamed: 0,7860,7973,21982,30296,42863
7860,1.0,-0.041,0.577,0.054,0.342
7943,0.43,0.018,0.599,-0.015,0.409
42863,0.342,-0.134,0.464,-0.116,1.0
44914,-0.203,-0.206,-0.279,-0.433,0.062
