# Bayesian light-curve excursion output EDA

Inspect `../output/lc_excursions_bayes_results_12.5_13.parquet` (the default value of `DATA_PATH` below), produced by `malca/events.py`. Update `DATA_PATH` if you want to point at another run (e.g., a different magnitude bin).

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

# Notebook-wide defaults: seaborn theme, default figure size, and wider
# pandas table rendering.
sns.set_theme(context="talk", style="ticks")
plt.rcParams.update({"figure.figsize": (10, 6)})
pd.set_option("display.width", 160)
pd.set_option("display.max_columns", 50)


In [None]:
# Results Parquet to inspect. The path is relative, so it is resolved against
# the current working directory of the running kernel.
DATA_PATH = Path("../output/lc_excursions_bayes_results_12.5_13.parquet")

# Fail fast with an actionable message rather than a reader error further down.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Expected Parquet at {DATA_PATH}. Update DATA_PATH to match your run.")


In [None]:

from pathlib import Path


def assert_parquet_magic(path: Path):
    """Raise if *path* does not look like a completely written Parquet file.

    A Parquet file starts and ends with the 4-byte magic ``b"PAR1"``, and the
    trailing magic is preceded by a 4-byte footer length. Anything shorter
    than 12 bytes therefore cannot be complete, even if the bytes that are
    present happen to match the magic at both ends (e.g. a 4-byte ``PAR1``
    stub, where head and tail would be the very same bytes).

    Args:
        path: File to check.

    Raises:
        RuntimeError: If the leading or trailing magic is missing, or the
            file is too short to hold both magics plus the footer length.
    """
    magic = b'PAR1'
    # leading magic + 4-byte footer length + trailing magic
    min_size = 2 * len(magic) + 4
    with path.open('rb') as f:
        head = f.read(len(magic))
        try:
            # whence=2 seeks relative to end-of-file; seek() returns the
            # resulting absolute offset, i.e. the file size here.
            size = f.seek(0, 2)
            if size >= min_size:
                f.seek(-len(magic), 2)
                tail = f.read(len(magic))
            else:
                tail = b''
        except OSError:
            # Unseekable source: treat as "no trailing magic" like before.
            tail = b''
    if head != magic or tail != magic:
        raise RuntimeError(
            "File does not look like a complete Parquet file (missing magic bytes). "
            "It may be truncated or still writing; rerun the generator to regenerate it."
        )


def load_results(path: Path) -> pd.DataFrame:
    """Read the results Parquet at *path* and attach convenience columns.

    The file is first validated with ``assert_parquet_magic``. It is then read
    with ``pd.read_parquet``; if that fails for any reason other than a
    missing engine, the file is re-read through ``pyarrow.parquet.read_table``
    with ``use_pandas_metadata=False`` and a note is printed.

    Columns added to the returned frame:
        source_file: the ``path`` column converted to ``str``.
        target_id: stem of ``source_file`` with ``"-light-curves"`` removed.
        mag_bin: the part of the file stem after ``"results_"`` (the whole
            stem if that marker is absent), cut at ``"_PROCESSED"``.

    Raises:
        ImportError: If pandas cannot import a Parquet engine.
        RuntimeError: If both the pandas read and the pyarrow fallback fail.
    """

    def _target_from(source: str) -> str:
        # File stem minus the light-curve suffix identifies the target.
        return Path(source).stem.replace("-light-curves", "")

    assert_parquet_magic(path)
    try:
        loaded = pd.read_parquet(path)
    except ImportError as exc:
        raise ImportError(
            "Install a Parquet engine (e.g., `pip install pyarrow` or `conda install -c conda-forge pyarrow`)."
        ) from exc
    except Exception as exc:
        # Second attempt that skips the pandas extension metadata stored in
        # the file (avoids ArrowKeyError on some runs).
        try:
            import pyarrow.parquet as pq
            loaded = pq.read_table(path, use_pandas_metadata=False).to_pandas()
            print(f"Loaded via pyarrow fallback (ignored pandas metadata). Original error: {exc}")
        except Exception as exc2:
            raise RuntimeError(f"Failed to read parquet at {path}: {exc}") from exc2

    result = loaded.copy()
    result["source_file"] = result["path"].astype(str)
    result["target_id"] = result["source_file"].map(_target_from)

    # Derive the magnitude-bin label from the file name.
    file_stem = path.stem
    _, marker, remainder = file_stem.partition("results_")
    mag_token = remainder if marker else file_stem
    result["mag_bin"] = mag_token.partition("_PROCESSED")[0]
    return result


# Load the results table once; the analysis cells below all read from `df`.
df = load_results(DATA_PATH)
# Preview the first rows as the cell output.
df.head()


## Overview

In [None]:
# Headline counts for the loaded table: size, distinct targets, the magnitude
# bins present, and how many rows are flagged significant (dip, jump, both).
overview = pd.Series(
    dict(
        rows=len(df),
        unique_targets=df["target_id"].nunique(),
        mag_bins=", ".join(sorted(df["mag_bin"].unique())),
        significant_dips=int(df["dip_significant"].sum()),
        significant_jumps=int(df["jump_significant"].sum()),
        significant_both=int((df["dip_significant"] & df["jump_significant"]).sum()),
    )
)
overview


In [None]:
# Summary statistics for the numeric columns, transposed to one row per column.
numeric_cols = list(
    filter(lambda name: pd.api.types.is_numeric_dtype(df[name]), df.columns)
)
df[numeric_cols].describe().transpose()


## Significance flags

In [None]:
# Label every row with its significance status. np.select takes the first
# matching condition, so the combined "dip & jump" case must be listed first.
sig_status = np.select(
    [
        df["dip_significant"] & df["jump_significant"],
        df["dip_significant"],
        df["jump_significant"],
    ],
    ["dip & jump", "dip only", "jump only"],
    default="none",
)
# Count rows per status and express each count as a fraction of all rows.
sig_counts = pd.Series(sig_status).value_counts().rename_axis("status").reset_index(name="count")
sig_counts["fraction"] = sig_counts["count"] / len(df)

display(sig_counts)

# Passing `palette` without `hue` is deprecated in seaborn 0.13 (removal
# planned for 0.14). Colouring by the x variable gives the same per-bar
# colours; dodge=False keeps full-width bars on older seaborn versions.
ax = sns.barplot(
    data=sig_counts,
    x="status",
    y="count",
    hue="status",
    palette="viridis",
    dodge=False,
)
# The hue legend would only repeat the x tick labels; drop it if one was drawn.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Significance counts")
sns.despine()
plt.show()


## Bayes factor distributions

In [None]:
# Side-by-side histograms of log10(Bayes factor) for dips and jumps.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
for kind, ax in zip(("dip", "jump"), axes):
    col = f"{kind}_bayes_factor"
    # Discard infinite and missing values before taking the logarithm.
    vals = df[col].replace([np.inf, -np.inf], np.nan).dropna()
    # Floor at 1e-8 so zero or negative factors do not break log10.
    log_vals = np.log10(vals.clip(lower=1e-8))
    sns.histplot(log_vals, bins=40, ax=ax)
    ax.set(xlabel=f"log10({col})", title=f"{kind.title()} Bayes factor (log10)")
    # Reference line at log10(BF) = 0, i.e. a Bayes factor of 1.
    ax.axvline(0, color="red", linestyle="--", linewidth=1)
    ax.grid(axis="y", alpha=0.3)
sns.despine()
plt.tight_layout()
plt.show()


## Run-level diagnostics

In [None]:
# One scatter panel per excursion kind: max run sum against max local log BF,
# coloured by the kind's significance flag.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5), sharex=False, sharey=False)
pairs = [
    (f"{kind}_max_run_sum", f"{kind}_max_log_bf_local", f"{kind}_significant", kind.title())
    for kind in ("dip", "jump")
]

for (xcol, ycol, flag, label), ax in zip(pairs, axes):
    sns.scatterplot(
        x=xcol,
        y=ycol,
        hue=flag,
        data=df,
        ax=ax,
        alpha=0.6,
        palette={True: "#1b9e77", False: "#d95f02"},
    )
    ax.set(title=f"{label}: run summary vs. per-point log BF")
    ax.legend(title="significant")
    ax.grid(alpha=0.3)
sns.despine()
plt.tight_layout()
plt.show()


## Morphology breakdown

In [None]:
# Reshape to long form: one row per (target, kind) with that kind's best-fit
# morphology, so dips and jumps can be counted on a shared axis.
morph = pd.melt(
    df[["target_id", "dip_best_morph", "jump_best_morph"]].rename(
        columns={"dip_best_morph": "dip", "jump_best_morph": "jump"}
    ),
    id_vars="target_id",
    var_name="kind",
    value_name="morphology",
)
# Most frequent morphology first.
order = morph["morphology"].value_counts().index

sns.countplot(y="morphology", hue="kind", data=morph, order=order)
plt.title("Best-fit morphology counts")
sns.despine()
plt.show()


## Top candidates

In [None]:
def top_candidates(df: pd.DataFrame, kind: str, n: int = 15):
    """Return the *n* significant rows with the largest ``{kind}_bayes_factor``.

    Args:
        df: Results table; must contain ``{kind}_significant`` and
            ``{kind}_bayes_factor``.
        kind: Column prefix to rank by (the notebook uses "dip" and "jump").
        n: Maximum number of rows to return.

    Returns:
        A frame sorted by descending Bayes factor, restricted to the
        identifying and ``{kind}_*`` diagnostic columns that exist in *df*.
    """
    suffixes = (
        "bayes_factor",
        "max_run_sum",
        "max_log_bf_local",
        "max_run_duration",
        "max_run_points",
        "best_morph",
        "best_delta_bic",
        "best_width_param",
    )
    wanted = ["target_id", "path"] + [f"{kind}_{suffix}" for suffix in suffixes]
    # Not every run writes every diagnostic column; keep only those present.
    present = [name for name in wanted if name in df.columns]

    significant = df[df[f"{kind}_significant"]]
    ranked = significant.sort_values(by=f"{kind}_bayes_factor", ascending=False)
    return ranked[present].head(n)

# Strongest significant candidates of each kind, ranked by Bayes factor.
top_dips, top_jumps = (top_candidates(df, kind) for kind in ("dip", "jump"))

display(top_dips)
display(top_jumps)
