# Profiling `malca.events` on SkyPatrol light curves

Use `cProfile` to time the Bayesian event scorer on real SkyPatrol CSVs stored in `input/skypatrol2`. The notebook is self-contained, so it can run directly against the repo without extra setup.

## Environment setup

Locate the repo root, add it to `sys.path`, and collect the SkyPatrol CSVs we'll profile.

In [None]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Resolve the repo root. The notebook may be started from the repo root itself
# or from a directory one or two levels below it (e.g. notebooks/); the first
# candidate that contains a "malca" entry wins, otherwise fall back to the cwd.
cwd = Path.cwd()
candidates = [cwd, cwd.parent, cwd.parent.parent]
repo_root = cwd
for candidate in candidates:
    if (candidate / "malca").exists():
        repo_root = candidate
        break

# Put the repo root and the malca directory at the front of sys.path.
# The malca directory is inserted last, so it ends up ahead of the repo root.
for import_dir in (repo_root, repo_root / "malca"):
    resolved = str(import_dir.resolve())
    if resolved in sys.path:
        continue
    sys.path.insert(0, resolved)

# Gather the light-curve CSVs in a deterministic (sorted) order.
data_dir = repo_root.joinpath("input", "skypatrol2")
lc_paths = sorted(data_dir.glob("*-light-curves.csv"))

print(f"Repo root: {repo_root}")
print(f"Found {len(lc_paths)} light curves in {data_dir}")

# Every later cell indexes into lc_paths, so stop here if nothing matched.
if not lc_paths:
    raise FileNotFoundError("No SkyPatrol CSVs found; adjust data_dir above.")

## Quick peek at one SkyPatrol light curve

Read a single CSV to confirm the loader works and to see the columns that flow into the scorer.

In [None]:
from malca.plot import read_skypatrol_csv

# Load the first discovered light curve as a smoke test of the CSV reader.
example_path = lc_paths[0]
df_example = read_skypatrol_csv(example_path)
# Report the row count, then show the leading rows so the columns are visible.
print(f"{example_path.name}: {len(df_example)} rows")
# NOTE(review): `display` is not imported in this notebook; presumably the
# IPython/Jupyter builtin, so this line needs a notebook kernel to run.
display(df_example.head())

## Profiling helpers

Wrap `_process_one` so we can reuse the same kwargs as the CLI and capture both `cProfile` output and a tidy summary table of the hottest functions.

In [None]:
import cProfile
import io
import pstats
import time

import malca.events as events

# Keyword arguments that score_single_lc forwards to events._process_one.
# The notebook text says these are meant to match the CLI defaults; verify
# against the CLI if it changes. The groups below only organise the keys by
# name prefix; the merged dict keeps the same keys, values, and order.

# Trigger mode plus the log-BF / significance thresholds.
_TRIGGER_KWARGS = {
    "trigger_mode": "posterior_prob",
    "logbf_threshold_dip": 5.0,
    "logbf_threshold_jump": 5.0,
    "significance_threshold": 99.99997,
}

# p_* settings (None presumably defers to the scorer's own defaults -- TODO confirm).
_P_KWARGS = {
    "p_points": 80,
    "p_min_dip": None,
    "p_max_dip": None,
    "p_min_jump": None,
    "p_max_jump": None,
}

# run_* settings.
_RUN_KWARGS = {
    "run_min_points": 3,
    "run_allow_gap_points": 1,
    "run_max_gap_days": None,
    "run_min_duration_days": None,
    "run_sum_threshold": None,
    "run_sum_multiplier": 2.5,
}

# Baseline tag, sigma_eff switches, and the event-probability switch.
_BASELINE_KWARGS = {
    "baseline_tag": "gp",
    "use_sigma_eff": True,
    "require_sigma_eff": True,
    "compute_event_prob": True,
}

DEFAULT_EVENT_KWARGS = {
    **_TRIGGER_KWARGS,
    **_P_KWARGS,
    **_RUN_KWARGS,
    **_BASELINE_KWARGS,
}

def score_single_lc(path: Path, **overrides):
    """Score one light-curve file with ``events._process_one``.

    Args:
        path: Location of the light-curve CSV; passed on as a string.
        **overrides: Keyword arguments that replace or extend the entries of
            ``DEFAULT_EVENT_KWARGS`` for this call only.

    Returns:
        Whatever ``events._process_one`` returns for the file.
    """
    # Later entries win, so overrides take precedence; the shared defaults
    # dict itself is never mutated.
    merged = {**DEFAULT_EVENT_KWARGS, **overrides}
    return events._process_one(str(path), **merged)

def stats_to_frame(stats_obj, limit=20):
    """Tabulate profiler statistics, most expensive (by cumulative time) first.

    Args:
        stats_obj: Object exposing a ``stats`` mapping whose keys are
            ``(filename, lineno, function_name)`` tuples and whose values are
            5-tuples ``(cc, nc, tt, ct, callers)``, e.g. a ``pstats.Stats``.
        limit: Maximum number of rows to return.

    Returns:
        A DataFrame with columns ``func``, ``ncalls``, ``tottime_s`` and
        ``cumtime_s``, sorted by ``cumtime_s`` descending and truncated to
        ``limit`` rows. An empty DataFrame is returned when there are no
        entries.
    """
    records = [
        {
            # Only the file's base name is kept to keep the label short.
            "func": f"{name} ({Path(filename).name}:{lineno})",
            "ncalls": total_calls,
            "tottime_s": own_time,
            "cumtime_s": cumulative_time,
        }
        for (filename, lineno, name), (
            _primitive_calls,
            total_calls,
            own_time,
            cumulative_time,
            _callers,
        ) in stats_obj.stats.items()
    ]

    table = pd.DataFrame(records)
    if table.empty:
        # No columns exist to sort on, so hand back the empty frame as-is.
        return table

    ranked = table.sort_values("cumtime_s", ascending=False)
    return ranked.head(limit)

def profile_light_curve(path: Path, stats_limit=25, **overrides):
    """Run ``score_single_lc`` on one file under ``cProfile``.

    Args:
        path: Light-curve CSV to score.
        stats_limit: Number of entries kept in both the printed report and
            the summary table.
        **overrides: Passed through to ``score_single_lc``.

    Returns:
        A 4-tuple ``(result, stats_text, top_df, elapsed)``: the scorer's
        return value, the ``pstats`` report as text, the summary DataFrame
        from ``stats_to_frame``, and the wall-clock seconds spent in the
        profiled call (measured with the profiler active).
    """
    prof = cProfile.Profile()
    t_begin = time.perf_counter()
    result = prof.runcall(score_single_lc, path, **overrides)
    elapsed = time.perf_counter() - t_begin

    # Send the pstats report to an in-memory buffer instead of stdout so the
    # caller decides whether and where to print it.
    report = io.StringIO()
    stats_obj = pstats.Stats(prof, stream=report)
    stats_obj.strip_dirs()
    stats_obj.sort_stats("cumtime")
    stats_obj.print_stats(stats_limit)
    stats_text = report.getvalue()

    top_df = stats_to_frame(stats_obj, limit=stats_limit)
    return result, stats_text, top_df, elapsed

## Profile a single light curve

Run the Bayesian scorer on one SkyPatrol CSV (default parameters match the CLI). `stats_df_single` highlights the functions consuming the most cumulative time.

In [None]:
# Profile the first discovered light curve with the default scorer kwargs.
single_path = lc_paths[0]
# profile_light_curve returns (result, report text, summary table, seconds).
result_single, stats_text_single, stats_df_single, elapsed_single = profile_light_curve(single_path, stats_limit=25)

print(f"Profiled {single_path.name} in {elapsed_single:.2f} s")
# Raw pstats report, sorted by cumulative time.
print(stats_text_single)
# Bare expression: as the cell's last statement, the notebook renders the table.
stats_df_single

## Batch timing on a handful of files (optional)

Process a small subset sequentially to gauge throughput without full profiling. Adjust `N_LC` to cover more files if needed.

In [None]:
# Number of light curves to time; raise it to cover more files.
N_LC = 3
subset_paths = lc_paths[:N_LC]

# (output column, key read from the scorer's result) in display order.
_SUMMARY_FIELDS = (
    ("n_points", "n_points"),
    ("dip_sig", "dip_significant"),
    ("jump_sig", "jump_significant"),
    ("dip_bf", "dip_bayes_factor"),
    ("jump_bf", "jump_bayes_factor"),
)

timing_rows = []
for lc_path in subset_paths:
    # Plain wall-clock timing of one scorer call, with no profiler attached.
    tick = time.perf_counter()
    scored = score_single_lc(lc_path)
    row = {"path": lc_path.name, "elapsed_s": time.perf_counter() - tick}
    # .get() leaves a missing result key as None rather than raising.
    for column, key in _SUMMARY_FIELDS:
        row[column] = scored.get(key)
    timing_rows.append(row)

# Bare expression: as the cell's last statement, the notebook renders the table.
pd.DataFrame(timing_rows)