In [2]:
#cell 1

DL2_DIR = r"../dl2_gamma"
PATTERNS = ["gamma_*.h5", "proton_*.h5"]  

import os, glob, h5py, textwrap, inspect
from pathlib import Path

import numpy as np
import pandas as pd

import astropy.units as u
from astropy.table import Table

from ctapipe.io import read_table

# Optional: matplotlib quick plots
import matplotlib.pyplot as plt


In [37]:
#cell 2
files = []
for pat in PATTERNS:
    files.extend(sorted(glob.glob(str(Path(DL2_DIR) / pat))))
print(f"Found {len(files)} files")
for f in files:
    print(" -", Path(f).name)


Found 12 files
 - gamma_200_300E3GeV_20_20deg_testing_dl1_dl2.h5
 - gamma_200_300E3GeV_30_30deg_testing_dl1_dl2.h5
 - gamma_200_300E3GeV_40_40deg_testing_dl1_dl2.h5
 - gamma_200_300E3GeV_60_60deg_testing_dl1_dl2.h5
 - gamma_point_50_300E3GeV_20_20deg_testing_dl1_dl2.h5
 - gamma_point_50_300E3GeV_30_30deg_testing_dl1_dl2.h5
 - gamma_point_50_300E3GeV_40_40deg_testing_dl1_dl2.h5
 - gamma_point_50_300E3GeV_60_60deg_testing_dl1_dl2.h5
 - proton_400_500E3GeV_20_20deg_testing_dl1_dl2.h5
 - proton_400_500E3GeV_30_30deg_testing_dl1_dl2.h5
 - proton_400_500E3GeV_40_40deg_testing_dl1_dl2.h5
 - proton_400_500E3GeV_60_60deg_testing_dl1_dl2.h5


In [None]:
#cell 3
# List available DL2 parameter tables in the FIRST matching DL2 file, using your existing DL2_DIR and PATTERNS variables

import glob, h5py
from pathlib import Path

# Use existing variables; fall back gracefully if PATTERNS not defined
try:
    patterns = PATTERNS
except NameError:
    patterns = ["*.h5"]

files = []
for pat in patterns:
    files.extend(sorted(glob.glob(str(Path(DL2_DIR) / pat))))

assert files, f"No DL2 files found in {DL2_DIR} with patterns {patterns}"

first_file = files[0]
print("Inspecting file:", Path(first_file).name)

def list_parameter_tables(h5file):
    out = []
    with h5py.File(h5file, "r") as h5:
        base = "/dl2/event/telescope/parameters"
        if base not in h5:
            print("Not found:", base)
            return out
        for name in h5[base].keys():
            out.append(f"{base}/{name}")
    return out

candidates = list_parameter_tables(first_file)
print("Found parameter tables:")
for p in candidates:
    print(" -", p)


Inspecting file: gamma_200_300E3GeV_20_20deg_testing_dl1_dl2.h5
Found parameter tables:
 - /dl2/event/telescope/parameters/stereo


In [40]:
#cell 4
from ctapipe.io import read_table
import numpy as np
import astropy.units as u

table_path = "/dl2/event/telescope/parameters/stereo"
tab = read_table(first_file, table_path)

print(f"Rows: {len(tab)} | Columns: {len(tab.colnames)}")
print("First 30 columns:", tab.colnames[:30])

expected = ["true_energy","reco_energy","gammaness","true_alt","true_az","reco_alt","reco_az"]
missing = [c for c in expected if c not in tab.colnames]
print("Missing expected columns:", missing)
if missing:
    print("⚠️ WARNING: Some expected physics columns are missing!")

def to_value_tev(col):
    try:
        return np.asarray(col.to_value(u.TeV), dtype=float)
    except Exception:
        pass
    try:
        return np.asarray(col.quantity.to_value(u.TeV), dtype=float)
    except Exception:
        pass
    return np.asarray(col, dtype=float)


# ----------------------------------------
# Resolve true/reco energy column names
# ----------------------------------------

true_name = "true_energy" if "true_energy" in tab.colnames else None
reco_name = "reco_energy" if "reco_energy" in tab.colnames else None

true_used_fallback = False
reco_used_fallback = False

# true-energy fallbacks
for alt in ["mc_energy", "sim_energy_true"]:
    if true_name is None and alt in tab.colnames:
        true_name = alt
        true_used_fallback = True

# reco-energy fallbacks
for alt in ["energy", "reco_energy_mean"]:
    if reco_name is None and alt in tab.colnames:
        reco_name = alt
        reco_used_fallback = True

# ----------------------------------------
# Print selected columns and warnings
# ----------------------------------------

if true_name and reco_name:
    if true_used_fallback:
        print(f"⚠️ WARNING: true_energy fallback used -> '{true_name}'")
    if reco_used_fallback:
        print(f"⚠️ WARNING: reco_energy fallback used -> '{reco_name}'")

    print(f"Energy columns used: true='{true_name}', reco='{reco_name}'")

    te = to_value_tev(tab[true_name])
    re = to_value_tev(tab[reco_name])

    print("true_energy range [TeV]:", float(np.nanmin(te)), "→", float(np.nanmax(te)))
    print("reco_energy range [TeV]:", float(np.nanmin(re)), "→", float(np.nanmax(re)))

    # ----------------------------------------
    # Unit sanity checks
    # ----------------------------------------

    if np.nanmax(te) > 500:
        print("⚠️ WARNING: true_energy appears too large → likely in GeV, not TeV.")

    if np.nanmax(re) > 500:
        print("⚠️ WARNING: reco_energy appears too large → likely in GeV, not TeV.")

    if np.nanmin(te) < 1e-5:
        print("⚠️ WARNING: true_energy minimum is extremely low → check units.")

    if np.nanmin(re) < 1e-5:
        print("⚠️ WARNING: reco_energy minimum is extremely low → check units.")

    # ----------------------------------------
    # Correlation test (important physics sanity)
    # ----------------------------------------

    mask = np.isfinite(te) & np.isfinite(re)
    if np.sum(mask) > 100:
        corr = np.corrcoef(te[mask], re[mask])[0, 1]
        print("Energy correlation (true vs reco):", corr)

        if corr < 0.4:
            print("⚠️ WARNING: Very low energy correlation → reconstruction may be broken.")
        elif corr > 0.98:
            print("⚠️ WARNING: Energy correlation too high → reco may be copying true_energy.")
    else:
        print("⚠️ WARNING: Not enough valid energy rows for correlation test!")

else:
    print("❌ ERROR: Could not resolve energy column names.")


Rows: 393483 | Columns: 35
First 30 columns: ['obs_id', 'event_id', 'true_az_tel', 'true_alt_tel', 'HillasReconstructor_core_x', 'HillasReconstructor_core_y', 'HillasReconstructor_h_max', 'true_az', 'true_alt', 'true_energy', 'log_true_energy', 'true_core_x', 'true_core_y', 'true_h_first_int', 'true_x_max', 'true_shower_primary_id', 'true_camera_x', 'true_camera_y', 'min_true_energy_cut', 'log_reco_energy', 'reco_energy', 'gammaness', 'camera_frame_hillas_intensity_tel1', 'camera_frame_hillas_width_tel1', 'camera_frame_hillas_length_tel1', 'leakage_intensity_width_2_tel1', 'camera_frame_hillas_intensity_tel2', 'camera_frame_hillas_width_tel2', 'camera_frame_hillas_length_tel2', 'leakage_intensity_width_2_tel2']
Missing expected columns: []
Energy columns used: true='true_energy', reco='reco_energy'
true_energy range [TeV]: 0.3180175721645355 → 299.97344970703125
reco_energy range [TeV]: 0.6018515435939218 → 279.7540775007091
Energy correlation (true vs reco): 0.9622999275398846


In [42]:
#cell 5
import numpy as np
import astropy.units as u

needed = ["true_alt", "true_az", "reco_alt", "reco_az", "gammaness"]
present = {c: (c in tab.colnames) for c in needed}

print("Column presence:", present)
missing = [c for c in needed if not present[c]]
if missing:
    print(f"⚠️ WARNING: Missing columns: {missing}")


# --------------------------
# Helpers
# --------------------------

def to_value_rad(col):
    """
    Convert angle column to radians.

    - If it's an astropy Quantity → convert to rad.
    - If it's a plain float array:
        * If max(abs(value)) > 10 → assume degrees and convert to rad.
        * Else → assume already in radians.
    """
    # Case 1: proper astropy Quantity
    try:
        return np.asarray(col.to_value(u.rad), dtype=float)
    except Exception:
        pass

    try:
        return np.asarray(col.quantity.to_value(u.rad), dtype=float)
    except Exception:
        pass

    # Case 2: plain floats, no units
    arr = np.asarray(col, dtype=float)
    if arr.size == 0:
        return arr

    max_abs = np.nanmax(np.abs(arr))
    if max_abs > 10.0:
        # Very likely degrees (0–90 or 0–360)
        return np.deg2rad(arr)

    # Otherwise, assume already radians
    return arr



def theta_from_altaz(true_alt, true_az, reco_alt, reco_az):
    talt = to_value_rad(true_alt)
    taz  = to_value_rad(true_az)
    ralt = to_value_rad(reco_alt)
    raz  = to_value_rad(reco_az)

    cos_th = np.sin(talt)*np.sin(ralt) + np.cos(talt)*np.cos(ralt)*np.cos(taz - raz)
    cos_th = np.clip(cos_th, -1.0, 1.0)
    return np.arccos(cos_th)


# --------------------------
# Compute theta
# --------------------------

if all(present[c] for c in ["true_alt","true_az","reco_alt","reco_az"]):

    theta_deg = np.rad2deg(theta_from_altaz(
        tab["true_alt"], tab["true_az"],
        tab["reco_alt"], tab["reco_az"]
    ))

    p50, p90 = np.nanpercentile(theta_deg, [50, 90])
    print(f"theta_deg p50={p50:.3f}, p90={p90:.3f}")

    # ---- Physics sanity warnings ----

    if p90 > 5:
        print("⚠️ WARNING: θ90 > 5° → reconstruction likely broken or wrong units.")

    if p50 < 0.005 and p90 < 0.05:
        print("⚠️ WARNING: θ values extremely small → reco_alt/az may equal true_alt/az (bug).")

    if np.nanmax(theta_deg) > 20:
        print("⚠️ WARNING: Some theta values exceed 20° → unrealistic for CTA DL2.")

    if np.nanmedian(np.abs(theta_deg)) > 2:
        print("⚠️ WARNING: Median angular error > 2° → reconstruction very poor.")

else:
    print("Theta cannot be computed — missing angle columns.")


# --------------------------
# Gammaness summary
# --------------------------

if present["gammaness"]:

    gh = np.asarray(tab["gammaness"], dtype=float)
    percs = {p: float(np.nanpercentile(gh, p)) for p in (50, 80, 90, 95, 99)}
    print("gammaness percentiles:", percs)

    # ---- Physics sanity warnings ----

    if np.nanmin(gh) < 0 or np.nanmax(gh) > 1:
        print("⚠️ WARNING: gammaness outside [0,1] → model output not normalized properly.")

    if percs[80] < 0.6:
        print("⚠️ WARNING: Gammaness too low at 80th percentile → classifier likely ineffective.")

    if percs[50] > 0.8:
        print("⚠️ WARNING: Gammaness distribution too high → possible bug (all events classified as gamma).")

else:
    print("No 'gammaness' column to summarize.")


Column presence: {'true_alt': True, 'true_az': True, 'reco_alt': True, 'reco_az': True, 'gammaness': True}
theta_deg p50=0.116, p90=0.401
gammaness percentiles: {50: 0.7627079732898246, 80: 0.8948992435384067, 90: 0.9347678921632108, 95: 0.9565131945032468, 99: 0.9802342503879063}


In [26]:
print([c for c in tab.colnames if "reco" in c.lower() or "hillas" in c.lower() or "reconstruct" in c.lower()])


['HillasReconstructor_core_x', 'HillasReconstructor_core_y', 'HillasReconstructor_h_max', 'log_reco_energy', 'reco_energy', 'camera_frame_hillas_intensity_tel1', 'camera_frame_hillas_width_tel1', 'camera_frame_hillas_length_tel1', 'camera_frame_hillas_intensity_tel2', 'camera_frame_hillas_width_tel2', 'camera_frame_hillas_length_tel2', 'reco_alt', 'reco_az', 'reco_ra', 'reco_dec']


In [43]:
# CELL 6 — Read simulation metadata and verify integrity

from ctapipe.io import read_table
import numpy as np
from pathlib import Path

def safe_read(path):
    """
    Safely attempt to read a table from the HDF5 file.
    Returns None if the node does not exist.
    """
    try:
        return read_table(first_file, path)
    except Exception:
        return None

print("Reading metadata from:", Path(first_file).name)

# ---------------------------------------------------------------
# 1) /simulation/service/shower_distribution 
#    This exists in sst1mpipe DL2 files and contains:
#      - simulated energy bin edges
#      - simulated core-distance bin edges
#      - histogram weights (NOT number of MC events)
# ---------------------------------------------------------------

sd = safe_read("/simulation/service/shower_distribution")
print("Has /simulation/service/shower_distribution:", sd is not None)

if sd is not None:
    print("\nColumns:", sd.colnames)

    # ---- MC energy binning (edges in TeV) ----
    if "bins_energy" in sd.colnames:
        eb = np.asarray(sd["bins_energy"][0], dtype=float)
        print("Energy bin edges (TeV): min =", np.min(eb), ", max =", np.max(eb))
    else:
        print("⚠ WARNING: 'bins_energy' missing")

    # ---- Core-distance binning ----
    if "bins_core_dist" in sd.colnames:
        cb = np.asarray(sd["bins_core_dist"][0], dtype=float)
        print("Max simulated core distance (m):", np.max(cb))
    else:
        print("⚠ WARNING: 'bins_core_dist' missing")

    # ---- Histogram counts ----
    if "n_entries" in sd.colnames:
        ne = np.asarray(sd["n_entries"])
        print("Histogram entries shape:", ne.shape)
        print("NOTE: sum(n_entries) =", int(np.sum(ne)),
              "→ NOT number of simulated showers!")
        print("      These are HISTOGRAM weights, not MC statistics.")
    else:
        print("⚠ WARNING: 'n_entries' missing")

    # ---- Viewcone ----
    if "viewcone" in sd.colnames:
        vc = float(sd["viewcone"][0])
        print("Viewcone (deg):", vc)
    else:
        print("No 'viewcone' stored → treated as ON-axis point-source MC.")


# ---------------------------------------------------------------
# 2) Check /simulation/run_config and /simulation/config
#    These are OPTIONAL in sst1mpipe and often missing.
#    If they exist, they may contain:
#      - spectral index
#      - true pointing info
#      - viewcone_min/max
# ---------------------------------------------------------------

print("\nChecking run_config and config tables:")

for alt in ("/simulation/run_config", "/simulation/config"):
    t = safe_read(alt)
    print(f"{alt} present:", t is not None)

    if t is not None:
        print("Columns:", t.colnames[:20])

        try:
            print("First row:")
            print(t[:1])
        except Exception:
            print("(Table does not support slicing)")

        print("NOTE: These tables do NOT contain n_showers or anything usable for MC counts.")


# ---------------------------------------------------------------
# Summary
# ---------------------------------------------------------------
print("\n=== SUMMARY ===")

if sd is None:
    print("❌ No shower_distribution metadata found → cannot build IRFs.")
else:
    print("✔ shower_distribution metadata present (energy/core bins OK).")

print("✔ Number of simulated events will be determined from DL2 rows (len(tab)).")
print("✔ Using spectral_index = -2.0 (not stored in metadata).")
print("✔ Using viewcone = 0 deg (point-source).")


Reading metadata from: gamma_200_300E3GeV_20_20deg_testing_dl1_dl2.h5
Has /simulation/service/shower_distribution: True

Columns: ['obs_id', 'hist_id', 'n_entries', 'bins_energy', 'bins_core_dist', 'histogram']
Energy bin edges (TeV): min = 0.001 , max = 1000.0
Max simulated core distance (m): 1760.0
Histogram entries shape: (7497,)
NOTE: sum(n_entries) = 349860000 → NOT number of simulated showers!
      These are HISTOGRAM weights, not MC statistics.
No 'viewcone' stored → treated as ON-axis point-source MC.

Checking run_config and config tables:
/simulation/run_config present: False
/simulation/config present: False

=== SUMMARY ===
✔ shower_distribution metadata present (energy/core bins OK).
✔ Number of simulated events will be determined from DL2 rows (len(tab)).
✔ Using spectral_index = -2.0 (not stored in metadata).
✔ Using viewcone = 0 deg (point-source).


In [64]:
# CELL 7 — Build SimulatedEventsInfo using proper MC statistics

import numpy as np
import astropy.units as u
import inspect
from pyirf.simulations import SimulatedEventsInfo

def build_simulated_events_info(sd, tab):
    """
    Construct a SimulatedEventsInfo object using SST1M metadata:

      - n_showers  from sum(sd['n_entries'])  (true MC statistics)
      - energy_min/max from sd['bins_energy']
      - max_impact from sd['bins_core_dist']
      - spectral_index = -2.0 (production spectrum, not stored explicitly)
      - viewcone = 0 deg (treated as point-source MC; diffuse angle not stored)

    This matches the way ctapipe / pyirf expect simulation_info to be filled.
    """

    # ------------------------------------
    # 1) Check metadata presence
    # ------------------------------------
    if sd is None:
        raise RuntimeError(
            "ERROR: No /simulation/service/shower_distribution table. "
            "Cannot build SimulatedEventsInfo."
        )

    # ------------------------------------
    # 2) Number of simulated showers
    #    -> use histogram statistics, NOT len(DL2)
    # ------------------------------------
    if "n_entries" not in sd.colnames:
        raise RuntimeError(
            "ERROR: 'n_entries' missing in shower_distribution — cannot get MC statistics."
        )

    n_showers_hist = int(np.nansum(np.asarray(sd["n_entries"])))
    n_dl2 = len(tab)

    print(f"✔ n_showers (from shower_distribution) = {n_showers_hist}")
    print(f"  DL2 events (triggered/kept)          = {n_dl2}")
    if n_dl2 > 0:
        frac = n_dl2 / n_showers_hist
        print(f"  Triggered fraction ≈ {frac:.3e}")
        if frac > 0.5:
            print("⚠ WARNING: Triggered fraction > 50% — check that n_entries really is MC stats.")
    n_showers = n_showers_hist

    # ------------------------------------
    # 3) Energy range (TeV)
    # ------------------------------------
    if "bins_energy" not in sd.colnames:
        raise RuntimeError("'bins_energy' missing in shower_distribution")

    eb = np.asarray(sd["bins_energy"][0], dtype=float)
    energy_min = np.min(eb) * u.TeV
    energy_max = np.max(eb) * u.TeV
    print("✔ Energy range (MC):", energy_min, "→", energy_max)

    # ------------------------------------
    # 4) Max impact parameter (m)
    # ------------------------------------
    if "bins_core_dist" not in sd.colnames:
        raise RuntimeError("'bins_core_dist' missing in shower_distribution")

    cb = np.asarray(sd["bins_core_dist"][0], dtype=float)
    max_impact = np.max(cb) * u.m
    print("✔ max_impact =", max_impact)

    # quick geometric sanity check
    a_geom = np.pi * max_impact**2
    print(f"  Geometric area π·R_max² ≈ {a_geom.to_value(u.m**2):.3e} m²")

    # ------------------------------------
    # 5) Spectral index
    #    Not stored in these files → use -2.0
    # ------------------------------------
    spectral_index = -2.0
    print("✔ spectral_index = -2.0 (default production spectrum)")

    # ------------------------------------
    # 6) Viewcone
    #    Not stored explicitly; treat as point-source
    # ------------------------------------
    viewcone_min = 0.0 * u.deg
    viewcone_max = 0.0 * u.deg
    print("✔ viewcone = 0 deg (treated as point-source MC)")

    # ------------------------------------
    # 7) Build SimulatedEventsInfo (handle both pyirf signatures)
    # ------------------------------------
    sig = inspect.signature(SimulatedEventsInfo).parameters
    params = set(sig.keys())

    kwargs = dict(
        n_showers=n_showers,
        energy_min=energy_min,
        energy_max=energy_max,
        max_impact=max_impact,
        spectral_index=spectral_index,
    )

    if {"viewcone_min", "viewcone_max"} <= params:
        kwargs.update(viewcone_min=viewcone_min, viewcone_max=viewcone_max)
    elif "viewcone" in params:
        kwargs.update(viewcone=viewcone_max)
    else:
        raise RuntimeError(f"Unexpected SimulatedEventsInfo signature: {params}")

    print("✔ SimulatedEventsInfo kwargs:", kwargs)

    return SimulatedEventsInfo(**kwargs)


# --------------------------------------------------------------------
# BUILD THE OBJECT (this runs with sd and tab loaded in previous cells)
# --------------------------------------------------------------------
sim_info = build_simulated_events_info(sd, tab)

print("\nFinal SimulatedEventsInfo:")
print(sim_info)


✔ n_showers (from shower_distribution) = 349860000
  DL2 events (triggered/kept)          = 393483
  Triggered fraction ≈ 1.125e-03
✔ Energy range (MC): 0.001 TeV → 1000.0 TeV
✔ max_impact = 1760.0 m
  Geometric area π·R_max² ≈ 9.731e+06 m²
✔ spectral_index = -2.0 (default production spectrum)
✔ viewcone = 0 deg (treated as point-source MC)
✔ SimulatedEventsInfo kwargs: {'n_showers': 349860000, 'energy_min': <Quantity 0.001 TeV>, 'energy_max': <Quantity 1000. TeV>, 'max_impact': <Quantity 1760. m>, 'spectral_index': -2.0, 'viewcone_min': <Quantity 0. deg>, 'viewcone_max': <Quantity 0. deg>}

Final SimulatedEventsInfo:
SimulatedEventsInfo(n_showers=349860000, energy_min=0.001 TeV, energy_max=1000.00 TeV, spectral_index=-2.0, max_impact=1760.00 m, viewcone_min=0.0 degviewcone_max=0.0 deg)


In [65]:
# CELL 8 — Create true-energy bins (true_bins) for IRF construction

import numpy as np
import astropy.units as u
from pyirf.binning import create_bins_per_decade

# ---------------------------------------------
# Preconditions: ensure required objects exist
# ---------------------------------------------
if "tab" not in globals():
    raise RuntimeError("DL2 table 'tab' not loaded — please run earlier cells.")
if "sd" not in globals():
    raise RuntimeError("MC metadata 'sd' missing — please run metadata cell first.")

# ---------------------------------------------
# Helper: convert energy to TeV
# ---------------------------------------------
def to_value_tev(col):
    try:
        return np.asarray(col.to_value(u.TeV), dtype=float)
    except Exception:
        pass
    try:
        return np.asarray(col.quantity.to_value(u.TeV), dtype=float)
    except Exception:
        pass
    return np.asarray(col, dtype=float)

# ---------------------------------------------
# 1) Extract true energy values
# ---------------------------------------------
e_true = to_value_tev(tab["true_energy"])
print(f"Loaded {len(e_true)} true-energy values from DL2.")

# ---------------------------------------------
# 2) Retrieve MC energy bin edges
# ---------------------------------------------
eb = np.asarray(sd["bins_energy"][0], dtype=float)
mc_emin = np.min(eb)
mc_emax = np.max(eb)
print(f"MC energy bin edges: {mc_emin:.3f} TeV → {mc_emax:.3f} TeV")

# ---------------------------------------------
# 3) Determine safe binning range
# ---------------------------------------------
dl2_emin = float(np.nanpercentile(e_true, 0.5))
dl2_emax = float(np.nanpercentile(e_true, 99.5))

emin = max(mc_emin, dl2_emin)
emax = min(mc_emax, dl2_emax)

# Clamp to avoid extremes
emin = max(emin, 0.01)     # minimum 10 GeV
emax = min(emax, 300.0)    # maximum 300 TeV (based on DL2)
print(f"Effective energy binning range: {emin:.3f} TeV → {emax:.3f} TeV")

# ---------------------------------------------
# 4) Create bins per decade
# ---------------------------------------------
true_bins = create_bins_per_decade(
    emin * u.TeV,
    emax * u.TeV,
    bins_per_decade=10   # ~0.1 dex resolution
)

print(f"✔ Created {len(true_bins)-1} true-energy bins")
print("First 6 bin-edges (TeV):", np.round(true_bins.to_value(u.TeV)[:6], 4))
print("Last 6 bin-edges  (TeV):", np.round(true_bins.to_value(u.TeV)[-6:], 4))


Loaded 393483 true-energy values from DL2.
MC energy bin edges: 0.001 TeV → 1000.000 TeV
Effective energy binning range: 0.878 TeV → 283.491 TeV
✔ Created 25 true-energy bins
First 6 bin-edges (TeV): [0.8784 1.1058 1.3921 1.7526 2.2064 2.7777]
Last 6 bin-edges  (TeV): [ 87.8386 110.5822 139.2148 175.261  220.6406 277.77  ]


In [66]:
# CELL 9 — Compute per-energy cuts on gammaness and theta

import numpy as np
import inspect
import astropy.units as u
from pyirf.cuts import calculate_percentile_cut, evaluate_binned_cut

# ------------------------------------------------------------
# Preconditions
# ------------------------------------------------------------

if "true_bins" not in globals():
    raise RuntimeError("true_bins not defined — run Cell 8 first.")

if "tab" not in globals():
    raise RuntimeError("DL2 table 'tab' missing — run earlier cells.")

# ------------------------------------------------------------
# Extract DL2 columns
# ------------------------------------------------------------

e_true = to_value_tev(tab["true_energy"])
gh     = np.asarray(tab["gammaness"], dtype=float)

theta  = np.rad2deg(
            theta_from_altaz(
                tab["true_alt"], tab["true_az"],
                tab["reco_alt"], tab["reco_az"]
            )
         )

edges = true_bins.to_value(u.TeV)

print(f"Loaded {len(e_true)} events for cut computation.")

# ------------------------------------------------------------
# Target efficiencies
# ------------------------------------------------------------

gh_eff = 0.70   # keep upper 70% gammaness (stronger gamma/hadron separation)
th_eff = 0.70   # keep lower 70% theta (angular cut)

print(f"Using efficiencies: gammaness={gh_eff}, theta={th_eff}")

# ------------------------------------------------------------
# Helper: robust percentile cut for all pyirf versions
# ------------------------------------------------------------

def safe_percentile_cut(values, bin_values, bins, eff_keep, keep_upper=False):
    """
    robust wrapper around calculate_percentile_cut
    supports old/new pyirf API and includes fallback.
    """
    sig = inspect.signature(calculate_percentile_cut)
    target = (1 - eff_keep) if keep_upper else eff_keep

    try:
        # Newer pyirf API: uses "efficiency"
        if "efficiency" in sig.parameters:
            return calculate_percentile_cut(
                values=np.asarray(values),
                bins=np.asarray(bins),
                bin_values=np.asarray(bin_values),
                efficiency=target
            )
        # Older pyirf API: uses "percentile"
        else:
            return calculate_percentile_cut(
                values=np.asarray(values),
                bins=np.asarray(bins),
                bin_values=np.asarray(bin_values),
                percentile=target * 100.0
            )

    except Exception:
        # Fallback manual implementation
        print("⚠ WARNING: calculate_percentile_cut failed — using manual fallback.")
        bins = np.asarray(bins)
        cut = np.full(len(bins) - 1, np.nan)

        for i in range(len(bins) - 1):
            m = (bin_values >= bins[i]) & (bin_values < bins[i+1])
            if np.any(m):
                pct = (1-eff_keep)*100 if keep_upper else eff_keep*100
                cut[i] = np.percentile(values[m], pct)

        return cut

# ------------------------------------------------------------
# Compute gammaness and theta cuts
# ------------------------------------------------------------

gh_cut = safe_percentile_cut(
    values=gh,
    bin_values=e_true,
    bins=edges,
    eff_keep=gh_eff,
    keep_upper=True     # keep tail above threshold
)

th_cut = safe_percentile_cut(
    values=theta,
    bin_values=e_true,
    bins=edges,
    eff_keep=th_eff,
    keep_upper=False    # keep values below threshold
)

print("Median gh cut:", float(np.nanmedian(gh_cut)))
print("Median theta cut:", float(np.nanmedian(th_cut)), "deg")

# ------------------------------------------------------------
# Apply the cuts
# ------------------------------------------------------------

try:
    sel_gh = evaluate_binned_cut(
        values=gh, bins=edges,
        bin_values=e_true,
        cut=gh_cut,
        operator=">="
    )
    sel_th = evaluate_binned_cut(
        values=theta, bins=edges,
        bin_values=e_true,
        cut=th_cut,
        operator="<="
    )
except Exception:
    print("⚠ WARNING: evaluate_binned_cut failed — using manual fallback.")
    idx = np.digitize(e_true, edges) - 1

    sel_gh = np.zeros_like(gh, dtype=bool)
    sel_th = np.zeros_like(gh, dtype=bool)

    ok = (idx >= 0) & (idx < len(gh_cut))

    sel_gh[ok] = gh[ok] >= gh_cut[idx[ok]]
    sel_th[ok] = theta[ok] <= th_cut[idx[ok]]

sel = sel_gh & sel_th

# ------------------------------------------------------------
# Cut summary
# ------------------------------------------------------------

kept = np.count_nonzero(sel)
total = len(sel)
print(f"\nSelected events: {kept}/{total}  ({kept/total*100:.2f}%)")

# ------------------------------------------------------------
# Show first few bins for inspection
# ------------------------------------------------------------

bin_ids = np.digitize(e_true, edges) - 1
print("\nFirst 8 bins:")
for i in range(min(8, len(edges)-1)):
    c = np.count_nonzero(bin_ids == i)
    k = np.count_nonzero(sel & (bin_ids == i))
    print(f"  Bin {i}: [{edges[i]:.4f}, {edges[i+1]:.4f})  N={c:5d}  kept={k:5d}  gh_cut={gh_cut[i]:.3f}  th_cut={th_cut[i]:.3f}")


Loaded 393483 events for cut computation.
Using efficiencies: gammaness=0.7, theta=0.7
Median gh cut: 0.6442917522954298
Median theta cut: 0.18606500618258706 deg

Selected events: 214568/393483  (54.53%)

First 8 bins:
  Bin 0: [0.8784, 1.1058)  N= 5058  kept= 2643  gh_cut=0.598  th_cut=0.255
  Bin 1: [1.1058, 1.3921)  N=10659  kept= 5658  gh_cut=0.644  th_cut=0.215
  Bin 2: [1.3921, 1.7526)  N=15609  kept= 8200  gh_cut=0.683  th_cut=0.190
  Bin 3: [1.7526, 2.2064)  N=18391  kept= 9758  gh_cut=0.706  th_cut=0.171
  Bin 4: [2.2064, 2.7777)  N=20114  kept=10695  gh_cut=0.723  th_cut=0.161
  Bin 5: [2.7777, 3.4969)  N=21051  kept=11262  gh_cut=0.734  th_cut=0.151
  Bin 6: [3.4969, 4.4024)  N=21383  kept=11584  gh_cut=0.734  th_cut=0.144
  Bin 7: [4.4024, 5.5422)  N=21200  kept=11613  gh_cut=0.732  th_cut=0.141


In [68]:
# CELL 10 — Effective area before and after cuts

import numpy as np
import astropy.units as u
from astropy.table import Table
from pyirf.irf import effective_area_per_energy

# ------------------------------------------------------------
# Preconditions
# ------------------------------------------------------------

if "sim_info" not in globals():
    raise RuntimeError("sim_info missing — run Cell 7.")
if "true_bins" not in globals():
    raise RuntimeError("true_bins missing — run Cell 8.")
if "sel" not in globals():
    raise RuntimeError("sel (selection mask) missing — run Cell 9.")
if "tab" not in globals():
    raise RuntimeError("DL2 table 'tab' missing — run earlier cells.")

# Helper (if not already available in this scope)
def to_value_tev(col):
    try:
        return np.asarray(col.to_value(u.TeV), dtype=float)
    except Exception:
        pass
    try:
        return np.asarray(col.quantity.to_value(u.TeV), dtype=float)
    except Exception:
        pass
    return np.asarray(col, dtype=float)

# ------------------------------------------------------------
# 1) Extract true energies
# ------------------------------------------------------------

true_e = to_value_tev(tab["true_energy"])

if np.any(~np.isfinite(true_e)):
    print("⚠ WARNING: Non-finite values found in true_energy; they will be ignored in histograms.")

print(f"Total DL2 events: {len(true_e)}")
print(f"Selected events (sel=True): {np.count_nonzero(sel)} "
      f"({np.count_nonzero(sel)/len(true_e)*100:.1f}%)")

# ------------------------------------------------------------
# 2) Build pyirf tables: before and after cuts
# ------------------------------------------------------------

t_all = Table()
t_all["true_energy"] = true_e * u.TeV

t_sel = Table()
t_sel["true_energy"] = true_e[sel] * u.TeV

# ------------------------------------------------------------
# 3) Compute effective area
# ------------------------------------------------------------

print("\nComputing Aeff BEFORE cuts...")
aeff_all = effective_area_per_energy(
    selected_events=t_all,
    simulation_info=sim_info,
    true_energy_bins=true_bins,
).to_value(u.m**2)

print("Computing Aeff AFTER cuts...")
aeff_sel = effective_area_per_energy(
    selected_events=t_sel,
    simulation_info=sim_info,
    true_energy_bins=true_bins,
).to_value(u.m**2)

edges_TeV = true_bins.to_value(u.TeV)

# ------------------------------------------------------------
# 4) Sanity checks and summary
# ------------------------------------------------------------

print("\n=== Aeff summary ===")
print("Aeff BEFORE cuts shape:", aeff_all.shape)
print("Aeff AFTER  cuts shape:", aeff_sel.shape)

print("Non-zero bins BEFORE cuts:", int(np.count_nonzero(aeff_all)), "/", len(aeff_all))
print("Non-zero bins AFTER  cuts:", int(np.count_nonzero(aeff_sel)), "/", len(aeff_sel))

max_all = float(np.nanmax(aeff_all))
max_sel = float(np.nanmax(aeff_sel))

print("\nMax Aeff BEFORE cuts: {:.1f} m²".format(max_all))
print("Max Aeff AFTER  cuts: {:.1f} m²".format(max_sel))

if max_all < 10:
    print("⚠ WARNING: Aeff before cuts extremely small (<10 m²) — check n_showers or impact range.")
if max_all > 3e7:
    print("⚠ WARNING: Aeff before cuts > 3e7 m² — suspiciously large for SST-1M.")

if max_sel < 1:
    print("⚠ WARNING: Aeff after cuts extremely small (<1 m²) — cuts might be too strong.")
if max_sel > max_all * 1.01:
    print("⚠ WARNING: Aeff after cuts exceeds Aeff before cuts somewhere — something is inconsistent.")

# ------------------------------------------------------------
# 5) Print first few bins for inspection
# ------------------------------------------------------------

print("\nFirst 8 bins (Etrue, Aeff_all, Aeff_sel, eff_sel):")
for i in range(min(8, len(aeff_all))):
    e_min = edges_TeV[i]
    e_max = edges_TeV[i+1]
    aa = aeff_all[i]
    asel = aeff_sel[i]
    if aa > 0:
        eff = asel / aa
    else:
        eff = np.nan
    print(f"  [{e_min:.3f}, {e_max:.3f}) TeV  "
          f"Aeff_all={aa:9.1f} m²  Aeff_sel={asel:9.1f} m²  eff_sel={eff:5.3f}")

print("\nDone.")


Total DL2 events: 393483
Selected events (sel=True): 214568 (54.5%)

Computing Aeff BEFORE cuts...
Computing Aeff AFTER cuts...

=== Aeff summary ===
Aeff BEFORE cuts shape: (25,)
Aeff AFTER  cuts shape: (25,)
Non-zero bins BEFORE cuts: 25 / 25
Non-zero bins AFTER  cuts: 25 / 25

Max Aeff BEFORE cuts: 236537523.8 m²
Max Aeff AFTER  cuts: 129921077.2 m²

First 8 bins (Etrue, Aeff_all, Aeff_sel, eff_sel):
  [0.878, 1.106) TeV  Aeff_all= 600855.5 m²  Aeff_sel= 313970.1 m²  eff_sel=0.523
  [1.106, 1.392) TeV  Aeff_all=1594071.0 m²  Aeff_sel= 846163.2 m²  eff_sel=0.531
  [1.392, 1.753) TeV  Aeff_all=2938774.5 m²  Aeff_sel=1543849.8 m²  eff_sel=0.525
  [1.753, 2.206) TeV  Aeff_all=4359097.0 m²  Aeff_sel=2312874.1 m²  eff_sel=0.531
  [2.206, 2.778) TeV  Aeff_all=6001912.1 m²  Aeff_sel=3191331.9 m²  eff_sel=0.532
  [2.778, 3.497) TeV  Aeff_all=7907950.1 m²  Aeff_sel=4230646.2 m²  eff_sel=0.535
  [3.497, 4.402) TeV  Aeff_all=10112530.0 m²  Aeff_sel=5478349.5 m²  eff_sel=0.542
  [4.402, 5.542) T

In [69]:
# ================================================================
# CELL — Load matching proton DL2 file (for background estimation)
# ================================================================

import re, glob, h5py
from pathlib import Path
import numpy as np
import astropy.units as u
from ctapipe.io import read_table

print("Gamma file:", Path(first_file).name)

# ----------------------------------------------------------
# 1) Extract zenith/azimuth token from gamma filename
#    (e.g. _20_20deg_)
# ----------------------------------------------------------
name = Path(first_file).name
m = re.search(r"_[0-9]{1,2}_[0-9]{1,2}deg_", name)
zen_token = m.group(0) if m else ""
print("Zenith token:", zen_token or "(none found)")

# ----------------------------------------------------------
# 2) Find proton files matching the same zenith configuration
# ----------------------------------------------------------

if zen_token:
    proton_files = sorted(glob.glob(str(Path(DL2_DIR) / f"proton*{zen_token}*.h5")))
else:
    proton_files = []

# Fallback: any proton file
if not proton_files:
    print("⚠ WARNING: No zenith-matched proton files found — using all proton_*.h5")
    proton_files = sorted(glob.glob(str(Path(DL2_DIR) / "proton_*.h5")))

print(f"Found {len(proton_files)} proton files")
print("Examples:", [Path(f).name for f in proton_files[:5]])

assert proton_files, "❌ ERROR: No proton files available."

# Use the first matching proton file
pfile = proton_files[0]
print("\nUsing proton file:", Path(pfile).name)

# ----------------------------------------------------------
# 3) List DL2 parameter tables in proton file
# ----------------------------------------------------------
def list_parameter_tables(h5file):
    out = []
    with h5py.File(h5file, "r") as h5:
        base = "/dl2/event/telescope/parameters"
        if base in h5:
            out = [f"{base}/{k}" for k in h5[base].keys()]
    return out

pcands = list_parameter_tables(pfile)
print("Proton parameter tables:", pcands)

# Prefer stereo reconstruction if available
ptable = next((p for p in pcands if p.endswith("/stereo")),
              (pcands[0] if pcands else None))

assert ptable, "❌ ERROR: No proton parameter table found."
print("Selected proton table:", ptable)

# ----------------------------------------------------------
# 4) Load proton DL2 table
# ----------------------------------------------------------
ptab = read_table(pfile, ptable)
print(f"Proton rows: {len(ptab)} | Columns: {len(ptab.colnames)}")

# ----------------------------------------------------------
# 5) Check required physics columns
# ----------------------------------------------------------
needed = ["reco_energy", "gammaness", "true_alt", "true_az", "reco_alt", "reco_az"]
missing = [c for c in needed if c not in ptab.colnames]

print("Missing expected proton columns:", missing)
if missing:
    print("⚠ WARNING: Proton DL2 file missing necessary columns; "
          "background estimation may be incomplete.")

# ----------------------------------------------------------
# 6) Sanity: convert reco_energy to TeV, same logic as gamma cell
# ----------------------------------------------------------
def to_value_tev(col):
    try:
        return np.asarray(col.to_value(u.TeV), dtype=float)
    except Exception:
        pass
    try:
        return np.asarray(col.quantity.to_value(u.TeV), dtype=float)
    except Exception:
        pass
    return np.asarray(col, dtype=float)

if "reco_energy" in ptab.colnames:
    ereco = to_value_tev(ptab["reco_energy"])
    print("Proton reco_energy range [TeV]:",
          float(np.nanmin(ereco)), "→", float(np.nanmax(ereco)))

    # unit sanity check
    if np.nanmax(ereco) > 500:
        print("⚠ WARNING: Proton reco_energy > 500 TeV — values likely in GeV, "
              "check units or conversion.")
else:
    print("❌ ERROR: No 'reco_energy' column in proton table.")
    ereco = None

print("\nProton DL2 loaded successfully.")


Gamma file: gamma_200_300E3GeV_20_20deg_testing_dl1_dl2.h5
Zenith token: _20_20deg_
Found 1 proton files
Examples: ['proton_400_500E3GeV_20_20deg_testing_dl1_dl2.h5']

Using proton file: proton_400_500E3GeV_20_20deg_testing_dl1_dl2.h5
Proton parameter tables: ['/dl2/event/telescope/parameters/stereo']
Selected proton table: /dl2/event/telescope/parameters/stereo
Proton rows: 2754879 | Columns: 35
Missing expected proton columns: []
Proton reco_energy range [TeV]: 0.5921603736083508 → 284.53092990263553

Proton DL2 loaded successfully.


In [70]:
# ============================================================
# CELL — Apply gamma cuts to protons & build background shape
# ============================================================

import numpy as np
import astropy.units as u
from pyirf.cuts import evaluate_binned_cut
from pyirf.binning import create_bins_per_decade

# ------------------------------------------------------------
# Preconditions
# ------------------------------------------------------------
if "ptab" not in globals():
    raise RuntimeError("Proton DL2 table 'ptab' not loaded — run proton loader cell first.")
if "tab" not in globals():
    raise RuntimeError("Gamma DL2 table 'tab' not loaded.")
if "true_bins" not in globals():
    raise RuntimeError("true_bins not defined — run energy-binning cell first.")
if "gh_cut" not in globals() or "th_cut" not in globals():
    raise RuntimeError("gh_cut / th_cut not defined — run gamma cut cell first.")

# Use the same true-energy bin edges as for gamma cuts
e_true_edges_TeV = true_bins.to_value(u.TeV)

# ------------------------------------------------------------
# Helper functions (same style as earlier cells)
# ------------------------------------------------------------

def to_value_rad(col):
    try:
        return np.asarray(col.to_value(u.rad))
    except Exception:
        pass
    try:
        return np.asarray(col.quantity.to_value(u.rad))
    except Exception:
        pass
    return np.asarray(col)

def to_value_tev(col):
    try:
        return np.asarray(col.to_value(u.TeV), dtype=float)
    except Exception:
        pass
    try:
        return np.asarray(col.quantity.to_value(u.TeV), dtype=float)
    except Exception:
        pass
    return np.asarray(col, dtype=float)

def theta_from_altaz(true_alt, true_az, reco_alt, reco_az):
    talt = to_value_rad(true_alt); taz = to_value_rad(true_az)
    ralt = to_value_rad(reco_alt); raz = to_value_rad(reco_az)
    cos_th = (
        np.sin(talt) * np.sin(ralt)
        + np.cos(talt) * np.cos(ralt) * np.cos(taz - raz)
    )
    cos_th = np.clip(cos_th, -1.0, 1.0)
    return np.arccos(cos_th)

# ------------------------------------------------------------
# 1) Apply gamma-derived cuts to proton events (per TRUE-E bin)
# ------------------------------------------------------------

p_true = to_value_tev(ptab["true_energy"])
p_gh   = np.asarray(ptab["gammaness"], dtype=float)
p_th   = np.rad2deg(
    theta_from_altaz(
        ptab["true_alt"], ptab["true_az"],
        ptab["reco_alt"], ptab["reco_az"]
    )
)

try:
    p_sel_gh = evaluate_binned_cut(
        values=p_gh,
        bins=e_true_edges_TeV,
        bin_values=p_true,
        cut=gh_cut,
        operator=">="
    )
    p_sel_th = evaluate_binned_cut(
        values=p_th,
        bins=e_true_edges_TeV,
        bin_values=p_true,
        cut=th_cut,
        operator="<="
    )
except Exception:
    print("⚠ WARNING: evaluate_binned_cut failed for protons — using manual per-bin fallback.")
    idx = np.digitize(p_true, e_true_edges_TeV) - 1
    p_sel_gh = np.zeros_like(p_gh, dtype=bool)
    p_sel_th = np.zeros_like(p_gh, dtype=bool)

    good = (idx >= 0) & (idx < len(gh_cut))
    p_sel_gh[good] = p_gh[good] >= gh_cut[idx[good]]
    p_sel_th[good] = p_th[good] <= th_cut[idx[good]]

p_sel = p_sel_gh & p_sel_th

n_p_sel = int(np.count_nonzero(p_sel))
n_p_tot = len(p_sel)
print(f"Proton selected: {n_p_sel} / {n_p_tot}  ({100*n_p_sel/n_p_tot:.2f}%)")

# ------------------------------------------------------------
# 2) Build BACKGROUND SHAPE over RECO-energy bins
#    NOTE: We fix only the SHAPE here; absolute normalization
#          is controlled by TOTAL_RATE_TARGET_HZ (placeholder).
# ------------------------------------------------------------

p_reco = to_value_tev(ptab["reco_energy"])

# "Safe" reco-energy range from percentiles
pr_emin = max(0.05, float(np.nanpercentile(p_reco, 0.5)))
pr_emax = min(60.0, float(np.nanpercentile(p_reco, 99.5)))

e_reco_bins = create_bins_per_decade(
    (pr_emin * u.TeV),
    (pr_emax * u.TeV),
    bins_per_decade=8,
).to_value(u.TeV)

counts_sel, _ = np.histogram(p_reco[p_sel], bins=e_reco_bins)

# ------------------------------------------------------------
# 3) Set absolute background scale (placeholder)
# ------------------------------------------------------------
# This normalizes the spectrum so that the SUM over all bins is
# TOTAL_RATE_TARGET_HZ. The SHAPE is correct from MC; the scale
# is arbitrary here and should be tuned to a realistic rate
# (e.g. from data, or from a known background model).
TOTAL_RATE_TARGET_HZ = 20.0

if counts_sel.sum() == 0:
    print("⚠ WARNING: No selected proton events — background rates set to ~0.")
    rates = np.full(len(e_reco_bins) - 1, 1e-12)
else:
    rates = TOTAL_RATE_TARGET_HZ * counts_sel.astype(float) / counts_sel.sum()
    rates = np.maximum(rates, 1e-12)

print(f"\nReco-energy bins: {len(e_reco_bins)-1}")
print("Nonzero rate bins:", int(np.count_nonzero(rates)))
print("First 10 bins summary (Ereco, N_sel, rate):")
for i in range(min(10, len(rates))):
    print(
        f"  [{e_reco_bins[i]:.3f}, {e_reco_bins[i+1]:.3f}) TeV  "
        f"N_sel={counts_sel[i]:7d}  rate={rates[i]:.6f} Hz"
    )

# ------------------------------------------------------------
# 4) Display "representative" cuts (for CSV metadata)
# ------------------------------------------------------------
gh_display = float(np.nanmedian(gh_cut))
th_display = float(np.nanmedian(th_cut))
print(f"\nDisplay cuts → Gammaness_cut={gh_display:.3f}, Theta_cut_deg={th_display:.3f}")


Proton selected: 158 / 2754879  (0.01%)

Reco-energy bins: 13
Nonzero rate bins: 13
First 10 bins summary (Ereco, N_sel, rate):
  [1.201, 1.602) TeV  N_sel=      1  rate=0.196078 Hz
  [1.602, 2.136) TeV  N_sel=      5  rate=0.980392 Hz
  [2.136, 2.848) TeV  N_sel=      4  rate=0.784314 Hz
  [2.848, 3.798) TeV  N_sel=      6  rate=1.176471 Hz
  [3.798, 5.065) TeV  N_sel=      3  rate=0.588235 Hz
  [5.065, 6.754) TeV  N_sel=      9  rate=1.764706 Hz
  [6.754, 9.007) TeV  N_sel=     11  rate=2.156863 Hz
  [9.007, 12.011) TeV  N_sel=     14  rate=2.745098 Hz
  [12.011, 16.016) TeV  N_sel=     10  rate=1.960784 Hz
  [16.016, 21.358) TeV  N_sel=      8  rate=1.568627 Hz

Display cuts → Gammaness_cut=0.644, Theta_cut_deg=0.186


In [None]:
import re
import numpy as np
import pandas as pd
from pathlib import Path
import astropy.units as u

# --- required inputs: first_file, true_bins, aeff_sel, mu_loc, mu_scale, mu_a, model
#                      ptab, e_reco_bins, rates, gh_display, th_display

# ------------------------------
# 1) Extract zenith angle
# ------------------------------
name = Path(first_file).name
m = re.search(r"_([0-9]{1,2})_([0-9]{1,2})deg_", name)
zenith = int(m.group(1)) if m else -1
print("Zenith angle (deg):", zenith)

e_true_edges_TeV = true_bins.to_value(u.TeV)

# ------------------------------
# Migration model mapping
# ------------------------------
def map_model(x):
    if x is None:
        return "Gaus"
    x = str(x).lower()
    if "log" in x:
        return "LogNorm"
    if "gau" in x or "norm" in x:
        return "Gaus"
    return "Gaus"  # fallback


# ------------------------------
# 2) Build gamma IRF CSV
# ------------------------------
gamma_rows = []
for i in range(len(e_true_edges_TeV) - 1):

    loc  = mu_loc[i]   if np.isfinite(mu_loc[i])   else 0.0
    scale= mu_scale[i] if np.isfinite(mu_scale[i]) else 0.3
    aparam = mu_a[i]   if np.isfinite(mu_a[i])     else 2.0
    model_label = map_model(model[i])

    aeff_val = max(0.0, float(aeff_sel[i]))

    row = {
        "SPECIES": "gamma",
        "ZD_deg":  float(zenith),
        "Etrue_min_TeV": float(e_true_edges_TeV[i]),
        "Etrue_max_TeV": float(e_true_edges_TeV[i+1]),
        "Aeff_m2": aeff_val,
        "emig_mu_loc": float(loc),
        "emig_mu_scale": float(scale),
        "emig_mu_a": float(aparam),
        "emig_model": model_label,
    }
    gamma_rows.append(row)

df_gamma = pd.DataFrame(gamma_rows)


# ------------------------------
# 3) Build background IRF CSV
# ------------------------------
back_rows = []
for i in range(len(e_reco_bins) - 1):
    row = {
        "SPECIES": "proton",
        "ZD_deg": float(zenith),
        "Ereco_min_TeV": float(e_reco_bins[i]),
        "Ereco_max_TeV": float(e_reco_bins[i+1]),
        "BckgRate_per_second": float(rates[i]),
        "Theta_cut_deg": float(th_display),
        "Gammaness_cut": float(gh_display),
    }
    back_rows.append(row)

df_back = pd.DataFrame(back_rows)


# ------------------------------
# 4) Save
# ------------------------------
OUTPUT_DIR = Path("./SST1M_csv")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

tag = f"gh70_th70"

gamma_csv = OUTPUT_DIR / f"SST1M_gamma_ZD{zenith}_{tag}.csv"
backg_csv = OUTPUT_DIR / f"SST1M_backg_ZD{zenith}_{tag}.csv"

df_gamma.to_csv(gamma_csv, index=False)
df_back.to_csv(backg_csv, index=False)

print("Saved gamma CSV:", gamma_csv)
print("Saved backg CSV:", backg_csv)

print("\nGamma head:\n", df_gamma.head(3))
print("\nBackground head:\n", df_back.head(3))


Zenith angle (deg): 20
Saved gamma CSV: SST1M_csv\SST1M_gamma_irf_gheffi_0.70_theffi_0.70.csv
Saved backg CSV: SST1M_csv\SST1M_backg_irf_gheffi_0.70_theffi_0.70.csv
Gamma head:
    ZD_deg  Etrue_min_TeV  Etrue_max_TeV       Aeff_m2  emig_mu_loc  \
0    20.0       0.988613       1.244591  5.523159e+05     0.032257   
1    20.0       1.244591       1.566847  1.168805e+06     0.001696   
2    20.0       1.566847       1.972543  1.915582e+06    -0.007345   

   emig_mu_scale  emig_mu_a      emig_model  
0       0.100580    1.98594  log10_skewnorm  
1       0.090290    1.54035  log10_skewnorm  
2       0.040725        NaN     log10_moyal  
Backg head:
    ZD_deg  Ereco_min_TeV  Ereco_max_TeV  BckgRate_per_second  Theta_cut_deg  \
0    20.0       1.201065       1.601645             0.904491       13.27004   
1    20.0       1.601645       2.135828             1.590875       13.27004   
2    20.0       2.135828       2.848173             1.655004       13.27004   

   Gammaness_cut  
0       

In [None]:
# ONE-CELL PIPELINE: build IRF CSVs for all zeniths using gamma + gamma_point (signal) and proton (background)
# Efficiencies: 0.40, 0.70, 0.90
# Output: ./final_CSV/SST1M_gamma_irf_gheffi_XX_theffi_XX.csv and SST1M_backg_irf_gheffi_XX_theffi_XX.csv

import re, glob, inspect
from pathlib import Path

import numpy as np
import pandas as pd
import astropy.units as u
from astropy.table import vstack, Table
import h5py

from ctapipe.io import read_table

from pyirf.cuts import calculate_percentile_cut, evaluate_binned_cut
from pyirf.binning import create_bins_per_decade
from pyirf.irf import effective_area_per_energy
from pyirf.simulations import SimulatedEventsInfo

from scipy.stats import moyal, skewnorm  # import once, not inside the loop

# ---------------- utilities ----------------

def zd_from_filename(path: str):
    """Extract zenith angle in deg from filename like *_20_20deg_*.h5"""
    m = re.search(r"_([0-9]{1,2})_([0-9]{1,2})deg_", Path(path).name)
    if not m:
        return None
    return float(int(m.group(1)))


def to_value_tev(col):
    """Robust conversion of a column to float array in TeV."""
    for attr in ("to_value", "quantity"):
        try:
            if attr == "to_value":
                return np.asarray(col.to_value(u.TeV), dtype=float)
            return np.asarray(col.quantity.to_value(u.TeV), dtype=float)
        except Exception:
            pass
    return np.asarray(col, dtype=float)


def to_value_rad(col):
    """Robust conversion of an angle column to radians (float array)."""
    for attr in ("to_value", "quantity"):
        try:
            if attr == "to_value":
                return np.asarray(col.to_value(u.rad))
            return np.asarray(col.quantity.to_value(u.rad))
        except Exception:
            pass
    return np.asarray(col)


def theta_from_altaz(true_alt, true_az, reco_alt, reco_az):
    """Angular separation between true and reconstructed directions, using alt/az."""
    talt = to_value_rad(true_alt)
    taz  = to_value_rad(true_az)
    ralt = to_value_rad(reco_alt)
    raz  = to_value_rad(reco_az)
    cos_th = (
        np.sin(talt) * np.sin(ralt)
        + np.cos(talt) * np.cos(ralt) * np.cos(taz - raz)
    )
    return np.arccos(np.clip(cos_th, -1.0, 1.0))


def read_param_table(path: str):
    """
    Load a DL2 parameter table from a HDF5 DL2 file.
    Prefer /dl2/event/telescope/parameters/stereo if present,
    otherwise use the first available parameter table.
    """
    with h5py.File(path, "r") as h5:
        base = "/dl2/event/telescope/parameters"
        if base not in h5:
            raise KeyError(f"{base} missing in {path}")

        # prefer stereo
        candidates = [f"{base}/stereo"] + [
            f"{base}/{k}" for k in h5[base].keys() if k != "stereo"
        ]
        for p in candidates:
            if p in h5:
                return read_table(path, p)

    raise KeyError("No parameters table found")


def stack_tables(paths):
    """
    Read and vertically stack multiple DL2 parameter tables into one astropy Table.
    Metadata is cleared to avoid conflicts.
    """
    tabs = []
    for p in paths:
        try:
            t = read_param_table(p)
            t.meta = {}  # drop metadata to avoid conflicts in vstack
            tabs.append(t)
        except Exception as e:
            print(f"[skip] {Path(p).name}: {e}")
    if not tabs:
        return None
    return vstack(tabs, metadata_conflicts="silent")




def as_float_array(x):
    """
    Convert the output of calculate_percentile_cut or similar
    to a simple 1D float numpy array, regardless of structure.
    """
    if hasattr(x, "colnames"):
        name = "cut" if "cut" in x.colnames else x.colnames[0]
        col = x[name]
        try:
            return np.asarray(col.to_value(u.one), dtype=float)
        except Exception:
            return np.asarray(col, dtype=float)

    if isinstance(x, np.ndarray) and x.dtype.names:
        fld = "cut" if "cut" in x.dtype.names else x.dtype.names[0]
        return np.asarray(x[fld], dtype=float)

    try:
        return np.asarray(x.to_value(u.one), dtype=float)
    except Exception:
        return np.asarray(x, dtype=float)


def percentile_cut_compat(values, bin_values, edges, eff_keep, upper_tail=False):
    """
    Wrapper around pyirf.calculate_percentile_cut with:
      - compatibility for both 'efficiency'/'percentile' signatures
      - manual per-bin percentile fallback if it crashes
    """
    values = np.asarray(values, float)
    bin_values = np.asarray(bin_values, float)
    edges = np.asarray(edges, float)

    sig = inspect.signature(calculate_percentile_cut)
    params = set(sig.parameters.keys())
    kwargs = dict(
        values=values,
        bins=edges,
        bin_values=bin_values,
    )

    target = (1.0 - eff_keep) if upper_tail else eff_keep

    if "efficiency" in params:
        kwargs["efficiency"] = target
    else:
        kwargs["percentile"] = target * 100.0

    if "fill_value" in params:
        kwargs["fill_value"] = np.nan

    try:
        out = calculate_percentile_cut(**kwargs)
        return as_float_array(out)
    except Exception:
        # manual fallback: per-bin percentile
        print("⚠ WARNING: calculate_percentile_cut failed — using manual per-bin percentile fallback.")
        cut = np.full(len(edges) - 1, np.nan)
        pct = (1.0 - eff_keep) * 100.0 if upper_tail else eff_keep * 100.0

        for i in range(len(edges) - 1):
            m = (bin_values >= edges[i]) & (bin_values < edges[i + 1]) & np.isfinite(values)
            if np.any(m):
                cut[i] = np.nanpercentile(values[m], pct)

        return cut


def eval_binned(values, bin_values, edges, cut, op):
    """
    Wrapper around pyirf.evaluate_binned_cut with robust manual fallback.
    """
    values = np.asarray(values, float)
    bin_values = np.asarray(bin_values, float)
    edges = np.asarray(edges, float)
    cut_arr = as_float_array(cut)

    try:
        return evaluate_binned_cut(
            values=values,
            bins=edges,
            bin_values=bin_values,
            cut=cut_arr,
            operator=op,
        )
    except Exception:
        print("⚠ WARNING: evaluate_binned_cut failed — using manual selection fallback.")
        idx = np.digitize(bin_values, edges) - 1
        sel = np.zeros_like(values, dtype=bool)
        good = (idx >= 0) & (idx < len(cut_arr))
        if op == ">=":
            sel[good] = values[good] >= cut_arr[idx[good]]
        else:
            sel[good] = values[good] <= cut_arr[idx[good]]
        return sel


# ---------------- gather files by zenith ----------------

DL2 = Path(DL2_DIR)

gamma_files       = sorted(glob.glob(str(DL2 / "gamma_*.h5")))
gamma_point_files = sorted(glob.glob(str(DL2 / "gamma_point_*.h5")))
proton_files      = sorted(glob.glob(str(DL2 / "proton_*.h5")))

def group_by_zenith(paths):
    d = {}
    for p in paths:
        z = zd_from_filename(p)
        if z is None:
            continue
        d.setdefault(z, []).append(p)
    return d

G  = group_by_zenith(gamma_files)
GP = group_by_zenith(gamma_point_files)
P  = group_by_zenith(proton_files)

zeniths = sorted(set(G.keys()) | set(GP.keys()) | set(P.keys()))
if not zeniths:
    raise RuntimeError("No DL2 files grouped by zenith found.")


# ---------------- build CSVs per efficiency, appending all zeniths ----------------

OUT = Path("./final_CSV")
OUT.mkdir(parents=True, exist_ok=True)

eff_list = [0.40, 0.70, 0.90]

for eff in eff_list:
    gamma_csv = OUT / f"SST1M_gamma_irf_gheffi_{eff:.2f}_theffi_{eff:.2f}.csv"
    backg_csv = OUT / f"SST1M_backg_irf_gheffi_{eff:.2f}_theffi_{eff:.2f}.csv"

    # fresh start for this efficiency
    if gamma_csv.exists():
        gamma_csv.unlink()
    if backg_csv.exists():
        backg_csv.unlink()

    for zd in zeniths:
        g_paths = G.get(zd, []) + GP.get(zd, [])
        p_paths = P.get(zd, [])

        if not g_paths or not p_paths:
            print(f"[skip zd={zd}] gamma_paths={len(g_paths)} proton_paths={len(p_paths)}")
            continue

        g_tab = stack_tables(g_paths)
        p_tab = stack_tables(p_paths)
        if g_tab is None or p_tab is None:
            print(f"[skip zd={zd}] could not stack tables")
            continue

        # ------ extract basic arrays ------
        g_true = to_value_tev(g_tab["true_energy"])
        g_reco = to_value_tev(g_tab["reco_energy"])
        g_gh   = np.asarray(g_tab["gammaness"], float)
        g_th   = np.rad2deg(
            theta_from_altaz(
                g_tab["true_alt"], g_tab["true_az"],
                g_tab["reco_alt"], g_tab["reco_az"],
            )
        )

        p_true = to_value_tev(p_tab["true_energy"])
        p_reco = to_value_tev(p_tab["reco_energy"])
        p_gh   = np.asarray(p_tab["gammaness"], float)
        p_th   = np.rad2deg(
            theta_from_altaz(
                p_tab["true_alt"], p_tab["true_az"],
                p_tab["reco_alt"], p_tab["reco_az"],
            )
        )

        # ------ simulation info (gamma MC) ------
        # Use only DL2 truth values, DO NOT use shower_distribution metadata
        n_showers = len(g_true)   # simulated events

        sim_g = SimulatedEventsInfo(
            n_showers=n_showers,
            energy_min=np.nanmin(g_true) * u.TeV,
            energy_max=np.nanmax(g_true) * u.TeV,
            max_impact=300.0 * u.m,        # SST-1M standard default
            spectral_index=-2.0,
            viewcone_min=0.0 * u.deg,
            viewcone_max=0.0 * u.deg
        )



        # ------ binning in TRUE energy (for Aeff + migration) ------
        true_bins = create_bins_per_decade(1 * u.TeV, 300.0 * u.TeV, bins_per_decade=10)
        e_true_edges = true_bins.to_value(u.TeV)


        # ------ binning in RECO energy (for background) ------
        r_emin = max(0.05, float(np.nanpercentile(p_reco, 0.5)))
        r_emax = min(80.0, float(np.nanpercentile(p_reco, 99.7)))
        reco_bins = create_bins_per_decade((r_emin * u.TeV), (r_emax * u.TeV), bins_per_decade=8)
        e_reco_edges = reco_bins.to_value(u.TeV)

        # --------------------------------------------------
        # BACKGROUND: cuts vs RECO energy (for bkg spectrum)
        # --------------------------------------------------
        gh_cut_reco = percentile_cut_compat(
            g_gh, g_reco, e_reco_edges,
            eff_keep=eff,
            upper_tail=True,     # keep upper tail in gammaness
        )
        th_cut_reco = percentile_cut_compat(
            g_th, g_reco, e_reco_edges,
            eff_keep=eff,
            upper_tail=False,    # keep lower tail in theta
        )

        p_sel_gh = eval_binned(p_gh, p_reco, e_reco_edges, gh_cut_reco, op=">=")
        p_sel_th = eval_binned(p_th, p_reco, e_reco_edges, th_cut_reco, op="<=")
        p_sel = p_sel_gh & p_sel_th

        counts_sel, _ = np.histogram(p_reco[p_sel], bins=e_reco_edges)

        # total background rate (normalization knob)
        total_rate_hz = 20.0
        if counts_sel.sum() == 0:
            rates = np.full(len(e_reco_edges) - 1, 1e-12)
            print(f"⚠ WARNING: No selected protons at zd={zd}, eff={eff:.2f} — setting tiny background.")
        else:
            rates = total_rate_hz * counts_sel.astype(float) / counts_sel.sum()
            rates = np.maximum(rates, 1e-12)

        df_back = pd.DataFrame({
            "ZD_deg": np.full(len(e_reco_edges) - 1, zd, dtype=float),
            "Ereco_min_TeV": e_reco_edges[:-1].astype(float),
            "Ereco_max_TeV": e_reco_edges[1:].astype(float),
            "BckgRate_per_second": rates.astype(float),
            "Theta_cut_deg": th_cut_reco.astype(float),
            "Gammaness_cut": gh_cut_reco.astype(float),
        })

        df_back.to_csv(backg_csv, mode="a", header=not backg_csv.exists(), index=False)

        # --------------------------------------------------
        # GAMMA: cuts vs TRUE energy (for Aeff & migration)
        # --------------------------------------------------
        gh_cut_true = percentile_cut_compat(
            g_gh, g_true, e_true_edges,
            eff_keep=eff,
            upper_tail=True,
        )
        th_cut_true = percentile_cut_compat(
            g_th, g_true, e_true_edges,
            eff_keep=eff,
            upper_tail=False,
        )

        g_sel_gh = eval_binned(g_gh, g_true, e_true_edges, gh_cut_true, op=">=")
        g_sel_th = eval_binned(g_th, g_true, e_true_edges, th_cut_true, op="<=")
        g_sel = g_sel_gh & g_sel_th

        t_selected = Table()
        t_selected["true_energy"] = (g_true[g_sel] * u.TeV)
        aeff_sel = effective_area_per_energy(t_selected, sim_g, true_bins).to_value(u.m**2)

        # --------------------------------------------
        # Energy-migration fit: ratio Ereco/Etrue
        # --------------------------------------------
        n_true_bins = len(e_true_edges) - 1
        mu_loc   = np.full(n_true_bins, np.nan)
        mu_scale = np.full(n_true_bins, np.nan)
        mu_a     = np.full(n_true_bins, np.nan)
        model    = np.array(["moyal"] * n_true_bins, dtype=object)

        for i in range(n_true_bins):
            m = g_sel & (g_true >= e_true_edges[i]) & (g_true < e_true_edges[i + 1])
            if np.count_nonzero(m) < 20:
                continue

            ratio = g_reco[m] / np.clip(g_true[m], 1e-20, None)
            ratio = np.clip(ratio, 1e-6, 1e6)

            med = float(np.nanmedian(ratio))
            mad = float(np.nanmedian(np.abs(ratio - med)))
            sigma = max(1.4826 * mad, 1e-3)
            clean = ratio[np.abs(ratio - med) < 3.0 * sigma]
            if clean.size < 20:
                clean = ratio

            # moyal fit
            try:
                loc_m, scale_m = moyal.fit(clean, loc=med, scale=sigma)
            except Exception:
                loc_m, scale_m = med, sigma

            ok_m = (
                np.isfinite(loc_m) and np.isfinite(scale_m)
                and (0.01 < scale_m < 3.0)
            )

            # skewnorm fit
            try:
                a_s, loc_s, scale_s = skewnorm.fit(clean, loc=med, scale=sigma)
                ok_s = (
                    np.isfinite(a_s) and np.isfinite(loc_s) and np.isfinite(scale_s)
                    and abs(a_s) < 30.0
                    and (0.01 < scale_s < 3.0)
                )
            except Exception:
                ok_s = False
                a_s = loc_s = scale_s = np.nan

            if ok_s and (abs(a_s) > 0.5):
                mu_loc[i], mu_scale[i], mu_a[i] = float(loc_s), float(scale_s), float(a_s)
                model[i] = "skewnorm"
            elif ok_m:
                mu_loc[i], mu_scale[i], mu_a[i] = float(loc_m), float(scale_m), np.nan
                model[i] = "moyal"
            else:
                mu_loc[i], mu_scale[i], mu_a[i] = med, sigma, np.nan
                model[i] = "moyal"

        df_gamma = pd.DataFrame({
            "ZD_deg": np.full(n_true_bins, zd, dtype=float),
            "Etrue_min_TeV": e_true_edges[:-1].astype(float),
            "Etrue_max_TeV": e_true_edges[1:].astype(float),
            "Aeff_m2": aeff_sel.astype(float),
            "emig_mu_loc": mu_loc.astype(float),
            "emig_mu_scale": mu_scale.astype(float),
            "emig_mu_a": mu_a.astype(float),
            "emig_model": model,
        })

        df_gamma.to_csv(gamma_csv, mode="a", header=not gamma_csv.exists(), index=False)

        print(f"[zd={zd:>2.0f} eff={eff:.2f}] rows: back={len(df_back)}, gamma={len(df_gamma)}")

print("Done. CSVs in:", OUT.resolve())


[zd=20 eff=0.40] rows: back=14, gamma=30
[zd=30 eff=0.40] rows: back=13, gamma=30


  return (n_selected / n_simulated) * area


[zd=40 eff=0.40] rows: back=12, gamma=30


  return (n_selected / n_simulated) * area


[zd=60 eff=0.40] rows: back=7, gamma=30
[zd=20 eff=0.70] rows: back=14, gamma=30
[zd=30 eff=0.70] rows: back=13, gamma=30


  return (n_selected / n_simulated) * area


[zd=40 eff=0.70] rows: back=12, gamma=30


  return (n_selected / n_simulated) * area


[zd=60 eff=0.70] rows: back=7, gamma=30
[zd=20 eff=0.90] rows: back=14, gamma=30


In [1]:
for i, a in enumerate(aeff_sel):
    print(i, e_true_edges[i], a)


NameError: name 'aeff_sel' is not defined