In [11]:
import sys
import os
from scipy.optimize import curve_fit

# Resolve the parent of the current working directory (the notebook is expected
# to run from notebooks/, making this the project root) so that project-local
# packages can be imported.
project_root = os.path.abspath("..")

# Re-running this cell must not stack duplicate entries on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Added to Python path:", project_root)


Added to Python path: c:\Users\YooNi\OneDrive\Desktop\Majorana-Neutrino-Hunt


In [12]:
import h5py
import numpy as np
import pandas as pd
# from src.parameters.tail_features import compute_LQ80

In [13]:
train_file = "../data/old/MJD_Train_0.hdf5"

# Copy both datasets into in-memory NumPy arrays while the file is open, so
# they remain usable once the handle has been closed.
with h5py.File(train_file, mode="r") as f:
    ids = np.array(f["id"])
    waveforms = np.array(f["raw_waveform"])

print("Loaded", len(waveforms), "waveforms")


Loaded 65000 waveforms


In [14]:
def print_hdf5_structure(name, obj):
    """
    Visitor callback for ``visititems``: print the name of one HDF5 member.

    ``obj`` (the member itself) is accepted because the callback is invoked
    with two arguments, but it is not used.
    """
    print(name)


# List the name of every member the visitor is called for in the training file.
with h5py.File(train_file, mode="r") as f:
    f.visititems(print_hdf5_structure)


detector
energy_label
id
psd_label_dcr
psd_label_high_avse
psd_label_low_avse
psd_label_lq
raw_waveform
run_number
tp0


In [15]:
def estimate_baseline(y, n_samples=200):
    """
    Estimate the baseline level and its spread from the start of a waveform.

    Parameters
    ----------
    y : array-like
        Waveform samples; converted to a float array.
    n_samples : int
        Number of leading samples to use. Plain slicing is applied, so a
        waveform shorter than ``n_samples`` contributes all of its samples.

    Returns
    -------
    tuple of float
        ``(mean, std)`` of the leading window, where ``std`` is NumPy's
        default (population, ddof=0) standard deviation.
    """
    head = np.asarray(y, dtype=float)[:n_samples]
    level = float(head.mean())
    spread = float(head.std())
    return level, spread


In [16]:
def exponential(t, a, tau1, b, tau2):
    """
    Sum of two exponential decays evaluated at ``t``:

        a * exp(-t / tau1) + b * exp(-t / tau2)

    ``a`` and ``b`` are the amplitudes at ``t = 0``; ``tau1`` and ``tau2`` are
    the decay constants, in the same units as ``t``. Intended as the model for
    the long falling edge of HPGe waveforms.
    """
    first_decay = a * np.exp(-t / tau1)
    second_decay = b * np.exp(-t / tau2)
    return first_decay + second_decay


In [17]:
# ------------------------------------------------------------
# Pole–Zero Correction 
# ------------------------------------------------------------
def pole_zero_correction(waveform, use_pz=False):
    """
    Applies pole–zero correction to the waveform tail.

    The tail starts at the first sample that is at least 98 % of the waveform
    maximum. A double exponential (``exponential``) is fitted to the tail and
    every tail sample is multiplied by ``f_t0 / model``, where ``f_t0`` is the
    mean of the first five tail samples and ``model`` is the fitted decay.

    Parameters
    ----------
    waveform : array-like
        Raw waveform; converted to a float array.
    use_pz : bool
        If False, returns the waveform unchanged.
        If True, attempts exponential fitting and tail correction.

    Returns
    -------
    waveform_pz : np.ndarray
        Corrected waveform, or the uncorrected float waveform when the
        correction is disabled or cannot be applied.
    corrected_tail : np.ndarray
        The corrected tail region (samples from the 98 % point onward). When
        no correction is applied this is the full uncorrected waveform, i.e.
        the same array as ``waveform_pz``.

    Notes
    -----
    The uncorrected waveform is returned when ``use_pz`` is False, when the
    waveform is empty, when its maximum is not a positive finite number, or
    when the fit raises. The fitted model itself is not validated: if it
    evaluates to zero somewhere, the corresponding corrected samples are
    non-finite and the caller has to deal with them.
    """
    y = np.asarray(waveform, dtype=float)

    # Disabled: hand back the original samples immediately.
    if not use_pz:
        return y, y

    # np.max raises on an empty array; treat "nothing to correct" as a fallback.
    if y.size == 0:
        return y, y

    # --------------------------------------------------------
    # Identify 98 percent rise point (start of decay)
    # --------------------------------------------------------
    peak_value = np.max(y)

    # The fit bounds below are [0, 2 * peak], so the peak must be positive and
    # finite (a NaN anywhere in the waveform makes the peak NaN as well).
    if not np.isfinite(peak_value) or peak_value <= 0:
        return y, y

    t98_idx = np.where(y >= 0.98 * peak_value)[0]
    if len(t98_idx) == 0:
        return y, y
    t98 = int(t98_idx[0])

    # Tail region, with time measured in samples from the 98 % point.
    tail_values = y[t98:]
    tail_time = np.arange(len(tail_values))

    # --------------------------------------------------------
    # Fit a double exponential tail
    # Bounded parameters keep the fit away from overflow and unrealistic
    # decay constants.
    # --------------------------------------------------------
    try:
        params, _ = curve_fit(
            exponential,
            tail_time,
            tail_values,
            p0=[peak_value, 300.0, peak_value * 0.1, 1500.0],  # initial guesses
            bounds=(
                [0, 10, 0, 10],          # lower bounds
                [peak_value * 2, 5000, peak_value * 2, 5000]  # upper bounds
            ),
            maxfev=4000
        )

        # Fitted decay evaluated on the tail.
        model_decay = exponential(tail_time, *params)

        # Reference level the tail is flattened to.
        f_t0 = np.mean(tail_values[:5])
        f_pz = f_t0 / model_decay

        corrected_tail = tail_values * f_pz

        # Splice the corrected tail into a copy so the input is never mutated.
        waveform_pz = y.copy()
        waveform_pz[t98:] = corrected_tail

        return waveform_pz, corrected_tail

    except Exception:
        # Deliberate best-effort: any failure of the fit leaves the waveform
        # uncorrected instead of aborting a long batch run.
        return y, y



In [18]:
def compute_LQ80(waveform):
    """
    Late Charge 80:
    Area difference between raw and PZ-corrected waveform
    starting at the 80 percent rising edge.

    Parameters
    ----------
    waveform : array-like
        Raw waveform; converted to a float array.

    Returns
    -------
    float
        ``area_raw - area_corrected``, both integrated with the trapezoidal
        rule from the first sample at or above
        ``baseline + 0.80 * (peak - baseline)`` to the end of the waveform.
        ``np.nan`` if the waveform is empty or no sample reaches that level.

    Notes
    -----
    When ``pole_zero_correction`` falls back to the uncorrected waveform the
    two areas are identical and the result is 0.0.
    """
    y = np.asarray(waveform, dtype=float)

    # An empty waveform has neither baseline nor peak; report it as missing
    # (same convention as the "no crossing" case) instead of raising.
    if y.size == 0:
        return np.nan

    # The correction has to be enabled explicitly: use_pz defaults to False,
    # in which case raw and corrected waveforms are identical and LQ80 is 0.
    waveform_pz, _ = pole_zero_correction(y, use_pz=True)
    yc = np.asarray(waveform_pz, dtype=float)

    # Baseline
    baseline, _ = estimate_baseline(y)

    # 80 % of the baseline-to-peak amplitude
    peak_val = float(np.max(y))
    target = baseline + 0.80 * (peak_val - baseline)

    # Rising-edge crossing: first sample at or above the target level
    idx = np.where(y >= target)[0]
    if len(idx) == 0:
        return np.nan

    i80 = int(idx[0])

    # Time index for integration (unit sample spacing)
    t = np.arange(len(y), dtype=float)

    # np.trapezoid was introduced in NumPy 2.0; older releases only have np.trapz.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    area_raw  = float(integrate(y[i80:],  t[i80:]))
    area_corr = float(integrate(yc[i80:], t[i80:]))

    return area_raw - area_corr


In [19]:
# Directory, relative to the notebook's working directory, that receives the
# generated CSV files. It is created if missing and left alone if present.
OUTPUT_DIR = "finalcsveunice"
os.makedirs(name=OUTPUT_DIR, exist_ok=True)


In [20]:
# Compute LQ80 for every waveform in the training files and write one combined
# CSV. Each file's results are also written to their own CSV as soon as the
# file is finished, so interrupting this long run does not discard files that
# were already processed.
all_ids = []
all_LQ80 = []

for train_idx in range(16):
    train_file = f"../data/old/MJD_Train_{train_idx}.hdf5"
    if not os.path.exists(train_file):
        print(f"Skipping missing file: {train_file}")
        continue

    print(f"\nLoading {train_file}")

    with h5py.File(train_file, "r") as f:
        waveforms = np.array(f["raw_waveform"])
        ids = np.array(f["id"])

    print(f"  Waveforms: {len(waveforms)}")

    # Results for this file only; merged into the global lists once complete.
    file_ids = []
    file_LQ80 = []

    for i, wf in enumerate(waveforms):
        if i % 5000 == 0:
            print(f"    LQ80 Train_{train_idx} {i}/{len(waveforms)}")

        file_LQ80.append(compute_LQ80(wf))
        # Ids are suffixed with the file index so they stay distinct across files.
        file_ids.append(f"{ids[i]}_train_{train_idx}")

    # Per-file checkpoint, with the same non-finite -> NaN normalisation as
    # the combined output below.
    file_LQ80 = np.array(file_LQ80, dtype=float)
    file_LQ80[~np.isfinite(file_LQ80)] = np.nan
    part_path = os.path.join(OUTPUT_DIR, f"LQ80_train_{train_idx}.csv")
    pd.DataFrame({"id": file_ids, "LQ80": file_LQ80}).to_csv(part_path, index=False)
    print(f"  Checkpoint saved: {part_path}")

    all_ids.extend(file_ids)
    all_LQ80.extend(file_LQ80.tolist())

# Any non-finite value (inf from a degenerate correction, NaN) is stored as NaN.
all_LQ80 = np.array(all_LQ80, dtype=float)
all_LQ80[~np.isfinite(all_LQ80)] = np.nan

df_lq80 = pd.DataFrame({
    "id": all_ids,
    "LQ80": all_LQ80
})

output_path_lq80 = os.path.join(OUTPUT_DIR, "LQ80_train_all.csv")
df_lq80.to_csv(output_path_lq80, index=False)

print("\nSaved combined LQ80 CSV to:", output_path_lq80)
print(df_lq80.head())
print(df_lq80["LQ80"].describe())
print("NaNs:", df_lq80["LQ80"].isna().sum())



Loading ../data/old/MJD_Train_0.hdf5
  Waveforms: 65000
    LQ80 Train_0 0/65000
    LQ80 Train_0 5000/65000
    LQ80 Train_0 10000/65000
    LQ80 Train_0 15000/65000
    LQ80 Train_0 20000/65000
    LQ80 Train_0 25000/65000
    LQ80 Train_0 30000/65000
    LQ80 Train_0 35000/65000
    LQ80 Train_0 40000/65000
    LQ80 Train_0 45000/65000
    LQ80 Train_0 50000/65000


KeyboardInterrupt: 

In [21]:
# LQ80 Test
all_test_ids = []
all_LQ80_test = []

for test_idx in range(6):
    test_file = f"../data/old/MJD_Test_{test_idx}.hdf5"
    if not os.path.exists(test_file):
        print(f"Skipping missing file: {test_file}")
        continue

    print(f"\nLoading {test_file}")

    with h5py.File(test_file, "r") as f:
        waveforms_test = np.array(f["raw_waveform"])
        ids_test = np.array(f["id"])

    print(f"  Waveforms: {len(waveforms_test)}")

    for i, wf in enumerate(waveforms_test):
        if i % 5000 == 0:
            print(f"    LQ80 Test_{test_idx} {i}/{len(waveforms_test)}")

        all_LQ80_test.append(compute_LQ80(wf))
        all_test_ids.append(f"{ids_test[i]}_test_{test_idx}")

all_LQ80_test = np.array(all_LQ80_test, dtype=float)
all_LQ80_test[~np.isfinite(all_LQ80_test)] = np.nan

df_lq80_test = pd.DataFrame({
    "id": all_test_ids,
    "LQ80": all_LQ80_test
})

output_path_lq80_test = os.path.join(OUTPUT_DIR, "LQ80_test_all.csv")
df_lq80_test.to_csv(output_path_lq80_test, index=False)

print("\nSaved combined LQ80 TEST CSV to:", output_path_lq80_test)
print(df_lq80_test.head())
print(df_lq80_test["LQ80"].describe())
print("NaNs:", df_lq80_test["LQ80"].isna().sum())



Loading ../data/old/MJD_Test_0.hdf5
  Waveforms: 65000
    LQ80 Test_0 0/65000
    LQ80 Test_0 5000/65000
    LQ80 Test_0 10000/65000
    LQ80 Test_0 15000/65000
    LQ80 Test_0 20000/65000
    LQ80 Test_0 25000/65000
    LQ80 Test_0 30000/65000
    LQ80 Test_0 35000/65000
    LQ80 Test_0 40000/65000
    LQ80 Test_0 45000/65000
    LQ80 Test_0 50000/65000
    LQ80 Test_0 55000/65000
    LQ80 Test_0 60000/65000

Loading ../data/old/MJD_Test_1.hdf5
  Waveforms: 65000
    LQ80 Test_1 0/65000
    LQ80 Test_1 5000/65000
    LQ80 Test_1 10000/65000
    LQ80 Test_1 15000/65000
    LQ80 Test_1 20000/65000
    LQ80 Test_1 25000/65000
    LQ80 Test_1 30000/65000
    LQ80 Test_1 35000/65000
    LQ80 Test_1 40000/65000
    LQ80 Test_1 45000/65000
    LQ80 Test_1 50000/65000
    LQ80 Test_1 55000/65000
    LQ80 Test_1 60000/65000

Loading ../data/old/MJD_Test_2.hdf5
  Waveforms: 65000
    LQ80 Test_2 0/65000
    LQ80 Test_2 5000/65000
    LQ80 Test_2 10000/65000
    LQ80 Test_2 15000/65000
    LQ8

  params, _ = curve_fit(


    LQ80 Test_2 40000/65000
    LQ80 Test_2 45000/65000
    LQ80 Test_2 50000/65000
    LQ80 Test_2 55000/65000
    LQ80 Test_2 60000/65000

Loading ../data/old/MJD_Test_3.hdf5
  Waveforms: 65000
    LQ80 Test_3 0/65000
    LQ80 Test_3 5000/65000
    LQ80 Test_3 10000/65000
    LQ80 Test_3 15000/65000
    LQ80 Test_3 20000/65000
    LQ80 Test_3 25000/65000
    LQ80 Test_3 30000/65000
    LQ80 Test_3 35000/65000
    LQ80 Test_3 40000/65000
    LQ80 Test_3 45000/65000
    LQ80 Test_3 50000/65000
    LQ80 Test_3 55000/65000
    LQ80 Test_3 60000/65000

Loading ../data/old/MJD_Test_4.hdf5
  Waveforms: 65000
    LQ80 Test_4 0/65000
    LQ80 Test_4 5000/65000
    LQ80 Test_4 10000/65000
    LQ80 Test_4 15000/65000
    LQ80 Test_4 20000/65000
    LQ80 Test_4 25000/65000
    LQ80 Test_4 30000/65000
    LQ80 Test_4 35000/65000
    LQ80 Test_4 40000/65000
    LQ80 Test_4 45000/65000
    LQ80 Test_4 50000/65000
    LQ80 Test_4 55000/65000
    LQ80 Test_4 60000/65000

Loading ../data/old/MJD_Test_

KeyboardInterrupt: 