In [9]:
# src/experiments/extract_my_params.py

import os
import glob
import h5py
import numpy as np
import pandas as pd


# ---------------------------------------------------------------------------
#  Parameter extraction helpers
#  (If you already implemented these in src/parameters, you can replace these
#   definitions with imports from those modules.)
# ---------------------------------------------------------------------------

def compute_tdrift99_single(wf, tp0, frac=0.999):
    """
    TDrift99 in *samples*: from tp0 to first time the waveform reaches
    `frac` of its maximum after tp0.
    """
    n = len(wf)
    start = int(tp0)
    if start >= n - 1:
        return np.nan

    segment = wf[start:]
    peak_val = segment.max()
    if peak_val <= 0:
        return np.nan

    threshold = frac * peak_val
    above = np.where(segment >= threshold)[0]
    if len(above) == 0:
        return np.nan

    return float(above[0])  # in samples


def pole_zero_correct(wf, tau_samples=500.0):
    """
    Simple pole–zero correction (high-pass IIR). This doesn't have to be
    perfect — it just needs to flatten tails reasonably for TFR.
    """
    alpha = np.exp(-1.0 / tau_samples)
    y = np.zeros_like(wf, dtype=np.float64)
    prev_x = wf[0]
    prev_y = 0.0
    for i in range(1, len(wf)):
        x = float(wf[i])
        y_i = x - prev_x + alpha * prev_y
        y[i] = y_i
        prev_x = x
        prev_y = y_i
    return y


def compute_tfr_single(wf, tp0, tail_offset=200, tail_len=600):
    """
    Tail Flattening Ratio (TFR) = std(tail_raw) / std(tail_pz)
    tail is taken starting at tp0 + tail_offset, for length tail_len (or to end).
    """
    n = len(wf)
    start = int(tp0) + tail_offset
    if start >= n - 10:  # not enough tail
        return np.nan

    end = min(n, start + tail_len)
    tail_raw = wf[start:end].astype(np.float64)

    wf_pz = pole_zero_correct(wf)
    tail_pz = wf_pz[start:end]

    std_raw = np.std(tail_raw)
    std_pz = np.std(tail_pz)

    if std_pz <= 0:
        return np.nan
    return float(std_raw / std_pz)


def smooth_gaussian(x, sigma=2.0):
    """Simple 1D Gaussian smoothing using convolution."""
    if sigma <= 0:
        return x.astype(np.float64)
    radius = int(3 * sigma)
    idx = np.arange(-radius, radius + 1, dtype=np.float64)
    kernel = np.exp(-0.5 * (idx / sigma) ** 2)
    kernel /= kernel.sum()
    padded = np.pad(x, radius, mode="edge")
    conv = np.convolve(padded, kernel, mode="same")
    return conv[radius:-radius]


def compute_peak_count_single(
    wf,
    tp0,
    window_after_tp0=400,
    grad_threshold_frac=0.05,
    min_separation=5,
):
    """
    Peak Count on the gradient:
    - baseline-subtract and normalize waveform
    - smooth
    - compute gradient
    - count local maxima above grad_threshold
    """
    n = len(wf)
    tp0 = int(tp0)

    # Baseline: first 200 samples (guard against short)
    base_end = min(200, n)
    baseline = float(np.mean(wf[:base_end]))
    wf_bs = wf - baseline

    # Normalize by global max to make threshold comparable across events
    max_val = np.max(np.abs(wf_bs))
    if max_val <= 0:
        return 0
    wf_norm = wf_bs / max_val

    # Only look near and after tp0
    start = max(tp0 - 10, 0)
    end = min(tp0 + window_after_tp0, n)
    segment = wf_norm[start:end]

    # Smooth and gradient
    seg_smooth = smooth_gaussian(segment, sigma=2.0)
    grad = np.gradient(seg_smooth)

    # Threshold relative to max gradient
    gmax = np.max(np.abs(grad))
    if gmax <= 0:
        return 0
    threshold = grad_threshold_frac * gmax

    # Count local maxima above threshold with minimum separation
    count = 0
    last_peak_idx = -min_separation - 1
    for i in range(1, len(grad) - 1):
        if grad[i] > grad[i - 1] and grad[i] > grad[i + 1] and grad[i] >= threshold:
            if i - last_peak_idx >= min_separation:
                count += 1
                last_peak_idx = i

    return int(count)


def compute_gradient_baseline_noise_single(wf, baseline_region=(0, 200)):
    """
    Gradient Baseline Noise = RMS of gradient in a pre-rise baseline window.
    """
    start, end = baseline_region
    start = max(start, 0)
    end = min(end, len(wf))
    if end - start < 5:
        return np.nan

    segment = wf[start:end].astype(np.float64)
    grad = np.gradient(segment)
    return float(np.sqrt(np.mean(grad ** 2)))


def compute_band_power_ratio_single(
    wf,
    fs=100e6,
    low_band=(0.1e6, 1e6),
    high_band=(1e6, 10e6),
):
    """
    Band Power Ratio (BPR) = power_high / power_low using FFT of the waveform.
    """
    x = wf.astype(np.float64)
    x = x - np.mean(x)

    # Real FFT
    fft_vals = np.fft.rfft(x)
    psd = np.abs(fft_vals) ** 2
    freqs = np.fft.rfftfreq(len(x), d=1.0 / fs)

    low_mask = (freqs >= low_band[0]) & (freqs < low_band[1])
    high_mask = (freqs >= high_band[0]) & (freqs < high_band[1])

    power_low = psd[low_mask].sum()
    power_high = psd[high_mask].sum()

    if power_low <= 0:
        return np.nan
    return float(power_high / power_low)


# ---------------------------------------------------------------------------
#  Per-file processing
# ---------------------------------------------------------------------------

def process_hdf5_file(h5_path, out_dir):
    """
    Read one HDF5 file, compute all 5 parameters, and write a CSV with:
      id, file, tdrift99, tfr, peak_count, gbn, bpr
    """
    print(f"Processing {h5_path}...")
    basename = os.path.basename(h5_path)

    with h5py.File(h5_path, "r") as f:
        waveforms = f["raw_waveform"][:]          # shape: (N, 3800)
        tp0 = f["tp0"][:]                         # shape: (N,)
        ids = f["id"][:]                          # shape: (N,)

    n_events = waveforms.shape[0]
    print(f"  Found {n_events} waveforms.")

    tdrift_list = []
    tfr_list = []
    peak_count_list = []
    gbn_list = []
    bpr_list = []

    for i in range(n_events):
        wf = waveforms[i]

        t0 = tp0[i]
        tdrift_list.append(compute_tdrift99_single(wf, t0))
        tfr_list.append(compute_tfr_single(wf, t0))
        peak_count_list.append(compute_peak_count_single(wf, t0))
        gbn_list.append(compute_gradient_baseline_noise_single(wf))
        bpr_list.append(compute_band_power_ratio_single(wf))

        if (i + 1) % 5000 == 0:
            print(f"    Processed {i + 1}/{n_events} events...")

    df = pd.DataFrame(
        {
            "id": ids,
            "file": basename,
            "tdrift99": tdrift_list,
            "tfr": tfr_list,
            "peak_count": peak_count_list,
            "gbn": gbn_list,
            "bpr": bpr_list,
        }
    )

    os.makedirs(out_dir, exist_ok=True)
    out_name = os.path.splitext(basename)[0] + "_myparams.csv"
    out_path = os.path.join(out_dir, out_name)
    df.to_csv(out_path, index=False)
    print(f"  Saved CSV to {out_path}\n")


# ---------------------------------------------------------------------------
#  Main: loop over all train/test files
# ---------------------------------------------------------------------------

def main():
    # Notebook-safe data directory
    DATA_DIR = os.path.abspath("data")

    TRAIN_PATTERN = os.path.join(DATA_DIR, "MJD_Train*.hdf5")
    TEST_PATTERN = os.path.join(DATA_DIR, "MJD_Test*.hdf5")

    OUT_DIR_TRAIN = os.path.join(DATA_DIR, "params_train")
    OUT_DIR_TEST = os.path.join(DATA_DIR, "params_test")

    train_files = sorted(glob.glob(TRAIN_PATTERN))
    test_files = sorted(glob.glob(TEST_PATTERN))

    print("Train files:", train_files)
    print("Test files:", test_files)

    for path in train_files:
        process_hdf5_file(path, OUT_DIR_TRAIN)

    for path in test_files:
        process_hdf5_file(path, OUT_DIR_TEST)
    process_hdf5_file("../../data/MJD_Train_15.hdf5", "../../data/params_train")



if __name__ == "__main__":
    main()


Train files: []
Test files: []
Processing ../../data/MJD_Train_15.hdf5...
  Found 65000 waveforms.
    Processed 5000/65000 events...
    Processed 10000/65000 events...
    Processed 15000/65000 events...
    Processed 20000/65000 events...
    Processed 25000/65000 events...
    Processed 30000/65000 events...
    Processed 35000/65000 events...
    Processed 40000/65000 events...
    Processed 45000/65000 events...
    Processed 50000/65000 events...
    Processed 55000/65000 events...
    Processed 60000/65000 events...
    Processed 65000/65000 events...
  Saved CSV to ../../data/params_train/MJD_Train_15_myparams.csv



In [8]:
# src/experiments/extract_my_params.py
import os, glob, h5py
import numpy as np
import pandas as pd

def compute_tdrift_single(wf, tp0, frac):
    n = len(wf)
    start = int(tp0)
    if start >= n - 1:
        return np.nan
    seg = wf[start:]
    peak = seg.max()
    if peak <= 0:
        return np.nan
    thr = frac * peak
    idx = np.where(seg >= thr)[0]
    return float(idx[0]) if len(idx) else np.nan  # samples after tp0

def pole_zero_correct(wf, tau_samples=500.0):
    a = np.exp(-1.0 / tau_samples)
    y = np.zeros_like(wf, dtype=np.float64)
    px, py = float(wf[0]), 0.0
    for i in range(1, len(wf)):
        x = float(wf[i])
        py = x - px + a * py
        y[i] = py
        px = x
    return y

def compute_tfr_single(wf, tp0, tail_offset=200, tail_len=600):
    n = len(wf)
    start = int(tp0) + tail_offset
    if start >= n - 10:
        return np.nan
    end = min(n, start + tail_len)
    tail_raw = wf[start:end].astype(np.float64)
    tail_pz = pole_zero_correct(wf)[start:end]
    sr, sp = np.std(tail_raw), np.std(tail_pz)
    return np.nan if sp <= 0 else float(sr / sp)

def smooth_gaussian(x, sigma=2.0):
    if sigma <= 0:
        return x.astype(np.float64)
    r = int(3 * sigma)
    kx = np.arange(-r, r + 1, dtype=np.float64)
    k = np.exp(-0.5 * (kx / sigma) ** 2); k /= k.sum()
    p = np.pad(x, r, mode="edge")
    return np.convolve(p, k, mode="same")[r:-r]

def compute_peak_count_single(wf, tp0, window_after_tp0=400, grad_thr_frac=0.05, min_sep=5):
    n = len(wf); tp0 = int(tp0)
    b = float(np.mean(wf[:min(200, n)]))
    x = wf - b
    m = np.max(np.abs(x))
    if m <= 0: return 0
    x = x / m
    s, e = max(tp0 - 10, 0), min(tp0 + window_after_tp0, n)
    seg = smooth_gaussian(x[s:e], sigma=2.0)
    g = np.gradient(seg)
    gmax = np.max(np.abs(g))
    if gmax <= 0: return 0
    thr = grad_thr_frac * gmax
    count, last = 0, -min_sep - 1
    for i in range(1, len(g) - 1):
        if g[i] > g[i-1] and g[i] > g[i+1] and g[i] >= thr and (i - last) >= min_sep:
            count += 1; last = i
    return int(count)

def compute_gbn_single(wf, baseline=(0, 200)):
    s, e = max(baseline[0], 0), min(baseline[1], len(wf))
    if e - s < 5: return np.nan
    g = np.gradient(wf[s:e].astype(np.float64))
    return float(np.sqrt(np.mean(g ** 2)))

def compute_bpr_single(wf, fs=100e6, low=(0.1e6, 1e6), high=(1e6, 10e6)):
    x = wf.astype(np.float64) - np.mean(wf)
    F = np.fft.rfft(x)
    psd = np.abs(F) ** 2
    f = np.fft.rfftfreq(len(x), d=1.0 / fs)
    pl = psd[(f >= low[0]) & (f < low[1])].sum()
    ph = psd[(f >= high[0]) & (f < high[1])].sum()
    return np.nan if pl <= 0 else float(ph / pl)

def process_hdf5_file(h5_path, out_dir):
    basename = os.path.basename(h5_path)
    print("Processing", basename)

    with h5py.File(h5_path, "r") as f:
        wfs = f["raw_waveform"][:]
        tp0 = f["tp0"][:]
        ids = f["id"][:]

    t10, t50, t99, tfr, pc, gbn, bpr = ([] for _ in range(7))

    for i in range(len(wfs)):
        wf = wfs[i]; t0 = tp0[i]
        t10.append(compute_tdrift_single(wf, t0, 0.10))
        t50.append(compute_tdrift_single(wf, t0, 0.50))
        t99.append(compute_tdrift_single(wf, t0, 0.999))
        tfr.append(compute_tfr_single(wf, t0))
        pc.append(compute_peak_count_single(wf, t0))
        gbn.append(compute_gbn_single(wf))
        bpr.append(compute_bpr_single(wf))

    df = pd.DataFrame({
        "id": ids,
        "file": basename,
        "tdrift10": t10,
        "tdrift50": t50,
        "tdrift99": t99,
        "tfr": tfr,
        "peak_count": pc,
        "gbn": gbn,
        "bpr": bpr,
    })

    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, os.path.splitext(basename)[0] + "_myparams.csv")
    df.to_csv(out_path, index=False)
    print("Saved", out_path, "\n")

def main():
    REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), "../.."))
    DATA_DIR = os.path.join(REPO_ROOT, "data")

    train_files = sorted(glob.glob(os.path.join(DATA_DIR, "MJD_Train*.hdf5")))
    test_files  = sorted(glob.glob(os.path.join(DATA_DIR, "MJD_Test*.hdf5")))

    out_train = os.path.join(DATA_DIR, "params_train")
    out_test  = os.path.join(DATA_DIR, "params_test")

    for p in train_files: process_hdf5_file(p, out_train)
    for p in test_files:  process_hdf5_file(p, out_test)


if __name__ == "__main__":
    main()


Processing MJD_Train_0.hdf5
Saved /Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/params_train/MJD_Train_0_myparams.csv 

Processing MJD_Train_1.hdf5
Saved /Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/params_train/MJD_Train_1_myparams.csv 

Processing MJD_Train_10.hdf5
Saved /Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/params_train/MJD_Train_10_myparams.csv 

Processing MJD_Train_11.hdf5
Saved /Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/params_train/MJD_Train_11_myparams.csv 

Processing MJD_Train_12.hdf5
Saved /Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/params_train/MJD_Train_12_myparams.csv 

Processing MJD_Train_13.hdf5
Saved /Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/params_train/MJD_Train_13_myparams.csv 

Processing MJD_Train_14.hdf5
Saved /Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/params_train/MJD_Train_14_myparams.csv 

Pro

TimeoutError: [Errno 60] Unable to synchronously open file (file read failed: time = Fri Jan 23 17:10:14 2026
, filename = '/Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/MJD_Test_1.hdf5', file descriptor = 73, errno = 60, error message = 'Operation timed out', buf = 0x16d6bc760, total read size = 8, bytes this sub-read = 8, offset = 0)

In [None]:
import os
print("CWD =", os.getcwd())


CWD = /Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/src/experiments


In [None]:
DATA_DIR = os.path.abspath("data")


In [6]:
import os
print("CWD =", os.getcwd())
print("data exists?", os.path.exists("data"))
print("files in data:", os.listdir("data"))


CWD = /Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/src/experiments
data exists? False


FileNotFoundError: [Errno 2] No such file or directory: 'data'

In [3]:
from pathlib import Path
DATA_DIR = (Path.cwd() / "data").resolve()


In [5]:
import glob, os
print("TRAIN_PATTERN =", TRAIN_PATTERN)
print("Matches:", glob.glob(TRAIN_PATTERN))


NameError: name 'TRAIN_PATTERN' is not defined

In [7]:
from pathlib import Path
import glob

DATA_DIR = (Path.cwd() / "data").resolve()   # assumes you opened VSCode at repo root
TRAIN_PATTERN = str(DATA_DIR / "MJD_Train*.hdf5")
TEST_PATTERN  = str(DATA_DIR / "MJD_Test*.hdf5")

train_files = sorted(glob.glob(TRAIN_PATTERN))
test_files  = sorted(glob.glob(TEST_PATTERN))

print("CWD:", Path.cwd())
print("DATA_DIR:", DATA_DIR)
print("Train files found:", train_files)
print("Test files found:", test_files)


CWD: /Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/src/experiments
DATA_DIR: /Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/src/experiments/data
Train files found: []
Test files found: []


In [9]:
import h5py
p = "data/MJD_Test_1.hdf5"
with h5py.File(p, "r") as f:
    print(list(f.keys()))

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'data/MJD_Test_1.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [10]:
import h5py
p = "../../data/MJD_Test_1.hdf5"
with h5py.File(p, "r") as f:
    print(list(f.keys()))


['detector', 'energy_label', 'id', 'psd_label_dcr', 'psd_label_high_avse', 'psd_label_low_avse', 'psd_label_lq', 'raw_waveform', 'run_number', 'tp0']


In [13]:
# src/experiments/extract_my_params.py

import os
import glob
import h5py
import numpy as np
import pandas as pd


# ---------------------------------------------------------------------------
#  Parameter extraction helpers
#  (If you already implemented these in src/parameters, you can replace these
#   definitions with imports from those modules.)
# ---------------------------------------------------------------------------

def compute_tdrift99_single(wf, tp0, frac=0.999):
    """
    TDrift99 in *samples*: from tp0 to first time the waveform reaches
    `frac` of its maximum after tp0.
    """
    n = len(wf)
    start = int(tp0)
    if start >= n - 1:
        return np.nan

    segment = wf[start:]
    peak_val = segment.max()
    if peak_val <= 0:
        return np.nan

    threshold = frac * peak_val
    above = np.where(segment >= threshold)[0]
    if len(above) == 0:
        return np.nan

    return float(above[0])  # in samples


def pole_zero_correct(wf, tau_samples=500.0):
    """
    Simple pole–zero correction (high-pass IIR). This doesn't have to be
    perfect — it just needs to flatten tails reasonably for TFR.
    """
    alpha = np.exp(-1.0 / tau_samples)
    y = np.zeros_like(wf, dtype=np.float64)
    prev_x = wf[0]
    prev_y = 0.0
    for i in range(1, len(wf)):
        x = float(wf[i])
        y_i = x - prev_x + alpha * prev_y
        y[i] = y_i
        prev_x = x
        prev_y = y_i
    return y


def compute_tfr_single(wf, tp0, tail_offset=200, tail_len=600):
    """
    Tail Flattening Ratio (TFR) = std(tail_raw) / std(tail_pz)
    tail is taken starting at tp0 + tail_offset, for length tail_len (or to end).
    """
    n = len(wf)
    start = int(tp0) + tail_offset
    if start >= n - 10:  # not enough tail
        return np.nan

    end = min(n, start + tail_len)
    tail_raw = wf[start:end].astype(np.float64)

    wf_pz = pole_zero_correct(wf)
    tail_pz = wf_pz[start:end]

    std_raw = np.std(tail_raw)
    std_pz = np.std(tail_pz)

    if std_pz <= 0:
        return np.nan
    return float(std_raw / std_pz)


def smooth_gaussian(x, sigma=2.0):
    """Simple 1D Gaussian smoothing using convolution."""
    if sigma <= 0:
        return x.astype(np.float64)
    radius = int(3 * sigma)
    idx = np.arange(-radius, radius + 1, dtype=np.float64)
    kernel = np.exp(-0.5 * (idx / sigma) ** 2)
    kernel /= kernel.sum()
    padded = np.pad(x, radius, mode="edge")
    conv = np.convolve(padded, kernel, mode="same")
    return conv[radius:-radius]


def compute_peak_count_single(
    wf,
    tp0,
    window_after_tp0=400,
    grad_threshold_frac=0.05,
    min_separation=5,
):
    """
    Peak Count on the gradient:
    - baseline-subtract and normalize waveform
    - smooth
    - compute gradient
    - count local maxima above grad_threshold
    """
    n = len(wf)
    tp0 = int(tp0)

    # Baseline: first 200 samples (guard against short)
    base_end = min(200, n)
    baseline = float(np.mean(wf[:base_end]))
    wf_bs = wf - baseline

    # Normalize by global max to make threshold comparable across events
    max_val = np.max(np.abs(wf_bs))
    if max_val <= 0:
        return 0
    wf_norm = wf_bs / max_val

    # Only look near and after tp0
    start = max(tp0 - 10, 0)
    end = min(tp0 + window_after_tp0, n)
    segment = wf_norm[start:end]

    # Smooth and gradient
    seg_smooth = smooth_gaussian(segment, sigma=2.0)
    grad = np.gradient(seg_smooth)

    # Threshold relative to max gradient
    gmax = np.max(np.abs(grad))
    if gmax <= 0:
        return 0
    threshold = grad_threshold_frac * gmax

    # Count local maxima above threshold with minimum separation
    count = 0
    last_peak_idx = -min_separation - 1
    for i in range(1, len(grad) - 1):
        if grad[i] > grad[i - 1] and grad[i] > grad[i + 1] and grad[i] >= threshold:
            if i - last_peak_idx >= min_separation:
                count += 1
                last_peak_idx = i

    return int(count)


def compute_gradient_baseline_noise_single(wf, baseline_region=(0, 200)):
    """
    Gradient Baseline Noise = RMS of gradient in a pre-rise baseline window.
    """
    start, end = baseline_region
    start = max(start, 0)
    end = min(end, len(wf))
    if end - start < 5:
        return np.nan

    segment = wf[start:end].astype(np.float64)
    grad = np.gradient(segment)
    return float(np.sqrt(np.mean(grad ** 2)))


def compute_band_power_ratio_single(
    wf,
    fs=100e6,
    low_band=(0.1e6, 1e6),
    high_band=(1e6, 10e6),
):
    """
    Band Power Ratio (BPR) = power_high / power_low using FFT of the waveform.
    """
    x = wf.astype(np.float64)
    x = x - np.mean(x)

    # Real FFT
    fft_vals = np.fft.rfft(x)
    psd = np.abs(fft_vals) ** 2
    freqs = np.fft.rfftfreq(len(x), d=1.0 / fs)

    low_mask = (freqs >= low_band[0]) & (freqs < low_band[1])
    high_mask = (freqs >= high_band[0]) & (freqs < high_band[1])

    power_low = psd[low_mask].sum()
    power_high = psd[high_mask].sum()

    if power_low <= 0:
        return np.nan
    return float(power_high / power_low)


# ---------------------------------------------------------------------------
#  Per-file processing
# ---------------------------------------------------------------------------

def process_hdf5_file(h5_path, out_dir):
    """
    Read one HDF5 file, compute all parameters, and write a CSV with:
      id, file, tdrift10, tdrift50, tdrift99, tfr, peak_count, gbn, bpr
    """
    print(f"Processing {h5_path}...")
    basename = os.path.basename(h5_path)

    with h5py.File(h5_path, "r") as f:
        waveforms = f["raw_waveform"][:]          # shape: (N, 3800)
        tp0 = f["tp0"][:]                         # shape: (N,)
        ids = f["id"][:]                          # shape: (N,)

    n_events = waveforms.shape[0]
    print(f"  Found {n_events} waveforms.")

    # --- ADDED: tdrift10 + tdrift50 lists (original lists preserved) ---
    tdrift10_list = []
    tdrift50_list = []

    tdrift_list = []
    tfr_list = []
    peak_count_list = []
    gbn_list = []
    bpr_list = []

    for i in range(n_events):
        wf = waveforms[i]
        t0 = tp0[i]

        # --- ADDED: compute tdrift10 + tdrift50 using your existing function ---
        tdrift10_list.append(compute_tdrift99_single(wf, t0, frac=0.10))
        tdrift50_list.append(compute_tdrift99_single(wf, t0, frac=0.50))

        # existing tdrift99 behavior preserved
        tdrift_list.append(compute_tdrift99_single(wf, t0))

        tfr_list.append(compute_tfr_single(wf, t0))
        peak_count_list.append(compute_peak_count_single(wf, t0))
        gbn_list.append(compute_gradient_baseline_noise_single(wf))
        bpr_list.append(compute_band_power_ratio_single(wf))

        if (i + 1) % 5000 == 0:
            print(f"    Processed {i + 1}/{n_events} events...")

    # --- ADDED: columns tdrift10 + tdrift50 (everything else unchanged) ---
    df = pd.DataFrame(
        {
            "id": ids,
            "file": basename,
            "tdrift10": tdrift10_list,
            "tdrift50": tdrift50_list,
            "tdrift99": tdrift_list,
            "tfr": tfr_list,
            "peak_count": peak_count_list,
            "gbn": gbn_list,
            "bpr": bpr_list,
        }
    )

    os.makedirs(out_dir, exist_ok=True)
    out_name = os.path.splitext(basename)[0] + "_myparams.csv"
    out_path = os.path.join(out_dir, out_name)
    df.to_csv(out_path, index=False)
    print(f"  Saved CSV to {out_path}\n")


# ---------------------------------------------------------------------------
#  Main: loop over all train/test files
# ---------------------------------------------------------------------------

def main():
    # Notebook-safe data directory
    DATA_DIR = os.path.abspath("../../data")

    TRAIN_PATTERN = os.path.join(DATA_DIR, "MJD_Train*.hdf5")
    TEST_PATTERN = os.path.join(DATA_DIR, "MJD_Test*.hdf5")

    OUT_DIR_TRAIN = os.path.join(DATA_DIR, "params_train")
    OUT_DIR_TEST = os.path.join(DATA_DIR, "params_test")

    train_files = sorted(glob.glob(TRAIN_PATTERN))
    test_files = sorted(glob.glob(TEST_PATTERN))

    print("Train files:", train_files)
    print("Test files:", test_files)

    for path in train_files:
        process_hdf5_file(path, OUT_DIR_TRAIN)

    for path in test_files:
        process_hdf5_file(path, OUT_DIR_TEST)

    process_hdf5_file("../../data/MJD_Train_15.hdf5", "../../data/params_train")


if __name__ == "__main__":
    main()


Train files: ['/Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/MJD_Train_0.hdf5', '/Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/MJD_Train_1.hdf5', '/Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/MJD_Train_10.hdf5', '/Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/MJD_Train_11.hdf5', '/Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/MJD_Train_12.hdf5', '/Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/MJD_Train_13.hdf5', '/Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/MJD_Train_14.hdf5', '/Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/MJD_Train_15.hdf5', '/Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/MJD_Train_2.hdf5', '/Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/MJD_Train_3.hdf5', '/Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/MJD_Train_4.hdf5', '/Users/prithvikochhar/

In [12]:
import os
print("CWD =", os.getcwd())

CWD = /Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/src/experiments


In [14]:
from pathlib import Path
import pandas as pd
import re

TRAIN_DIR = Path("../../data/params_train")
TEST_DIR  = Path("../../data/params_test")

OUT_DIR = Path("../../extracted_features_csv_files/prithvi_csv_files")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def file_no_from_name(p: Path) -> int:
    m = re.search(r"_(\d+)", p.stem)
    return int(m.group(1)) if m else 0

def merge_split(split_dir: Path, split_label: str, out_path: Path):
    files = sorted(split_dir.glob("*.csv"), key=file_no_from_name)
    if not files:
        raise FileNotFoundError(f"No CSV files found in {split_dir.resolve()}")

    frames = []
    for f in files:
        fileno = file_no_from_name(f)
        df = pd.read_csv(f)

        if "id" not in df.columns:
            raise ValueError(f"{f.name} missing 'id' column")

        df["id"] = df["id"].astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
        df["id"] = df["id"] + f"_{split_label}_{fileno}"

        for col in ["file", "filename"]:
            if col in df.columns:
                df = df.drop(columns=[col])

        frames.append(df)

    pd.concat(frames, ignore_index=True).to_csv(out_path, index=False)
    print("Saved:", out_path)

merge_split(TRAIN_DIR, "train", OUT_DIR / "train_2.csv")
merge_split(TEST_DIR,  "test",  OUT_DIR / "test_2.csv")


TimeoutError: [Errno 60] Operation timed out

In [15]:
from pathlib import Path
import pandas as pd
import re

# --- CHANGE THESE PATHS TO WHERE YOUR PER-FILE CSVs ACTUALLY LIVE ---
TRAIN_DIR = Path("../../data/params_train")
TEST_DIR  = Path("../../data/params_test")

OUT_DIR = Path("../../extracted_features_csv_files/prithvi_csv_files")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def file_no_from_name(path: Path) -> int:
    # works for names like: MJD_Train_0_myparams.csv OR Train_0.csv OR *_0_*.csv
    m = re.search(r"_(\d+)", path.stem)
    return int(m.group(1)) if m else 0

def merge_split(split_dir: Path, split_label: str, out_path: Path):
    files = sorted(split_dir.glob("*.csv"), key=file_no_from_name)
    if not files:
        raise FileNotFoundError(f"No CSV files found in {split_dir.resolve()}")

    frames = []

    for f in files:
        fileno = file_no_from_name(f)
        df = pd.read_csv(f)

        # --- Build new id using existing df['id'] ---
        if "id" not in df.columns:
            raise ValueError(
                f"File {f.name} is missing an 'id' column. "
                "If some files don't have id, tell me and I'll add a fallback."
            )

        # Make sure id is clean + string
        original_id = df["id"].astype(str).str.strip()

        # Optional: if some ids look like '123.0' because of CSV typing, normalize them
        # (keeps '123' instead of '123.0' when it's really an integer)
        original_id = original_id.str.replace(r"\.0$", "", regex=True)

        # New id format: <original_id>_<train|test>_<fileno>
        df["id"] = original_id + f"_{split_label}_{fileno}"

        # Remove other per-file identifiers if they exist (but keep the new 'id')
        for col in ["file", "filename"]:
            if col in df.columns:
                df = df.drop(columns=[col])

        frames.append(df)

    out_df = pd.concat(frames, ignore_index=True)
    out_df.to_csv(out_path, index=False)
    print(f"Saved {split_label} -> {out_path} | rows={len(out_df)} cols={len(out_df.columns)}")

merge_split(TRAIN_DIR, "train", OUT_DIR / "train_2.csv")
merge_split(TEST_DIR,  "test",  OUT_DIR / "test_2.csv")


TimeoutError: [Errno 60] Operation timed out

In [17]:
from pathlib import Path
import pandas as pd
import re

TRAIN_DIR = Path("../../data/params_train")
TEST_DIR  = Path("../../data/params_test")
OUT_DIR   = Path("../../extracted_features_csv_files/prithvi_csv_files")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def file_no_from_name(p: Path) -> int:
    m = re.search(r"_(\d+)", p.stem)  # grabs _0, _1, etc.
    return int(m.group(1)) if m else 0

def merge_split_chunked(split_dir: Path, split_label: str, out_path: Path, chunksize: int = 50_000):
    files = sorted(split_dir.glob("*.csv"), key=file_no_from_name)
    if not files:
        raise FileNotFoundError(f"No CSV files found in {split_dir.resolve()}")

    first_write = True
    for f in files:
        fileno = file_no_from_name(f)

        for chunk in pd.read_csv(f, chunksize=chunksize):
            if "id" not in chunk.columns:
                raise ValueError(f"{f.name} missing 'id' column")

            # build id: <original_id>_<train|test>_<fileno>
            chunk["id"] = (
                chunk["id"].astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
                + f"_{split_label}_{fileno}"
            )

            # drop per-file identifiers if present
            for col in ["file", "filename"]:
                if col in chunk.columns:
                    chunk = chunk.drop(columns=[col])

            chunk.to_csv(out_path, index=False, mode="w" if first_write else "a", header=first_write)
            first_write = False

    print("Saved:", out_path)

merge_split_chunked(TRAIN_DIR, "train", OUT_DIR / "train_2.csv")
merge_split_chunked(TEST_DIR,  "test",  OUT_DIR / "test_2.csv")


Saved: ../../extracted_features_csv_files/prithvi_csv_files/train_2.csv


TimeoutError: [Errno 60] Operation timed out

In [18]:
from pathlib import Path
import pandas as pd
import re

TEST_DIR = Path("../../data/params_test")

def file_no_from_name(p: Path) -> int:
    m = re.search(r"_(\d+)", p.stem)
    return int(m.group(1)) if m else 0

files = sorted(TEST_DIR.glob("*.csv"), key=file_no_from_name)

for f in files:
    try:
        pd.read_csv(f, nrows=5)
        print("OK:", f.name)
    except Exception as e:
        print("FAIL:", f.name, "|", type(e).__name__, e)
        break


OK: MJD_Test_0_myparams.csv
OK: MJD_Test_1_myparams.csv
OK: MJD_Test_2_myparams.csv
OK: MJD_Test_3_myparams.csv
OK: MJD_Test_4_myparams.csv
OK: MJD_Test_5_myparams.csv


In [19]:
from pathlib import Path
import shutil

SRC = Path("../../data/params_test")
DST = Path("../../data/params_test_local")
DST.mkdir(parents=True, exist_ok=True)

for f in SRC.glob("*.csv"):
    shutil.copy2(f, DST / f.name)

print("Copied to:", DST.resolve())


Copied to: /Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/data/params_test_local


In [20]:
from pathlib import Path
import pandas as pd
import re

TEST_DIR = Path("../../data/params_test_local")
OUT_DIR  = Path("../../extracted_features_csv_files/prithvi_csv_files")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def file_no_from_name(p: Path) -> int:
    m = re.search(r"_(\d+)", p.stem)
    return int(m.group(1)) if m else 0

out_path = OUT_DIR / "test_2.csv"
first = True

for f in sorted(TEST_DIR.glob("*.csv"), key=file_no_from_name):
    fileno = file_no_from_name(f)
    for chunk in pd.read_csv(f, chunksize=20_000):  # smaller = safer
        chunk["id"] = chunk["id"].astype(str).str.strip().str.replace(r"\.0$", "", regex=True) + f"_test_{fileno}"
        for col in ["file", "filename"]:
            if col in chunk.columns:
                chunk = chunk.drop(columns=[col])
        chunk.to_csv(out_path, index=False, mode="w" if first else "a", header=first)
        first = False

print("Saved:", out_path)


Saved: ../../extracted_features_csv_files/prithvi_csv_files/test_2.csv
