In [1]:
import sys
import os
from scipy.optimize import curve_fit

# Get the project root (one level above notebooks/).
# NOTE(review): ".." is resolved against the kernel's current working
# directory, so this assumes the notebook is run from inside notebooks/.
project_root = os.path.abspath("..")

# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Added to Python path:", project_root)

import h5py
import numpy as np
import pandas as pd
# from src.parameters.tail_features import compute_LQ80


Added to Python path: c:\Users\YooNi\OneDrive\Desktop\Majorana-Neutrino-Hunt


In [2]:
# train_file = "../data/old/MJD_Train_0.hdf5"

# with h5py.File(train_file, "r") as f:
#     waveforms = np.array(f["raw_waveform"])
#     ids = np.array(f["id"])

# print("Loaded", len(waveforms), "waveforms")

In [3]:
def compute_PPR(waveform, n_plateau=300):
    """
    Peak Plateau Ratio (PPR)

    Measures how much the waveform "flattens" toward the end of the trace.
    Defined as:

        PPR = (mean of last n_plateau samples) / (peak height)

    where the peak height is the maximum sample of the waveform.

    Parameters
    ----------
    waveform : array-like
        The raw waveform. It is converted to a float array before use.
        NOTE(review): the tail slice assumes a 1-D trace -- confirm upstream.
    n_plateau : int
        Number of samples to use at the end for averaging the plateau.
        Must be at least 1. If it exceeds the waveform length, the whole
        waveform is averaged.

    Returns
    -------
    float
        The Peak Plateau Ratio. NaN if the waveform is empty or if its
        peak is zero, negative, or NaN.

    Raises
    ------
    ValueError
        If n_plateau is smaller than 1.
    """
    # y[-0:] is the same as y[0:], so n_plateau == 0 would silently average the
    # entire trace, and a negative value would drop leading samples instead of
    # selecting the tail. Reject both rather than return a misleading number.
    if n_plateau < 1:
        raise ValueError("n_plateau must be a positive integer")

    y = np.asarray(waveform, dtype=float)

    # np.max raises on an empty array; an empty trace has no defined ratio.
    if y.size == 0:
        return np.nan

    peak_val = float(np.max(y))
    # "not > 0" also catches a NaN peak, which "<= 0" would let through.
    if not peak_val > 0:
        return np.nan  # avoid division by zero or negative peak

    # Average of last N samples (plateau region)
    plateau = float(np.mean(y[-n_plateau:]))

    return plateau / peak_val

In [4]:
# # compute PPR for everything
# PPR_values = []

# # for i, wf in enumerate(waveforms):
# #     val = compute_PPR(wf)
# #     PPR_values.append(val)
# for i, wf in enumerate(waveforms):
#     if i % 5000 == 0:
#         print(f"Processing {i} / {len(waveforms)}")
#     PPR_values.append(compute_PPR(wf))


# PPR_values = np.array(PPR_values, dtype=float)
# PPR_values[~np.isfinite(PPR_values)] = np.nan

In [5]:
# # output
# formatted_ids = [f"{id_}_train_0" for id_ in ids]

# df = pd.DataFrame({
#     "id": formatted_ids,
#     "PPR": PPR_values
# })

In [6]:
# output_path = "PPR_train_0.csv"
# df.to_csv(output_path, index=False)
# print("Saved to", output_path)
# print(df.head())

In [7]:
# print(df["PPR"].describe())
# print("NaNs:", df["PPR"].isna().sum())

In [8]:
# import matplotlib.pyplot as plt
# import numpy as np

# vals = df["PPR"].to_numpy(dtype=float)
# vals = vals[np.isfinite(vals)]

# plt.figure()
# plt.hist(vals, bins=200)
# plt.yscale("log")
# plt.xlabel("PPR")
# plt.ylabel("Count")
# plt.title("PPR Distribution (Train 0)")
# plt.show()


In [9]:
# Destination folder (relative to the working directory) for the CSV files
# produced by this notebook. It is created if missing; an already existing
# folder is left as it is.
OUTPUT_DIR = "finalcsveunice"
os.makedirs(name=OUTPUT_DIR, exist_ok=True)


In [10]:

# Compute PPR for every training shard and gather the results in one table.
all_ids = []
all_PPR = []

for train_idx in range(16):
    train_file = f"../data/old/MJD_Train_{train_idx}.hdf5"

    if os.path.exists(train_file):
        # Pull both datasets fully into memory so the file handle can be
        # released before the per-waveform loop starts.
        with h5py.File(train_file, "r") as f:
            waveforms = np.array(f["raw_waveform"])
            ids = np.array(f["id"])

        print(f"Loaded Train_{train_idx}: {len(waveforms)} waveforms")

        for i, wf in enumerate(waveforms):
            # Emit a progress line once every 5000 waveforms.
            if not i % 5000:
                print(f"  Train_{train_idx} processing {i}/{len(waveforms)}")

            all_PPR.append(compute_PPR(wf))
            # Suffix the per-file id with its shard so ids stay distinguishable
            # across files.
            all_ids.append(f"{ids[i]}_train_{train_idx}")
    else:
        print(f"Skipping missing file: {train_file}")

# Normalise every non-finite entry (inf, -inf, NaN) to NaN.
all_PPR = np.array(all_PPR, dtype=float)
all_PPR = np.where(np.isfinite(all_PPR), all_PPR, np.nan)

df_ppr = pd.DataFrame({"id": all_ids, "PPR": all_PPR})

output_path_ppr = os.path.join(OUTPUT_DIR, "PPR_train_all.csv")
df_ppr.to_csv(output_path_ppr, index=False)

# Quick sanity summary of what was written.
print("\nSaved combined PPR CSV to:", output_path_ppr)
print(df_ppr.head())
print(df_ppr["PPR"].describe())
print("NaNs:", df_ppr["PPR"].isna().sum())


Loaded Train_0: 65000 waveforms
  Train_0 processing 0/65000
  Train_0 processing 5000/65000
  Train_0 processing 10000/65000
  Train_0 processing 15000/65000
  Train_0 processing 20000/65000
  Train_0 processing 25000/65000
  Train_0 processing 30000/65000
  Train_0 processing 35000/65000
  Train_0 processing 40000/65000
  Train_0 processing 45000/65000
  Train_0 processing 50000/65000
  Train_0 processing 55000/65000
  Train_0 processing 60000/65000
Loaded Train_1: 65000 waveforms
  Train_1 processing 0/65000
  Train_1 processing 5000/65000
  Train_1 processing 10000/65000
  Train_1 processing 15000/65000
  Train_1 processing 20000/65000
  Train_1 processing 25000/65000
  Train_1 processing 30000/65000
  Train_1 processing 35000/65000
  Train_1 processing 40000/65000
  Train_1 processing 45000/65000
  Train_1 processing 50000/65000
  Train_1 processing 55000/65000
  Train_1 processing 60000/65000
Loaded Train_2: 65000 waveforms
  Train_2 processing 0/65000
  Train_2 processing 5000/6

In [11]:
# PPR test
# Same procedure as for the training shards, applied to the test shards.
all_test_ids_ppr = []
all_PPR_test = []

for test_idx in range(6):
    test_file = f"../data/old/MJD_Test_{test_idx}.hdf5"

    if os.path.exists(test_file):
        # Read both datasets into memory, then close the file before looping.
        with h5py.File(test_file, "r") as f:
            waveforms_test = np.array(f["raw_waveform"])
            ids_test = np.array(f["id"])

        print(f"Loaded Test_{test_idx}: {len(waveforms_test)} waveforms")

        for i, wf in enumerate(waveforms_test):
            # Emit a progress line once every 5000 waveforms.
            if not i % 5000:
                print(f"  Test_{test_idx} processing {i}/{len(waveforms_test)}")

            all_PPR_test.append(compute_PPR(wf))
            # Tag each id with the shard it was read from.
            all_test_ids_ppr.append(f"{ids_test[i]}_test_{test_idx}")
    else:
        print(f"Skipping missing file: {test_file}")

# Normalise every non-finite entry (inf, -inf, NaN) to NaN.
all_PPR_test = np.array(all_PPR_test, dtype=float)
all_PPR_test = np.where(np.isfinite(all_PPR_test), all_PPR_test, np.nan)

df_ppr_test = pd.DataFrame({"id": all_test_ids_ppr, "PPR": all_PPR_test})

output_path_ppr_test = os.path.join(OUTPUT_DIR, "PPR_test_all.csv")
df_ppr_test.to_csv(output_path_ppr_test, index=False)

# Quick sanity summary of what was written.
print("\nSaved combined PPR TEST CSV to:", output_path_ppr_test)
print(df_ppr_test.head())
print(df_ppr_test["PPR"].describe())
print("NaNs:", df_ppr_test["PPR"].isna().sum())


Loaded Test_0: 65000 waveforms
  Test_0 processing 0/65000
  Test_0 processing 5000/65000
  Test_0 processing 10000/65000
  Test_0 processing 15000/65000
  Test_0 processing 20000/65000
  Test_0 processing 25000/65000
  Test_0 processing 30000/65000
  Test_0 processing 35000/65000
  Test_0 processing 40000/65000
  Test_0 processing 45000/65000
  Test_0 processing 50000/65000
  Test_0 processing 55000/65000
  Test_0 processing 60000/65000
Loaded Test_1: 65000 waveforms
  Test_1 processing 0/65000
  Test_1 processing 5000/65000
  Test_1 processing 10000/65000
  Test_1 processing 15000/65000
  Test_1 processing 20000/65000
  Test_1 processing 25000/65000
  Test_1 processing 30000/65000
  Test_1 processing 35000/65000
  Test_1 processing 40000/65000
  Test_1 processing 45000/65000
  Test_1 processing 50000/65000
  Test_1 processing 55000/65000
  Test_1 processing 60000/65000
Loaded Test_2: 65000 waveforms
  Test_2 processing 0/65000
  Test_2 processing 5000/65000
  Test_2 processing 10000/