In [24]:
import sys
import os
from scipy.optimize import curve_fit

# Get the project root (one level above notebooks/).
# Note: ".." is resolved against the current working directory.
project_root = os.path.abspath("..")
# Guard the append so re-running this cell does not pile up duplicate
# entries on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Added to Python path:", project_root)

import h5py
import numpy as np
import pandas as pd
# from src.parameters.tail_features import compute_LQ80


Added to Python path: c:\Users\YooNi\OneDrive\Desktop\Majorana-Neutrino-Hunt


In [25]:
# train_file = "../data/old/MJD_Train_0.hdf5"

# with h5py.File(train_file, "r") as f:
#     waveforms = np.array(f["raw_waveform"])
#     ids = np.array(f["id"])

# print("Loaded", len(waveforms), "waveforms")

In [26]:
def estimate_baseline(y, n_samples=200):
    """
    Estimate the baseline from the leading part of a waveform.

    The first `n_samples` entries of `y` are converted to floats and
    their mean and (population) standard deviation are returned as a
    `(mean, std)` tuple of Python floats.
    """
    head = np.asarray(y, dtype=float)[:n_samples]
    baseline_mean = float(head.mean())
    baseline_std = float(head.std())
    return baseline_mean, baseline_std


In [27]:
def compute_energy_duration(waveform, threshold=0.9, n_baseline=200):
    """
    Returns the 0-based sample index at which the cumulative
    baseline-subtracted squared signal first reaches `threshold`
    fraction of its total.

    Parameters
    ----------
    waveform : array-like
        Sequence of samples; converted to a float array.
    threshold : float, default 0.9
        Fraction of the total squared signal that must be reached.
    n_baseline : int, default 200
        Number of leading samples averaged to estimate the baseline.

    Returns
    -------
    int or float
        Index of the first sample where the running sum meets the target
        (i.e. `index + 1` samples were needed). `np.nan` is returned when
        the waveform is empty, when the total is zero or non-finite, or
        when the target is never reached.
    """
    y = np.asarray(waveform, dtype=float)
    if y.size == 0:
        # Nothing to integrate; avoids NumPy's empty-mean warning.
        return np.nan

    # subtract baseline (mean of the leading samples)
    y = y - np.mean(y[:n_baseline])

    # running sum of the squared, baseline-subtracted signal
    cumulative = np.cumsum(y ** 2)

    # Take the total from the running sum itself. np.sum uses pairwise
    # summation while np.cumsum accumulates sequentially, so a separately
    # computed total can round to a value above cumulative[-1] and make a
    # threshold near 1.0 unreachable.
    total_energy = float(cumulative[-1])
    if total_energy <= 0 or not np.isfinite(total_energy):
        return np.nan

    target = threshold * total_energy

    # cumulative is non-decreasing here (finite, non-negative increments),
    # so a binary search finds the first index with cumulative >= target.
    idx = int(np.searchsorted(cumulative, target, side="left"))
    if idx >= cumulative.size:
        return np.nan

    return idx


In [28]:
# # compute Energy Duration for everything
# ED_values = []

# for i, wf in enumerate(waveforms):
#     if i % 5000 == 0:
#         print(f"Processing {i} / {len(waveforms)}")
#     ED_values.append(compute_energy_duration(wf))

# ED_values = np.array(ED_values, dtype=float)
# ED_values[~np.isfinite(ED_values)] = np.nan

In [29]:
# # output
# formatted_ids = [f"{id_}_train_0" for id_ in ids]

# df = pd.DataFrame({
#     "id": formatted_ids,
#     "ED": ED_values
# })


In [30]:

# output_path = "ED_train_0.csv"
# df.to_csv(output_path, index=False)
# print("Saved to", output_path)
# print(df.head())

In [31]:
# print(df["ED"].describe())
# print("NaNs:", df["ED"].isna().sum())

In [32]:
# import matplotlib.pyplot as plt
# import numpy as np

# vals = df["ED"].to_numpy(dtype=float)
# vals = vals[np.isfinite(vals)]

# plt.figure()
# plt.hist(vals, bins=200)
# plt.xlabel("Energy Duration (index)")
# plt.ylabel("Count")
# plt.title("Energy Duration Distribution (Train 0)")
# plt.show()


In [33]:
# Directory (relative to the current working directory) that receives
# the CSV files written by this notebook.
OUTPUT_DIR = "finalcsveunice"
# Create it up front; exist_ok=True keeps re-runs of this cell from
# failing when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [34]:
# Accumulators shared across every training file: one formatted id and
# one energy-duration value per waveform.
all_ids_ed, all_ED = [], []

for train_idx in range(16):
    train_file = f"../data/old/MJD_Train_{train_idx}.hdf5"

    if os.path.exists(train_file):
        print(f"\nLoading {train_file}")

        # Pull both datasets fully into memory so the file can be closed
        # before the per-waveform loop starts.
        with h5py.File(train_file, "r") as f:
            waveforms = np.array(f["raw_waveform"])
            ids = np.array(f["id"])

        print(f"  Waveforms: {len(waveforms)}")

        for i, wf in enumerate(waveforms):
            # progress line every 5000 waveforms
            if i % 5000 == 0:
                print(f"    ED Train_{train_idx} {i}/{len(waveforms)}")

            all_ED.append(compute_energy_duration(wf))
            # ids are suffixed with the source file index
            all_ids_ed.append(f"{ids[i]}_train_{train_idx}")
    else:
        # Missing files are reported and skipped rather than raising.
        print(f"Skipping missing file: {train_file}")

# Normalise to a float array in which every non-finite entry is NaN.
all_ED = np.asarray(all_ED, dtype=float)
all_ED = np.where(np.isfinite(all_ED), all_ED, np.nan)

df_ed = pd.DataFrame({"id": all_ids_ed, "ED": all_ED})

output_path_ed = os.path.join(OUTPUT_DIR, "ED_train_all.csv")
df_ed.to_csv(output_path_ed, index=False)

# Quick summary of what was written.
print("\nSaved combined ED CSV to:", output_path_ed)
print(df_ed.head())
print(df_ed["ED"].describe())
print("NaNs:", df_ed["ED"].isna().sum())



Loading ../data/old/MJD_Train_0.hdf5
  Waveforms: 65000
    ED Train_0 0/65000
    ED Train_0 5000/65000
    ED Train_0 10000/65000
    ED Train_0 15000/65000
    ED Train_0 20000/65000
    ED Train_0 25000/65000
    ED Train_0 30000/65000
    ED Train_0 35000/65000
    ED Train_0 40000/65000
    ED Train_0 45000/65000
    ED Train_0 50000/65000
    ED Train_0 55000/65000
    ED Train_0 60000/65000

Loading ../data/old/MJD_Train_1.hdf5
  Waveforms: 65000
    ED Train_1 0/65000
    ED Train_1 5000/65000
    ED Train_1 10000/65000
    ED Train_1 15000/65000
    ED Train_1 20000/65000
    ED Train_1 25000/65000
    ED Train_1 30000/65000
    ED Train_1 35000/65000
    ED Train_1 40000/65000
    ED Train_1 45000/65000
    ED Train_1 50000/65000
    ED Train_1 55000/65000
    ED Train_1 60000/65000

Loading ../data/old/MJD_Train_2.hdf5
  Waveforms: 65000
    ED Train_2 0/65000
    ED Train_2 5000/65000
    ED Train_2 10000/65000
    ED Train_2 15000/65000
    ED Train_2 20000/65000
    ED 