In [1]:
import sys
import os


# Get the project root (one level above notebooks/) and make it importable,
# so project-local packages can be imported from this notebook.
project_root = os.path.abspath("..")
# Re-running this cell must not keep appending duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Added to Python path:", project_root)

import h5py
import numpy as np
import pandas as pd
# from src.parameters.tail_features import compute_LQ80
from scipy.fft import rfft, rfftfreq


Added to Python path: c:\Users\YooNi\OneDrive\Desktop\Majorana-Neutrino-Hunt


In [2]:
# train_file = "../data/old/MJD_Train_0.hdf5"

# with h5py.File(train_file, "r") as f:
#     waveforms = np.array(f["raw_waveform"])
#     ids = np.array(f["id"])

# print("Loaded", len(waveforms), "waveforms")

In [3]:
def compute_frequency_spectrum(waveform, sample_spacing=1.0, baseline_samples=200):
    """
    Computes the one-sided FFT amplitude spectrum of a baseline-subtracted waveform.

    The mean of the first ``baseline_samples`` samples is subtracted from the
    whole waveform before the real FFT is taken.

    Parameters
    ----------
    waveform : array-like
        The one-dimensional signal to transform. It is converted to a float
        array; the caller's data is not modified.
    sample_spacing : float
        Time step between samples.
    baseline_samples : int
        Number of leading samples averaged to estimate the baseline. When the
        waveform is shorter than this, all of its samples are averaged.
        Defaults to 200, the value that was previously hard-coded.

    Returns
    -------
    xf : np.ndarray
        Frequencies of the one-sided spectrum, length ``N // 2 + 1``.
    amplitude : np.ndarray
        Real amplitude spectrum, ``|rfft(wf)| * 2 / N`` for every bin.

    Raises
    ------
    ValueError
        If the waveform is not one-dimensional, is empty, or if
        ``baseline_samples`` is smaller than 1.
    """
    wf = np.asarray(waveform, dtype=float)

    # A 2-D input would be sliced along axis 0 but transformed along the last
    # axis, giving frequency and amplitude arrays that do not match.
    if wf.ndim != 1:
        raise ValueError(
            f"waveform must be one-dimensional, got an array with shape {wf.shape}"
        )

    N = wf.size
    if N == 0:
        raise ValueError("waveform must contain at least one sample")
    if baseline_samples < 1:
        raise ValueError("baseline_samples must be at least 1")

    # Remove the baseline offset estimated from the leading samples.
    wf = wf - np.mean(wf[:baseline_samples])

    yf = rfft(wf)
    xf = rfftfreq(N, d=sample_spacing)

    # NOTE(review): the 2/N factor is applied to every bin, including the DC
    # bin (and the Nyquist bin for even N), which a strict one-sided amplitude
    # spectrum would not double. Left as-is so existing feature values do not
    # change -- confirm whether this is intended.
    amplitude = np.abs(yf) * 2.0 / N
    return xf, amplitude


In [4]:
def compute_spectral_centroid(waveform, sample_spacing=1.0):
    """
    Amplitude-weighted mean frequency of a waveform's spectrum.

    The spectrum is obtained from ``compute_frequency_spectrum``; each
    frequency is weighted by its amplitude and the weighted sum is divided by
    the total amplitude.

    Parameters
    ----------
    waveform : array-like
        The signal to analyse.
    sample_spacing : float
        Time step between samples, forwarded to ``compute_frequency_spectrum``.

    Returns
    -------
    float
        The spectral centroid, or ``0.0`` when the total amplitude is exactly
        zero (avoids a division by zero).
    """
    frequencies, weights = compute_frequency_spectrum(waveform, sample_spacing)
    weight_sum = weights.sum()

    # Only divide when there is spectral content to normalise by.
    if weight_sum != 0:
        weighted_sum = (frequencies * weights).sum()
        return float(weighted_sum / weight_sum)

    return 0.0


In [5]:
# SC_values = []
# for i, wf in enumerate(waveforms):
#     if i % 5000 == 0:
#         print(f"Processing {i} / {len(waveforms)}")
#     SC_values.append(compute_spectral_centroid(wf))

# SC_values = np.array(SC_values, dtype=float)
# SC_values[~np.isfinite(SC_values)] = np.nan

In [6]:
# formatted_ids = [f"{id_}_train_0" for id_ in ids]

# df = pd.DataFrame({
#     "id": formatted_ids,
#     "SCA": SC_values
# })

In [7]:
# output_path = "SCA_train_0.csv"
# df.to_csv(output_path, index=False)

# print("Saved to", output_path)
# print(df.head())


In [8]:
# print(df["SCA"].describe())
# print("NaNs:", df["SCA"].isna().sum())

In [9]:
# import matplotlib.pyplot as plt
# import numpy as np

# vals = df["SCA"].to_numpy(dtype=float)
# vals = vals[np.isfinite(vals)]

# plt.figure()
# plt.hist(vals, bins=200)
# plt.yscale("log")
# plt.xlabel("Spectral Centroid (Amplitude-weighted)")
# plt.ylabel("Count")
# plt.title("Spectral Centroid Distribution (Train 0)")
# plt.show()


In [10]:
# Directory (relative to the notebook's working directory) that receives the
# CSV output written by this notebook; it is created here if it does not exist.
OUTPUT_DIR = "finalcsveunice"
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [11]:

# Accumulators shared by all training files: one id string and one spectral
# centroid per waveform, appended in the same order.
all_ids, all_SC = [], []

for train_idx in range(16):
    train_file = f"../data/old/MJD_Train_{train_idx}.hdf5"

    # Files that are not on disk are reported and skipped.
    if not os.path.exists(train_file):
        print(f"Skipping missing file: {train_file}")
        continue

    # Read both datasets fully into memory so the HDF5 file is closed again
    # before the per-waveform loop starts.
    with h5py.File(train_file, "r") as h5_file:
        waveforms = np.array(h5_file["raw_waveform"])
        ids = np.array(h5_file["id"])

    print(f"Loaded Train_{train_idx}: {len(waveforms)} waveforms")

    for i, waveform in enumerate(waveforms):
        # Emit a progress line every 5000 waveforms.
        if not i % 5000:
            print(f"  Train_{train_idx} processing {i}/{len(waveforms)}")

        all_SC.append(compute_spectral_centroid(waveform))
        # The id is suffixed with the index of the file it came from.
        all_ids.append(f"{ids[i]}_train_{train_idx}")

# Convert to a float array and replace every non-finite entry with NaN.
all_SC = np.array(all_SC, dtype=float)
all_SC = np.where(np.isfinite(all_SC), all_SC, np.nan)

sc_columns = {"id": all_ids, "SC": all_SC}
df_sc = pd.DataFrame(sc_columns)

# Write the combined table for all files into OUTPUT_DIR.
output_path_sc = os.path.join(OUTPUT_DIR, "SC_train_all.csv")
df_sc.to_csv(output_path_sc, index=False)

# Short summary of what was written.
print("\nSaved combined SC CSV to:", output_path_sc)
print(df_sc.head())
print(df_sc["SC"].describe())
print("NaNs:", df_sc["SC"].isna().sum())


Loaded Train_0: 65000 waveforms
  Train_0 processing 0/65000
  Train_0 processing 5000/65000
  Train_0 processing 10000/65000
  Train_0 processing 15000/65000
  Train_0 processing 20000/65000
  Train_0 processing 25000/65000
  Train_0 processing 30000/65000
  Train_0 processing 35000/65000
  Train_0 processing 40000/65000
  Train_0 processing 45000/65000
  Train_0 processing 50000/65000
  Train_0 processing 55000/65000
  Train_0 processing 60000/65000
Loaded Train_1: 65000 waveforms
  Train_1 processing 0/65000
  Train_1 processing 5000/65000
  Train_1 processing 10000/65000
  Train_1 processing 15000/65000
  Train_1 processing 20000/65000
  Train_1 processing 25000/65000
  Train_1 processing 30000/65000
  Train_1 processing 35000/65000
  Train_1 processing 40000/65000
  Train_1 processing 45000/65000
  Train_1 processing 50000/65000
  Train_1 processing 55000/65000
  Train_1 processing 60000/65000
Loaded Train_2: 65000 waveforms
  Train_2 processing 0/65000
  Train_2 processing 5000/6