In [1]:
import sys
import os


# Get the project root (one level above notebooks/).
# NOTE: ".." is resolved against the current working directory, so this only
# yields the repository root when the kernel is started from notebooks/.
project_root = os.path.abspath("..")

# Guard the append so that re-executing this cell does not keep adding
# duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Added to Python path:", project_root)

import h5py
import numpy as np
import pandas as pd
# from src.parameters.tail_features import compute_LQ80
from scipy.fft import rfft, rfftfreq


Added to Python path: c:\Users\YooNi\OneDrive\Desktop\Majorana-Neutrino-Hunt


In [2]:
# train_file = "../data/old/MJD_Train_0.hdf5"

# with h5py.File(train_file, "r") as f:
#     waveforms = np.array(f["raw_waveform"])
#     ids = np.array(f["id"])

# print("Loaded", len(waveforms), "waveforms")

In [3]:
def compute_frequency_spectrum(waveform, sample_spacing=1.0, baseline_samples=200):
    """
    Computes one-sided FFT amplitude spectrum of the waveform.

    The waveform is baseline-corrected first: the mean of its leading
    ``baseline_samples`` samples is subtracted from every sample. The
    corrected signal is then passed through a real-input FFT.

    Parameters
    ----------
    waveform : array-like
        The signal to transform. Expected to be one-dimensional; the
        length of the first axis is used as the number of samples.
    sample_spacing : float
        Time step between samples.
    baseline_samples : int or None, optional
        Number of leading samples averaged to estimate the baseline.
        Defaults to 200. If the waveform is shorter than this, all of
        its samples are averaged. ``None`` or a value <= 0 disables the
        baseline subtraction.

    Returns
    -------
    xf : np.ndarray
        Frequencies (cycles per unit of ``sample_spacing``).
    amplitude : np.ndarray
        Real amplitude spectrum, ``|rfft(wf)| * 2 / N``. The same factor
        is applied to every bin, including the zero-frequency bin.
        Both returned arrays are empty when ``waveform`` is empty.
    """
    wf = np.asarray(waveform, dtype=float)
    N = len(wf)

    # An empty signal has no spectrum. Returning empty arrays avoids the
    # mean-of-empty-slice warning and the error rfft raises for N == 0.
    if N == 0:
        return np.empty(0, dtype=float), np.empty(0, dtype=float)

    # Remove the DC offset estimated from the start of the trace.
    if baseline_samples is not None and baseline_samples > 0:
        wf = wf - np.mean(wf[:baseline_samples])

    yf = rfft(wf)
    xf = rfftfreq(N, d=sample_spacing)

    # Uniform 2/N scaling: a pure sinusoid of amplitude A on an exact bin
    # shows up as A in that bin.
    amplitude = np.abs(yf) * 2.0 / N
    return xf, amplitude


In [4]:
def compute_spectral_centroid(waveform, sample_spacing=1.0):
    """
    Amplitude-weighted mean frequency of a waveform's spectrum.

    Parameters
    ----------
    waveform : array-like
        Signal handed unchanged to ``compute_frequency_spectrum``.
    sample_spacing : float
        Time step between samples, forwarded to the spectrum helper.

    Returns
    -------
    float
        ``sum(freq * amplitude) / sum(amplitude)``, or ``0.0`` when the
        summed amplitude is exactly zero.
    """
    frequencies, magnitudes = compute_frequency_spectrum(waveform, sample_spacing)
    weight_sum = np.sum(magnitudes)

    # Only divide when there is spectral weight; an all-zero spectrum has
    # no meaningful centroid and is reported as 0.0.
    if weight_sum != 0:
        weighted_sum = np.sum(frequencies * magnitudes)
        return float(weighted_sum / weight_sum)

    return 0.0


In [5]:
# Directory that receives the CSV files written by the cells below. The path is
# relative, so it is resolved against the current working directory.
OUTPUT_DIR = "finalcsveunice"
# exist_ok=True keeps this cell safe to re-run once the directory exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [6]:
# Spectral-centroid (SCA) feature for every waveform in the training files.
# The two lists grow in lock-step: one id string and one SCA value per waveform.
all_train_ids_sca = []
all_SCA_train = []

for train_idx in range(16):
    train_file = f"../data/old/MJD_Train_{train_idx}.hdf5"

    if os.path.exists(train_file):
        # Copy both datasets into memory so the HDF5 handle is released
        # before the per-waveform loop starts.
        with h5py.File(train_file, "r") as f:
            waveforms_train = np.array(f["raw_waveform"])
            ids_train = np.array(f["id"])

        print(f"Loaded Train_{train_idx}: {len(waveforms_train)} waveforms")

        for i, wf in enumerate(waveforms_train):
            # Emit a progress line once every 5000 waveforms.
            if not i % 5000:
                print(f"  Train_{train_idx} processing {i}/{len(waveforms_train)}")

            all_SCA_train.append(compute_spectral_centroid(wf))
            # The stored id is suffixed with the split name and file index
            # (presumably to keep ids distinct across files -- TODO confirm).
            all_train_ids_sca.append(f"{ids_train[i]}_train_{train_idx}")
    else:
        # A missing file is reported and skipped rather than treated as an error.
        print(f"Skipping missing file: {train_file}")

# Convert to a float array and turn any inf / -inf entries into NaN.
all_SCA_train = np.asarray(all_SCA_train, dtype=float)
all_SCA_train = np.where(np.isfinite(all_SCA_train), all_SCA_train, np.nan)

df_sca_train = pd.DataFrame({"id": all_train_ids_sca, "SCA": all_SCA_train})

# Persist the combined table, without the DataFrame index column.
output_path_sca_train = os.path.join(OUTPUT_DIR, "SCA_train_all.csv")
df_sca_train.to_csv(output_path_sca_train, index=False)

# Quick sanity summary of what was written.
print("\nSaved combined SCA TRAIN CSV to:", output_path_sca_train)
print(df_sca_train.head())
print(df_sca_train["SCA"].describe())
print("NaNs:", df_sca_train["SCA"].isna().sum())


Loaded Train_0: 65000 waveforms
  Train_0 processing 0/65000
  Train_0 processing 5000/65000
  Train_0 processing 10000/65000
  Train_0 processing 15000/65000
  Train_0 processing 20000/65000
  Train_0 processing 25000/65000
  Train_0 processing 30000/65000
  Train_0 processing 35000/65000
  Train_0 processing 40000/65000
  Train_0 processing 45000/65000
  Train_0 processing 50000/65000
  Train_0 processing 55000/65000
  Train_0 processing 60000/65000
Loaded Train_1: 65000 waveforms
  Train_1 processing 0/65000
  Train_1 processing 5000/65000
  Train_1 processing 10000/65000
  Train_1 processing 15000/65000
  Train_1 processing 20000/65000
  Train_1 processing 25000/65000
  Train_1 processing 30000/65000
  Train_1 processing 35000/65000
  Train_1 processing 40000/65000
  Train_1 processing 45000/65000
  Train_1 processing 50000/65000
  Train_1 processing 55000/65000
  Train_1 processing 60000/65000
Loaded Train_2: 65000 waveforms
  Train_2 processing 0/65000
  Train_2 processing 5000/6

In [7]:
# Spectral-centroid (SCA) feature for every waveform in the test files.
# The two lists grow in lock-step: one id string and one SCA value per waveform.
all_test_ids_sca = []
all_SCA_test = []

for test_idx in range(6):
    test_file = f"../data/old/MJD_Test_{test_idx}.hdf5"

    if os.path.exists(test_file):
        # Copy both datasets into memory so the HDF5 handle is released
        # before the per-waveform loop starts.
        with h5py.File(test_file, "r") as f:
            waveforms_test = np.array(f["raw_waveform"])
            ids_test = np.array(f["id"])

        print(f"Loaded Test_{test_idx}: {len(waveforms_test)} waveforms")

        for i, wf in enumerate(waveforms_test):
            # Emit a progress line once every 5000 waveforms.
            if not i % 5000:
                print(f"  Test_{test_idx} processing {i}/{len(waveforms_test)}")

            all_SCA_test.append(compute_spectral_centroid(wf))
            # The stored id is suffixed with the split name and file index
            # (presumably to keep ids distinct across files -- TODO confirm).
            all_test_ids_sca.append(f"{ids_test[i]}_test_{test_idx}")
    else:
        # A missing file is reported and skipped rather than treated as an error.
        print(f"Skipping missing file: {test_file}")

# Convert to a float array and turn any inf / -inf entries into NaN.
all_SCA_test = np.asarray(all_SCA_test, dtype=float)
all_SCA_test = np.where(np.isfinite(all_SCA_test), all_SCA_test, np.nan)

df_sca_test = pd.DataFrame({"id": all_test_ids_sca, "SCA": all_SCA_test})

# Persist the combined table, without the DataFrame index column.
output_path_sca_test = os.path.join(OUTPUT_DIR, "SCA_test_all.csv")
df_sca_test.to_csv(output_path_sca_test, index=False)

# Quick sanity summary of what was written.
print("\nSaved combined SCA TEST CSV to:", output_path_sca_test)
print(df_sca_test.head())
print(df_sca_test["SCA"].describe())
print("NaNs:", df_sca_test["SCA"].isna().sum())

Loaded Test_0: 65000 waveforms
  Test_0 processing 0/65000
  Test_0 processing 5000/65000
  Test_0 processing 10000/65000
  Test_0 processing 15000/65000
  Test_0 processing 20000/65000
  Test_0 processing 25000/65000
  Test_0 processing 30000/65000
  Test_0 processing 35000/65000
  Test_0 processing 40000/65000
  Test_0 processing 45000/65000
  Test_0 processing 50000/65000
  Test_0 processing 55000/65000
  Test_0 processing 60000/65000
Loaded Test_1: 65000 waveforms
  Test_1 processing 0/65000
  Test_1 processing 5000/65000
  Test_1 processing 10000/65000
  Test_1 processing 15000/65000
  Test_1 processing 20000/65000
  Test_1 processing 25000/65000
  Test_1 processing 30000/65000
  Test_1 processing 35000/65000
  Test_1 processing 40000/65000
  Test_1 processing 45000/65000
  Test_1 processing 50000/65000
  Test_1 processing 55000/65000
  Test_1 processing 60000/65000
Loaded Test_2: 65000 waveforms
  Test_2 processing 0/65000
  Test_2 processing 5000/65000
  Test_2 processing 10000/