In [1]:
import os
import pandas as pd
from IPython.display import display

In [2]:
print(os.getcwd())

c:\Users\YooNi\OneDrive\Desktop\Majorana-Neutrino-Hunt\extracted_features_csv_files


In [3]:
ls

 Volume in drive C is OS
 Volume Serial Number is 56AE-130D

 Directory of c:\Users\YooNi\OneDrive\Desktop\Majorana-Neutrino-Hunt\extracted_features_csv_files

01/25/2026  02:52 AM    <DIR>          .
01/21/2026  08:13 PM    <DIR>          ..
01/25/2026  02:53 AM            82,818 combine_everyone.ipynb
01/24/2026  11:49 PM    <DIR>          combined_csv_files
01/21/2026  08:12 PM               311 description.txt
01/25/2026  02:52 AM    <DIR>          eunice_csv_files
01/21/2026  08:12 PM    <DIR>          jade_csv_files
01/21/2026  08:12 PM        23,129,082 labels_test.csv
01/21/2026  08:12 PM        61,998,207 labels_train.csv
01/21/2026  08:12 PM    <DIR>          nomin_csv_files
01/24/2026  11:49 PM    <DIR>          prithvi_csv_files
               4 File(s)     85,210,418 bytes
               7 Dir(s)  105,595,662,336 bytes free


In [4]:
def process_and_merge_features(
    labels_path: str,
    eunice_path: str,
    nomin_path: str,
    prithvi_path: str,
    # jade_path: str,
    output_filename: str,
    output_dir: str = "combined_csv_files",
) -> pd.DataFrame:
    """
    Loads feature CSVs from different sources, applies specific preprocessing 
    (specifically to Jade's IDs), merges them with labels, and saves the result.
    """
    
    # Load Data
    print("=" * 50)
    print("Loading Files...")
    print("=" * 50)
    
    try:
        labels = pd.read_csv(labels_path)
        e = pd.read_csv(eunice_path)
        n = pd.read_csv(nomin_path)
        p = pd.read_csv(prithvi_path)
        # j = pd.read_csv(jade_path)
    except FileNotFoundError as err:
        print(f"Error loading files: {err}")
        return None

    # Fix Jade IDs (Specific Logic)
    # print("Preprocessing Jade IDs...")
    # g = j["id"].str.replace("id_train_", "", regex=False).astype(int)
    # file_idx = (g // 65000).astype(int)
    # j["id"] = g.astype(str) + "_train_" + file_idx.astype(str)

    # Preview Data    
    data_map = {
        "Labels": labels,
        "Eunice": e,
        "Nomin": n,
        "Prithvi": p,
        # "Jade": j
    }

    for name, df in data_map.items():
        print(f"\n{name} — File Shape: {df.shape}")
        display(df.head())

    # Merge
    print("\n" + "=" * 50)
    print("Merging Datasets...")
    
    merged = (
        labels
        .merge(e, on="id", how="inner")
        .merge(n, on="id", how="inner")
        .merge(p, on="id", how="inner")
        # .merge(j, on="id", how="inner")
    )

    print(f"Final Merged Shape: {merged.shape}")

    # Save Output
    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, output_filename)
    
    print(f"Saving to {out_path}...")
    merged.to_csv(out_path, index=False, compression="gzip")
    print("Save Complete.")

    return merged

## Combine Training Files

In [5]:
paths_train = {
    "labels_path": "labels_train.csv",
    "eunice_path": "eunice_csv_files/train_csv/train_all_features.csv.gz",
    "nomin_path": "nomin_csv_files/combined_train_n.csv.gz",
    "prithvi_path": "prithvi_csv_files/train_2.csv.gz",
    # "jade_path": "jade_csv_files/train_jade_features.csv"
}

merged_df_train = process_and_merge_features(
    **paths_train,
    output_filename="combined_train_with_labels.csv.gz")

merged_df_train.head()

Loading Files...



Labels — File Shape: (1040000, 7)


Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0
0,0_train_0,582.364295,False,True,True,True,957
1,1_train_0,250.159995,False,True,True,True,948
2,2_train_0,1212.323954,False,True,False,True,965
3,3_train_0,240.87811,False,True,True,False,927
4,4_train_0,285.124189,False,True,True,False,958



Eunice — File Shape: (1040000, 7)


Unnamed: 0,id,ED,HWP,LQ80,PPR,SCA,ND80
0,0_train_0,3409.0,2120.0,-635333.796311,0.719376,0.034655,0.0
1,1_train_0,3404.0,2004.0,-289756.893085,0.729709,0.035314,0.0
2,2_train_0,3411.0,2125.0,-379843.029134,0.71539,0.034915,0.0
3,3_train_0,3408.0,2098.0,-252673.626844,0.769375,0.034752,0.0
4,4_train_0,3406.0,2037.0,-317761.453582,0.728165,0.035132,0.0



Nomin — File Shape: (1040000, 9)


Unnamed: 0,id,current_skewness,spectral_centroid_power,tail_charge_diff,current_kurtosis,total_power,time_to_main_peak,time_to_peak,late_over_early
0,0_train_0,1.780811,107.276207,-0.598625,2.117825,1709302000.0,85,85,0.98791
1,1_train_0,1.756635,108.213621,-0.573984,2.058622,299137600.0,87,87,0.988301
2,2_train_0,1.079789,105.735183,-0.620023,-0.067003,624438500.0,95,95,0.987491
3,3_train_0,1.899438,107.946935,-0.571186,2.443885,281158300.0,116,116,0.98845
4,4_train_0,1.156442,106.350372,-0.563259,-0.17389,388544200.0,94,94,0.988541



Prithvi — File Shape: (1040000, 8)


Unnamed: 0,id,tdrift10,tdrift50,tdrift99,tfr,peak_count,gbn,bpr
0,0_train_0,32.0,65.0,85.0,0.142357,3,1.198436,0.059642
1,1_train_0,0.0,60.0,87.0,0.15116,3,1.381123,0.061975
2,2_train_0,35.0,67.0,95.0,0.142606,3,1.361857,0.050813
3,3_train_0,0.0,90.0,116.0,0.133192,2,1.165654,0.062954
4,4_train_0,0.0,61.0,94.0,0.145504,7,1.390918,0.055013



Merging Datasets...
Final Merged Shape: (1040000, 28)
Saving to combined_csv_files\combined_train_with_labels.csv.gz...
Save Complete.


Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0,ED,HWP,LQ80,...,time_to_main_peak,time_to_peak,late_over_early,tdrift10,tdrift50,tdrift99,tfr,peak_count,gbn,bpr
0,0_train_0,582.364295,False,True,True,True,957,3409.0,2120.0,-635333.796311,...,85,85,0.98791,32.0,65.0,85.0,0.142357,3,1.198436,0.059642
1,1_train_0,250.159995,False,True,True,True,948,3404.0,2004.0,-289756.893085,...,87,87,0.988301,0.0,60.0,87.0,0.15116,3,1.381123,0.061975
2,2_train_0,1212.323954,False,True,False,True,965,3411.0,2125.0,-379843.029134,...,95,95,0.987491,35.0,67.0,95.0,0.142606,3,1.361857,0.050813
3,3_train_0,240.87811,False,True,True,False,927,3408.0,2098.0,-252673.626844,...,116,116,0.98845,0.0,90.0,116.0,0.133192,2,1.165654,0.062954
4,4_train_0,285.124189,False,True,True,False,958,3406.0,2037.0,-317761.453582,...,94,94,0.988541,0.0,61.0,94.0,0.145504,7,1.390918,0.055013


In [6]:
df_train = pd.read_csv('combined_csv_files/combined_train_with_labels.csv.gz')
df_train

Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0,ED,HWP,LQ80,...,time_to_main_peak,time_to_peak,late_over_early,tdrift10,tdrift50,tdrift99,tfr,peak_count,gbn,bpr
0,0_train_0,582.364295,False,True,True,True,957,3409.0,2120.0,-635333.796311,...,85,85,0.987910,32.0,65.0,85.0,0.142357,3,1.198436,0.059642
1,1_train_0,250.159995,False,True,True,True,948,3404.0,2004.0,-289756.893085,...,87,87,0.988301,0.0,60.0,87.0,0.151160,3,1.381123,0.061975
2,2_train_0,1212.323954,False,True,False,True,965,3411.0,2125.0,-379843.029134,...,95,95,0.987491,35.0,67.0,95.0,0.142606,3,1.361857,0.050813
3,3_train_0,240.878110,False,True,True,False,927,3408.0,2098.0,-252673.626844,...,116,116,0.988450,0.0,90.0,116.0,0.133192,2,1.165654,0.062954
4,4_train_0,285.124189,False,True,True,False,958,3406.0,2037.0,-317761.453582,...,94,94,0.988541,0.0,61.0,94.0,0.145504,7,1.390918,0.055013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1039995,1039995_train_15,210.834626,True,True,True,True,949,3403.0,2048.0,-240759.341995,...,90,90,0.985736,0.0,55.0,90.0,0.151747,6,1.008092,0.062825
1039996,1039996_train_15,380.170340,True,True,True,True,949,3404.0,2045.0,-417622.302428,...,82,82,0.987814,0.0,57.0,79.0,0.150305,3,1.245492,0.062706
1039997,1039997_train_15,370.109563,False,True,True,True,947,3409.0,2063.0,-407730.168602,...,102,102,0.987833,0.0,77.0,102.0,0.142950,2,1.149456,0.062153
1039998,1039998_train_15,98.258524,True,True,True,False,921,3400.0,2001.0,-116570.568893,...,92,92,0.989948,0.0,71.0,92.0,0.153135,5,1.192639,0.065264


In [7]:
df_train.describe()

Unnamed: 0,energy_label,tp0,ED,HWP,LQ80,PPR,SCA,ND80,current_skewness,spectral_centroid_power,...,time_to_main_peak,time_to_peak,late_over_early,tdrift10,tdrift50,tdrift99,tfr,peak_count,gbn,bpr
count,1040000.0,1040000.0,1040000.0,1040000.0,1040000.0,1039995.0,1040000.0,1040000.0,1040000.0,1040000.0,...,1040000.0,1040000.0,1040000.0,1039995.0,1039995.0,1039995.0,1040000.0,1040000.0,1040000.0,1040000.0
mean,638.1633,951.6434,3406.036,2054.12,-697895.0,0.7254552,0.03572992,0.003009126,1.906792,114.5299,...,108.2445,114.4218,0.989365,19.87803,69.81389,113.1437,0.1495163,5.104236,1.777303,0.06049704
std,670.4271,18.95332,7.735526,129.672,730106.4,0.02832694,0.006330801,0.04127131,0.5312883,106.9477,...,34.8659,72.94667,0.1121635,24.97331,29.34827,72.87391,0.03987192,7.352031,1.174284,0.02152323
min,0.0,71.0,3074.0,72.0,-5846630.0,-7.703542,0.02676319,0.0,-1.809541,89.86718,...,1.0,-804.0,0.1302615,0.0,0.0,9.0,0.01968922,1.0,0.6992497,0.0363653
25%,238.3612,942.0,3404.0,2032.0,-698412.6,0.7088948,0.03452545,0.0,1.584701,106.7439,...,87.0,87.0,0.9873562,0.0,54.0,86.0,0.140839,1.0,1.116426,0.05533778
50%,341.1866,954.0,3407.0,2066.0,-376955.4,0.7211736,0.03480813,0.0,1.93916,108.6075,...,103.0,104.0,0.9879051,8.0,69.0,102.0,0.1457415,2.0,1.32193,0.06133205
75%,630.0128,964.0,3410.0,2099.0,-265472.0,0.7361495,0.0352403,0.0,2.253204,110.3433,...,122.0,123.0,0.988553,36.0,83.0,121.0,0.1509235,5.0,1.908089,0.06423424
max,4998.508,1352.0,3790.0,3796.0,36754920.0,0.9768917,0.2028693,2.193201,5.229496,16374.42,...,496.0,2919.0,46.40625,2666.0,2839.0,2919.0,3.990051,55.0,36.67323,2.569411


## Combine Test Files

In [8]:
paths_test = {
    "labels_path": "labels_test.csv",
    "eunice_path": "eunice_csv_files/test_csv/test_all_features.csv.gz",
    "nomin_path": "nomin_csv_files/combined_test_n.csv.gz",
    "prithvi_path": "prithvi_csv_files/test_2.csv.gz",
    # "jade_path": "jade_csv_files/test_jade_features.csv"
}

merged_df_test = process_and_merge_features(
    **paths_test,
    output_filename="combined_test_with_labels.csv.gz")

merged_df_test.head()

Loading Files...
Error loading files: [Errno 2] No such file or directory: 'eunice_csv_files/test_csv/test_all_features.csv.gz'


AttributeError: 'NoneType' object has no attribute 'head'

In [None]:
df_test = pd.read_csv('combined_csv_files/combined_test_with_labels.csv.gz')
df_test

Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0,ED,HWP,LQ80,...,spectral_centroid_power,tail_charge_diff,late_over_early,tdrift10,tdrift50,tdrift99,tfr,peak_count,gbn,bpr


In [None]:
df_test.describe()

Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0,ED,HWP,LQ80,...,spectral_centroid_power,tail_charge_diff,late_over_early,tdrift10,tdrift50,tdrift99,tfr,peak_count,gbn,bpr
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unique,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
top,,,,,,,,,,,...,,,,,,,,,,
freq,,,,,,,,,,,...,,,,,,,,,,
