In [1]:
import os
import pandas as pd
from IPython.display import display

In [2]:
# Show the working directory: the input and output paths used later in this
# notebook are all relative, so they resolve against this location.
print(os.getcwd())

c:\Users\YooNi\OneDrive\Desktop\Majorana-Neutrino-Hunt\extracted_features_csv_files


In [3]:
# Bare `ls` relies on IPython automagic (equivalent to `%ls`); it lists the
# working directory so the relative input paths used below can be checked by eye.
ls

 Volume in drive C is OS
 Volume Serial Number is 56AE-130D

 Directory of c:\Users\YooNi\OneDrive\Desktop\Majorana-Neutrino-Hunt\extracted_features_csv_files

02/18/2026  07:48 PM    <DIR>          .
02/18/2026  05:10 PM    <DIR>          ..
02/18/2026  07:48 PM           118,698 combine_everyone.ipynb
02/18/2026  05:10 PM    <DIR>          combined_csv_files
01/21/2026  08:12 PM               311 description.txt
01/25/2026  04:37 PM    <DIR>          eunice_csv_files
02/18/2026  05:10 PM    <DIR>          jade_csv_files
01/21/2026  08:12 PM        23,129,082 labels_test.csv
01/21/2026  08:12 PM        61,998,207 labels_train.csv
02/18/2026  05:10 PM    <DIR>          nomin_csv_files
02/18/2026  05:10 PM            59,765 oldcombine.ipynb
02/18/2026  05:10 PM    <DIR>          prithvi_csv_files
               5 File(s)     85,306,063 bytes
               7 Dir(s)  107,492,405,248 bytes free


In [4]:
def process_and_merge_features(
    labels_path: str,
    eunice_path: str,
    nomin_path: str,
    prithvi_path: str,
    jade_path: str,
    output_filename: str,
    output_dir: str = "combined_csv_files",
) -> "pd.DataFrame | None":
    """
    Load the labels CSV and four feature CSVs, inner-join them on ``id``,
    and save the combined table as a gzip-compressed CSV.

    No per-source preprocessing is applied: the files are merged exactly as
    they are read. A preview (shape and first rows) of every input is shown
    before merging.

    Parameters
    ----------
    labels_path, eunice_path, nomin_path, prithvi_path, jade_path : str
        Paths of the CSV files to combine. Each file must contain an ``id``
        column, and each ``id`` may appear at most once per file.
    output_filename : str
        Name of the file written inside ``output_dir``. The file is always
        gzip-compressed, whatever extension this name carries.
    output_dir : str
        Directory that receives the combined file; created if missing.

    Returns
    -------
    pd.DataFrame or None
        The merged table, or ``None`` when one of the input files does not
        exist (the error is printed and nothing is written).

    Raises
    ------
    pandas.errors.MergeError
        If an ``id`` is repeated in any input. Without this check a repeated
        key would silently multiply rows in the combined table.
    """
    # Load Data
    print("=" * 50)
    print("Loading Files...")
    print("=" * 50)

    # Insertion order is the read order, the preview order and the merge order.
    sources = {
        "Labels": labels_path,
        "Eunice": eunice_path,
        "Nomin": nomin_path,
        "Prithvi": prithvi_path,
        "Jade": jade_path,
    }

    try:
        frames = {name: pd.read_csv(path) for name, path in sources.items()}
    except FileNotFoundError as err:
        print(f"Error loading files: {err}")
        return None

    # Preview Data
    for name, df in frames.items():
        print(f"\n{name} — File Shape: {df.shape}")
        display(df.head())

    # Merge
    print("\n" + "=" * 50)
    print("Merging Datasets...")

    merged = frames["Labels"]
    for name in ("Eunice", "Nomin", "Prithvi", "Jade"):
        # one_to_one makes pandas verify key uniqueness on both sides, so a
        # duplicated id fails loudly instead of inflating the row count.
        merged = merged.merge(
            frames[name], on="id", how="inner", validate="one_to_one"
        )

    print(f"Final Merged Shape: {merged.shape}")

    # An inner join drops every id that is absent from any one file; report
    # the loss so it is not mistaken for a complete dataset.
    n_labelled = len(frames["Labels"])
    if len(merged) < n_labelled:
        print(
            f"WARNING: inner join kept {len(merged)} of {n_labelled} labelled rows; "
            "some ids are missing from at least one feature file."
        )

    # Save Output
    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, output_filename)

    print(f"Saving to {out_path}...")
    merged.to_csv(out_path, index=False, compression="gzip")
    print("Save Complete.")

    return merged

## Combine Training Files

In [5]:
# Relative locations of the training labels and of each contributor's
# training feature file; the keys match the merge function's parameter names.
paths_train = dict(
    labels_path="labels_train.csv",
    eunice_path="eunice_csv_files/train_csv/train_all_features.csv.gz",
    nomin_path="nomin_csv_files/combined_train_n.csv.gz",
    prithvi_path="prithvi_csv_files/train_2.csv.gz",
    jade_path="jade_csv_files/train_jade_features.csv.gz",
)

# Merge everything on `id` and write the combined training table to disk.
merged_df_train = process_and_merge_features(
    output_filename="combined_train_with_labels.csv.gz",
    **paths_train,
)

merged_df_train.head()

Loading Files...



Labels — File Shape: (1040000, 7)


Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0
0,0_train_0,582.364295,False,True,True,True,957
1,1_train_0,250.159995,False,True,True,True,948
2,2_train_0,1212.323954,False,True,False,True,965
3,3_train_0,240.87811,False,True,True,False,927
4,4_train_0,285.124189,False,True,True,False,958



Eunice — File Shape: (1040000, 6)


Unnamed: 0,id,ED,HWP,LQ80,PPR,SC
0,0_train_0,3409.0,2299.0,-717094.898532,0.719376,0.034655
1,1_train_0,3404.0,2446.0,-331957.541919,0.729709,0.035314
2,2_train_0,3411.0,2262.0,-425532.152706,0.71539,0.034915
3,3_train_0,3408.0,2833.0,-306980.459766,0.769375,0.034752
4,4_train_0,3406.0,2397.0,-362746.925366,0.728165,0.035132



Nomin — File Shape: (1040000, 9)


Unnamed: 0,id,current_skewness,spectral_centroid_power,tail_charge_diff,current_kurtosis,total_power,time_to_main_peak,time_to_peak,late_over_early
0,0_train_0,1.780811,107.276207,-0.598625,2.117825,1709302000.0,85,85,0.98791
1,1_train_0,1.756635,108.213621,-0.573984,2.058622,299137600.0,87,87,0.988301
2,2_train_0,1.079789,105.735183,-0.620023,-0.067003,624438500.0,95,95,0.987491
3,3_train_0,1.899438,107.946935,-0.571186,2.443885,281158300.0,116,116,0.98845
4,4_train_0,1.156442,106.350372,-0.563259,-0.17389,388544200.0,94,94,0.988541



Prithvi — File Shape: (1040000, 6)


Unnamed: 0,id,tdrift99,tfr,peak_count,gbn,bpr
0,0_train_0,85.0,0.142357,3,1.198436,0.059642
1,1_train_0,87.0,0.15116,3,1.381123,0.061975
2,2_train_0,95.0,0.142606,3,1.361857,0.050813
3,3_train_0,116.0,0.133192,2,1.165654,0.062954
4,4_train_0,94.0,0.145504,7,1.390918,0.055013



Jade — File Shape: (1040000, 5)


Unnamed: 0,id,AvsE,GradAreaRatio,GradWidthMain,HFER
0,0_train_0,0.519805,1.0,76.0,0.034772
1,1_train_0,0.531775,1.0,46.0,0.037621
2,2_train_0,0.343676,1.0,90.0,0.035552
3,3_train_0,0.483144,1.0,56.0,0.035093
4,4_train_0,0.377738,1.0,53.0,0.035971



Merging Datasets...
Final Merged Shape: (1040000, 29)
Saving to combined_csv_files\combined_train_with_labels.csv.gz...
Save Complete.


Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0,ED,HWP,LQ80,...,late_over_early,tdrift99,tfr,peak_count,gbn,bpr,AvsE,GradAreaRatio,GradWidthMain,HFER
0,0_train_0,582.364295,False,True,True,True,957,3409.0,2299.0,-717094.898532,...,0.98791,85.0,0.142357,3,1.198436,0.059642,0.519805,1.0,76.0,0.034772
1,1_train_0,250.159995,False,True,True,True,948,3404.0,2446.0,-331957.541919,...,0.988301,87.0,0.15116,3,1.381123,0.061975,0.531775,1.0,46.0,0.037621
2,2_train_0,1212.323954,False,True,False,True,965,3411.0,2262.0,-425532.152706,...,0.987491,95.0,0.142606,3,1.361857,0.050813,0.343676,1.0,90.0,0.035552
3,3_train_0,240.87811,False,True,True,False,927,3408.0,2833.0,-306980.459766,...,0.98845,116.0,0.133192,2,1.165654,0.062954,0.483144,1.0,56.0,0.035093
4,4_train_0,285.124189,False,True,True,False,958,3406.0,2397.0,-362746.925366,...,0.988541,94.0,0.145504,7,1.390918,0.055013,0.377738,1.0,53.0,0.035971


In [6]:
# Read the combined training file back from disk to confirm that what was
# saved by the merge step loads cleanly.
df_train = pd.read_csv('combined_csv_files/combined_train_with_labels.csv.gz')
df_train

Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0,ED,HWP,LQ80,...,late_over_early,tdrift99,tfr,peak_count,gbn,bpr,AvsE,GradAreaRatio,GradWidthMain,HFER
0,0_train_0,582.364295,False,True,True,True,957,3409.0,2299.0,-717094.898532,...,0.987910,85.0,0.142357,3,1.198436,0.059642,0.519805,1.000000,76.0,0.034772
1,1_train_0,250.159995,False,True,True,True,948,3404.0,2446.0,-331957.541919,...,0.988301,87.0,0.151160,3,1.381123,0.061975,0.531775,1.000000,46.0,0.037621
2,2_train_0,1212.323954,False,True,False,True,965,3411.0,2262.0,-425532.152706,...,0.987491,95.0,0.142606,3,1.361857,0.050813,0.343676,1.000000,90.0,0.035552
3,3_train_0,240.878110,False,True,True,False,927,3408.0,2833.0,-306980.459766,...,0.988450,116.0,0.133192,2,1.165654,0.062954,0.483144,1.000000,56.0,0.035093
4,4_train_0,285.124189,False,True,True,False,958,3406.0,2397.0,-362746.925366,...,0.988541,94.0,0.145504,7,1.390918,0.055013,0.377738,1.000000,53.0,0.035971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1039995,1039995_train_15,210.834626,True,True,True,True,949,3403.0,2533.0,-278607.352936,...,0.985736,90.0,0.151747,6,1.008092,0.062825,0.573645,33.442724,69.0,0.035228
1039996,1039996_train_15,380.170340,True,True,True,True,949,3404.0,2426.0,-477777.966558,...,0.987814,79.0,0.150305,3,1.245492,0.062706,0.564955,58.862967,71.0,0.034669
1039997,1039997_train_15,370.109563,False,True,True,True,947,3409.0,2417.0,-466589.983952,...,0.987833,102.0,0.142950,2,1.149456,0.062153,0.473789,143.746154,79.0,0.034553
1039998,1039998_train_15,98.258524,True,True,True,False,921,3400.0,3799.0,-143212.214717,...,0.989948,92.0,0.153135,5,1.192639,0.065264,0.572606,1.000000,42.0,0.039377


In [7]:
# Summary statistics for the numeric columns of the combined training table.
# NOTE(review): in the output recorded here, HWP, PPR and tdrift99 have a count
# of 1,039,995 (5 missing values each), LQ80 peaks at ~1.6e130 and GradAreaRatio
# at ~8e12. These look like extraction artefacts — confirm and clean before modelling.
df_train.describe()

Unnamed: 0,energy_label,tp0,ED,HWP,LQ80,PPR,SC,current_skewness,spectral_centroid_power,tail_charge_diff,...,late_over_early,tdrift99,tfr,peak_count,gbn,bpr,AvsE,GradAreaRatio,GradWidthMain,HFER
count,1040000.0,1040000.0,1040000.0,1039995.0,1040000.0,1039995.0,1040000.0,1040000.0,1040000.0,1040000.0,...,1040000.0,1039995.0,1040000.0,1040000.0,1040000.0,1040000.0,1040000.0,1040000.0,1040000.0,1040000.0
mean,638.1633,951.6434,3406.036,2423.961,1.583331e+124,0.7254552,0.03572992,1.906792,114.5299,-0.5468692,...,0.989365,113.1437,0.1495163,5.104236,1.777303,0.06049704,0.4964875,409133700000.0,67.92718,0.03643588
std,670.4271,18.95332,7.735526,392.52,1.614687e+127,0.02832694,0.006330801,0.5312883,106.9477,1.561871,...,0.1121635,72.87391,0.03987192,7.352031,1.174284,0.02152323,0.09173632,1401446000000.0,24.42059,0.008420477
min,0.0,71.0,3074.0,2.0,-5949322.0,-7.703542,0.02676319,-1.809541,89.86718,-8.692308,...,0.1302615,9.0,0.01968922,1.0,0.6992497,0.0363653,0.03150645,1.0,5.0,0.02650746
25%,238.3612,942.0,3404.0,2191.0,-777411.7,0.7088948,0.03452545,1.584701,106.7439,-0.6236,...,0.9873562,86.0,0.140839,1.0,1.116426,0.05533778,0.4353313,1.0,51.0,0.03454584
50%,341.1866,954.0,3407.0,2325.0,-427956.0,0.7211736,0.03480813,1.93916,108.6075,-0.5968707,...,0.9879051,102.0,0.1457415,2.0,1.32193,0.06133205,0.5268103,1.0,65.0,0.03505329
75%,630.0128,964.0,3410.0,2510.0,-306392.1,0.7361495,0.0352403,2.253204,110.3433,-0.5640222,...,0.988553,121.0,0.1509235,5.0,1.908089,0.06423424,0.5653386,65.88437,82.0,0.03580735
max,4998.508,1352.0,3790.0,3799.0,1.646664e+130,0.9768917,0.2028693,5.229496,16374.42,311.25,...,46.40625,2919.0,3.990051,55.0,36.67323,2.569411,0.9706884,7968905000000.0,391.0,0.2743541


In [8]:
# Count / unique / most-frequent value for the boolean columns, i.e. the class
# balance of the PSD label flags in the training table.
df_train.describe(include=['bool'])

Unnamed: 0,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq
count,1040000,1040000,1040000,1040000
unique,2,2,2,2
top,False,True,True,True
freq,575257,1031559,1019930,736130


## Combine Test Files

In [9]:
# Relative locations of the test labels and of each contributor's test
# feature file; the keys match the merge function's parameter names.
paths_test = {
    "labels_path": "labels_test.csv",
    "eunice_path": "eunice_csv_files/test_csv/test_all_features.csv.gz",
    "nomin_path": "nomin_csv_files/combined_test_n.csv.gz",
    "prithvi_path": "prithvi_csv_files/test_2.csv.gz",
    "jade_path": "jade_csv_files/test_jade_features.csv.gz"
}

merged_df_test = process_and_merge_features(
    **paths_test,
    output_filename="combined_test_with_labels.csv.gz")

# A model fitted on the training table can only score the test table if both
# expose the same columns, so surface any schema drift here instead of at
# modelling time. (In the run recorded in this notebook the training table
# carries `SC` while the test table carries `SCA` and `ND80`.)
train_only = sorted(set(merged_df_train.columns) - set(merged_df_test.columns))
test_only = sorted(set(merged_df_test.columns) - set(merged_df_train.columns))
if train_only or test_only:
    print(
        "WARNING: train/test column mismatch — "
        f"only in train: {train_only}; only in test: {test_only}"
    )

merged_df_test.head()

Loading Files...

Labels — File Shape: (390000, 7)


Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0
0,2395098_test_0,1167.174731,True,True,True,True,967
1,2395099_test_0,870.765543,False,True,True,False,960
2,2395100_test_0,582.980526,False,True,True,True,960
3,2395101_test_0,238.918902,True,True,True,True,930
4,2395102_test_0,214.491195,False,True,True,True,924



Eunice — File Shape: (390000, 7)


Unnamed: 0,id,ED,HWP,LQ80,PPR,SCA,ND80
0,2395098_test_0,3407.0,2036.0,-1300536.0,0.692435,0.034359,0.0
1,2395099_test_0,3405.0,2019.0,-972982.2,0.690517,0.034267,0.0
2,2395100_test_0,3412.0,2107.0,-639087.0,0.700524,0.034659,0.0
3,2395101_test_0,3408.0,2053.0,-276046.0,0.69045,0.034947,0.0
4,2395102_test_0,3406.0,1939.0,-76111.88,0.677887,0.040895,0.0



Nomin — File Shape: (390000, 9)


Unnamed: 0,id,total_power,time_to_main_peak,current_skewness,current_kurtosis,time_to_peak,spectral_centroid_power,tail_charge_diff,late_over_early
0,2395098_test_0,6586151000.0,77,1.715723,1.768231,77,108.796954,-0.621062,0.987423
1,2395099_test_0,3600784000.0,99,2.164451,3.329243,99,110.305348,-0.597317,0.987924
2,2395100_test_0,1707060000.0,100,1.596112,1.171031,100,108.213213,-0.590051,0.988065
3,2395101_test_0,284251400.0,110,2.34598,4.571925,110,109.196803,-0.606667,0.987626
4,2395102_test_0,19579190.0,135,1.911722,2.328352,135,109.163049,-0.6625,0.986214



Prithvi — File Shape: (390000, 6)


Unnamed: 0,id,tdrift99,tfr,peak_count,gbn,bpr
0,2395098_test_0,76.0,0.148591,1,1.282821,0.062497
1,2395099_test_0,90.0,0.147334,1,1.20364,0.063886
2,2395100_test_0,100.0,0.13909,2,1.18932,0.057715
3,2395101_test_0,110.0,0.138597,2,1.226275,0.067184
4,2395102_test_0,135.0,0.149185,13,1.203153,0.055602



Jade — File Shape: (390000, 5)


Unnamed: 0,id,AvsE,GradAreaRatio,GradWidthMain,HFER
0,2395098_test_0,0.553581,225.7453,69.0,0.034373
1,2395099_test_0,0.57309,2120524000000.0,88.0,0.034307
2,2395100_test_0,0.443685,178.4824,84.0,0.034885
3,2395101_test_0,0.583046,1.0,53.0,0.035474
4,2395102_test_0,0.403248,1.0,46.0,0.051582



Merging Datasets...
Final Merged Shape: (390000, 30)
Saving to combined_csv_files\combined_test_with_labels.csv.gz...
Save Complete.


Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0,ED,HWP,LQ80,...,late_over_early,tdrift99,tfr,peak_count,gbn,bpr,AvsE,GradAreaRatio,GradWidthMain,HFER
0,2395098_test_0,1167.174731,True,True,True,True,967,3407.0,2036.0,-1300536.0,...,0.987423,76.0,0.148591,1,1.282821,0.062497,0.553581,225.7453,69.0,0.034373
1,2395099_test_0,870.765543,False,True,True,False,960,3405.0,2019.0,-972982.2,...,0.987924,90.0,0.147334,1,1.20364,0.063886,0.57309,2120524000000.0,88.0,0.034307
2,2395100_test_0,582.980526,False,True,True,True,960,3412.0,2107.0,-639087.0,...,0.988065,100.0,0.13909,2,1.18932,0.057715,0.443685,178.4824,84.0,0.034885
3,2395101_test_0,238.918902,True,True,True,True,930,3408.0,2053.0,-276046.0,...,0.987626,110.0,0.138597,2,1.226275,0.067184,0.583046,1.0,53.0,0.035474
4,2395102_test_0,214.491195,False,True,True,True,924,3406.0,1939.0,-76111.88,...,0.986214,135.0,0.149185,13,1.203153,0.055602,0.403248,1.0,46.0,0.051582


In [10]:
# Read the combined test file back from disk to confirm that what was saved
# by the merge step loads cleanly.
df_test = pd.read_csv('combined_csv_files/combined_test_with_labels.csv.gz')
df_test

Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0,ED,HWP,LQ80,...,late_over_early,tdrift99,tfr,peak_count,gbn,bpr,AvsE,GradAreaRatio,GradWidthMain,HFER
0,2395098_test_0,1167.174731,True,True,True,True,967,3407.0,2036.0,-1.300536e+06,...,0.987423,76.0,0.148591,1,1.282821,0.062497,0.553581,2.257453e+02,69.0,0.034373
1,2395099_test_0,870.765543,False,True,True,False,960,3405.0,2019.0,-9.729822e+05,...,0.987924,90.0,0.147334,1,1.203640,0.063886,0.573090,2.120524e+12,88.0,0.034307
2,2395100_test_0,582.980526,False,True,True,True,960,3412.0,2107.0,-6.390870e+05,...,0.988065,100.0,0.139090,2,1.189320,0.057715,0.443685,1.784824e+02,84.0,0.034885
3,2395101_test_0,238.918902,True,True,True,True,930,3408.0,2053.0,-2.760460e+05,...,0.987626,110.0,0.138597,2,1.226275,0.067184,0.583046,1.000000e+00,53.0,0.035474
4,2395102_test_0,214.491195,False,True,True,True,924,3406.0,1939.0,-7.611188e+04,...,0.986214,135.0,0.149185,13,1.203153,0.055602,0.403248,1.000000e+00,46.0,0.051582
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389995,2785093_test_5,238.472881,False,True,True,False,940,3406.0,2081.0,-2.667054e+05,...,0.987577,111.0,0.144375,2,1.198436,0.062338,0.518905,1.000000e+00,73.0,0.034997
389996,2785094_test_5,452.840234,True,True,True,True,952,3406.0,2058.0,-5.194853e+05,...,0.988177,83.0,0.148058,3,2.268535,0.064586,0.590262,5.597816e+01,54.0,0.035930
389997,2785095_test_5,344.740556,True,True,True,True,948,3406.0,2036.0,-3.684673e+05,...,0.987142,125.0,0.140774,3,1.217066,0.062375,0.536752,1.000000e+00,82.0,0.034713
389998,2785096_test_5,163.807547,True,True,True,True,925,3406.0,2067.0,-1.903450e+05,...,0.989035,116.0,0.140754,2,0.975055,0.066704,0.577498,1.000000e+00,41.0,0.036159


In [11]:
# Summary statistics for the numeric columns of the combined test table.
# NOTE(review): in the output recorded here, HWP and PPR are distributed
# differently from the training table (median HWP 2066 vs 2325, median PPR
# 0.693 vs 0.721) while ED and AvsE agree closely — TODO confirm that the train
# and test features were extracted with the same code.
df_test.describe()

Unnamed: 0,energy_label,tp0,ED,HWP,LQ80,PPR,SCA,ND80,total_power,time_to_main_peak,...,late_over_early,tdrift99,tfr,peak_count,gbn,bpr,AvsE,GradAreaRatio,GradWidthMain,HFER
count,390000.0,390000.0,390000.0,390000.0,390000.0,390000.0,390000.0,390000.0,390000.0,390000.0,...,390000.0,390000.0,390000.0,390000.0,390000.0,390000.0,390000.0,390000.0,390000.0,390000.0
mean,636.420597,951.667418,3406.022251,2053.719892,-696182.9,0.689568,0.035721,0.002938,4135452000.0,108.175585,...,0.989258,112.998205,0.14947,5.092323,1.778528,0.060501,0.496437,406196400000.0,67.902682,0.036423
std,669.051804,18.715776,7.745847,130.787881,727529.1,0.020824,0.006291,0.040122,8268203000.0,34.692002,...,0.111025,72.299817,0.038967,7.320057,1.179948,0.021937,0.09169,1396748000000.0,24.343887,0.008374
min,0.0,105.0,3194.0,18.0,-5045836.0,0.119332,0.02906,0.0,9771.622,2.0,...,0.305589,31.0,0.026388,1.0,0.689202,0.036675,0.031747,1.0,5.0,0.028515
25%,238.364583,942.0,3404.0,2032.0,-694243.0,0.688435,0.034525,0.0,273496500.0,87.0,...,0.987355,86.0,0.140847,1.0,1.116218,0.055314,0.434686,1.0,51.0,0.034545
50%,341.175226,954.0,3407.0,2066.0,-376901.2,0.693434,0.034808,0.0,564525400.0,103.0,...,0.987902,103.0,0.14575,2.0,1.321458,0.061334,0.526955,1.0,65.0,0.035055
75%,624.133505,964.0,3410.0,2099.0,-265563.0,0.697469,0.035238,0.0,1877164000.0,122.0,...,0.988546,121.0,0.150928,5.0,1.9046,0.064239,0.565397,65.74662,82.0,0.03581
max,4177.604856,1351.0,3796.0,3794.0,62596.56,0.986577,0.192336,1.9611,52727430000.0,492.0,...,41.649999,2878.0,2.903741,53.0,14.526829,2.387268,0.895246,8081810000000.0,363.0,0.264706


In [12]:
# Count / unique / most-frequent value for the boolean columns, i.e. the class
# balance of the PSD label flags in the test table.
df_test.describe(include=['bool'])

Unnamed: 0,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq
count,390000,390000,390000,390000
unique,2,2,2,2
top,False,True,True,True
freq,215707,386840,382574,275772
