In [1]:
import os
from typing import Optional

import pandas as pd
from IPython.display import display

In [2]:
# Show the current working directory; the relative CSV paths used in the
# cells below are resolved against it.
print(os.getcwd())

/home/nbatjargal/private/capstone_project/Majorana-Neutrino-Hunt/extracted_features_csv_files


In [3]:
ls

combine_everyone.ipynb  eunice_csv_files/  labels_train.csv
combined_csv_files/     jade_csv_files/    nomin_csv_files/
description.txt         labels_test.csv    prithvi_csv_files/


In [7]:
def _rewrite_jade_ids(
    raw_ids: pd.Series,
    split: Optional[str],
    rows_per_file: int,
) -> pd.Series:
    """Convert Jade's ``id_<split>_<n>`` IDs to ``<n>_<split>_<file_idx>``."""
    if rows_per_file <= 0:
        raise ValueError(f"rows_per_file must be positive, got {rows_per_file}")
    if raw_ids.empty:
        return raw_ids.astype(str)

    # Named groups become the columns "split" and "num" of the result.
    parts = raw_ids.astype(str).str.extract(
        r"^id_(?P<split>[A-Za-z]+)_(?P<num>\d+)$"
    )
    malformed = parts["num"].isna()
    if malformed.any():
        raise ValueError(
            "Jade IDs must look like 'id_<split>_<number>'; "
            f"first offending value: {raw_ids[malformed].iloc[0]!r}"
        )

    if split is None:
        # Take the split token from the data instead of assuming "train".
        found = parts["split"].unique()
        if len(found) != 1:
            raise ValueError(
                f"Jade IDs mix several splits {sorted(found)}; pass split= explicitly"
            )
        split = found[0]

    number = parts["num"].astype(int)
    # Integer division gives the index of the source file that held this row.
    file_idx = number // rows_per_file
    return number.astype(str) + f"_{split}_" + file_idx.astype(str)


def process_and_merge_features(
    labels_path: str,
    eunice_path: str,
    nomin_path: str,
    prithvi_path: str,
    jade_path: str,
    output_filename: str,
    output_dir: str = "combined_csv_files",
    split: Optional[str] = None,
    rows_per_file: int = 65000,
) -> Optional[pd.DataFrame]:
    """
    Load the label CSV and the four feature CSVs, inner-join them on ``id``,
    and write the merged table as a gzip-compressed CSV.

    Jade's file uses a different ID scheme, so its ``id`` column is rewritten
    from ``id_<split>_<n>`` to ``<n>_<split>_<n // rows_per_file>`` before the
    merge.
    NOTE(review): only the ``id_train_`` form is confirmed by the training
    data; verify that the test files follow the same ``id_<split>_<n>`` shape.

    Parameters
    ----------
    labels_path, eunice_path, nomin_path, prithvi_path, jade_path : str
        Paths of the CSV files to load; each must contain an ``id`` column.
    output_filename : str
        File name of the merged CSV inside ``output_dir``.
    output_dir : str
        Directory for the output file; created if it does not exist.
    split : str, optional
        Token to embed in the rewritten Jade IDs (e.g. ``"train"``). When
        ``None`` it is taken from the prefix of Jade's raw IDs.
    rows_per_file : int
        Number of rows per source file, used to derive the file index that is
        appended to each Jade ID.

    Returns
    -------
    pd.DataFrame or None
        The merged frame, or ``None`` when any input file is missing (the
        error is printed rather than raised).

    Raises
    ------
    ValueError
        If Jade's IDs do not match ``id_<split>_<n>``, mix several splits
        while ``split`` is ``None``, or ``rows_per_file`` is not positive.
    """

    # Load Data
    print("=" * 50)
    print("Loading Files...")
    print("=" * 50)

    try:
        labels = pd.read_csv(labels_path)
        eunice = pd.read_csv(eunice_path)
        nomin = pd.read_csv(nomin_path)
        prithvi = pd.read_csv(prithvi_path)
        jade = pd.read_csv(jade_path)
    except FileNotFoundError as err:
        print(f"Error loading files: {err}")
        return None

    # Fix Jade IDs so they line up with the ID scheme of the other tables.
    print("Preprocessing Jade IDs...")
    jade["id"] = _rewrite_jade_ids(jade["id"], split, rows_per_file)

    # Preview Data
    data_map = {
        "Labels": labels,
        "Eunice": eunice,
        "Nomin": nomin,
        "Prithvi": prithvi,
        "Jade": jade,
    }

    for name, df in data_map.items():
        print(f"\n{name} — File Shape: {df.shape}")
        display(df.head())

    # Merge
    print("\n" + "=" * 50)
    print("Merging Datasets...")

    merged = labels
    for features in (eunice, nomin, prithvi, jade):
        merged = merged.merge(features, on="id", how="inner")

    print(f"Final Merged Shape: {merged.shape}")

    # An inner join silently drops IDs missing from any table (and duplicates
    # multiply rows), so surface any change in row count.
    if len(merged) != len(labels):
        print(
            f"Warning: merged row count ({len(merged)}) differs from labels "
            f"({len(labels)}); check for unmatched or duplicate IDs."
        )

    # Save Output
    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, output_filename)

    print(f"Saving to {out_path}...")
    merged.to_csv(out_path, index=False, compression="gzip")
    print("Save Complete.")

    return merged

## Combine Training Files

In [8]:
# Training-split input files, keyed by the keyword argument of
# process_and_merge_features that each one feeds.
paths_train = dict(
    labels_path="labels_train.csv",
    eunice_path="eunice_csv_files/train_csv/train_all_features.csv.gz",
    nomin_path="nomin_csv_files/combined_train_n.csv.gz",
    prithvi_path="prithvi_csv_files/train_2.csv.gz",
    jade_path="jade_csv_files/train_jade_features.csv",
)

# Join every feature table onto the labels and save the combined file.
merged_df_train = process_and_merge_features(
    output_filename="combined_train_with_labels.csv.gz",
    **paths_train,
)

# Preview the first rows of the merged training frame.
merged_df_train.head()

Loading Files...
Preprocessing Jade IDs...

Labels — File Shape: (1040000, 7)


Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0
0,0_train_0,582.364295,False,True,True,True,957
1,1_train_0,250.159995,False,True,True,True,948
2,2_train_0,1212.323954,False,True,False,True,965
3,3_train_0,240.87811,False,True,True,False,927
4,4_train_0,285.124189,False,True,True,False,958



Eunice — File Shape: (1040000, 6)


Unnamed: 0,id,ED,HWP,LQ80,PPR,SC
0,0_train_0,3409.0,2299.0,-717094.898532,0.719376,0.034655
1,1_train_0,3404.0,2446.0,-331957.541919,0.729709,0.035314
2,2_train_0,3411.0,2262.0,-425532.152706,0.71539,0.034915
3,3_train_0,3408.0,2833.0,-306980.459766,0.769375,0.034752
4,4_train_0,3406.0,2397.0,-362746.925366,0.728165,0.035132



Nomin — File Shape: (1040000, 9)


Unnamed: 0,id,current_skewness,spectral_centroid_power,tail_charge_diff,current_kurtosis,total_power,time_to_main_peak,time_to_peak,late_over_early
0,0_train_0,1.780811,107.276207,-0.598625,2.117825,1709302000.0,85,85,0.98791
1,1_train_0,1.756635,108.213621,-0.573984,2.058622,299137600.0,87,87,0.988301
2,2_train_0,1.079789,105.735183,-0.620023,-0.067003,624438500.0,95,95,0.987491
3,3_train_0,1.899438,107.946935,-0.571186,2.443885,281158300.0,116,116,0.98845
4,4_train_0,1.156442,106.350372,-0.563259,-0.17389,388544200.0,94,94,0.988541



Prithvi — File Shape: (1040000, 6)


Unnamed: 0,id,tdrift99,tfr,peak_count,gbn,bpr
0,0_train_0,85.0,0.142357,3,1.198436,0.059642
1,1_train_0,87.0,0.15116,3,1.381123,0.061975
2,2_train_0,95.0,0.142606,3,1.361857,0.050813
3,3_train_0,116.0,0.133192,2,1.165654,0.062954
4,4_train_0,94.0,0.145504,7,1.390918,0.055013



Jade — File Shape: (1040000, 5)


Unnamed: 0,id,AvsE,GradAreaRatio,GradWidthMain,HFER
0,0_train_0,0.519805,1.0,76.0,0.034772
1,1_train_0,0.531775,1.0,46.0,0.037621
2,2_train_0,0.343676,1.0,90.0,0.035552
3,3_train_0,0.483144,1.0,56.0,0.035093
4,4_train_0,0.377738,1.0,53.0,0.035971



Merging Datasets...
Final Merged Shape: (1040000, 29)
Saving to combined_csv_files/combined_train_with_labels.csv.gz...
Save Complete.


Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0,ED,HWP,LQ80,...,late_over_early,tdrift99,tfr,peak_count,gbn,bpr,AvsE,GradAreaRatio,GradWidthMain,HFER
0,0_train_0,582.364295,False,True,True,True,957,3409.0,2299.0,-717094.898532,...,0.98791,85.0,0.142357,3,1.198436,0.059642,0.519805,1.0,76.0,0.034772
1,1_train_0,250.159995,False,True,True,True,948,3404.0,2446.0,-331957.541919,...,0.988301,87.0,0.15116,3,1.381123,0.061975,0.531775,1.0,46.0,0.037621
2,2_train_0,1212.323954,False,True,False,True,965,3411.0,2262.0,-425532.152706,...,0.987491,95.0,0.142606,3,1.361857,0.050813,0.343676,1.0,90.0,0.035552
3,3_train_0,240.87811,False,True,True,False,927,3408.0,2833.0,-306980.459766,...,0.98845,116.0,0.133192,2,1.165654,0.062954,0.483144,1.0,56.0,0.035093
4,4_train_0,285.124189,False,True,True,False,958,3406.0,2397.0,-362746.925366,...,0.988541,94.0,0.145504,7,1.390918,0.055013,0.377738,1.0,53.0,0.035971


In [9]:
# Read the combined training CSV back from disk and display it.
df_train = pd.read_csv('combined_csv_files/combined_train_with_labels.csv.gz')
df_train

Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0,ED,HWP,LQ80,...,late_over_early,tdrift99,tfr,peak_count,gbn,bpr,AvsE,GradAreaRatio,GradWidthMain,HFER
0,0_train_0,582.364295,False,True,True,True,957,3409.0,2299.0,-717094.898532,...,0.987910,85.0,0.142357,3,1.198436,0.059642,0.519805,1.000000,76.0,0.034772
1,1_train_0,250.159995,False,True,True,True,948,3404.0,2446.0,-331957.541919,...,0.988301,87.0,0.151160,3,1.381123,0.061975,0.531775,1.000000,46.0,0.037621
2,2_train_0,1212.323954,False,True,False,True,965,3411.0,2262.0,-425532.152706,...,0.987491,95.0,0.142606,3,1.361857,0.050813,0.343676,1.000000,90.0,0.035552
3,3_train_0,240.878110,False,True,True,False,927,3408.0,2833.0,-306980.459766,...,0.988450,116.0,0.133192,2,1.165654,0.062954,0.483144,1.000000,56.0,0.035093
4,4_train_0,285.124189,False,True,True,False,958,3406.0,2397.0,-362746.925366,...,0.988541,94.0,0.145504,7,1.390918,0.055013,0.377738,1.000000,53.0,0.035971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1039995,1039995_train_15,210.834626,True,True,True,True,949,3403.0,2533.0,-278607.352936,...,0.985736,90.0,0.151747,6,1.008092,0.062825,0.577970,58.524510,57.0,0.034958
1039996,1039996_train_15,380.170340,True,True,True,True,949,3404.0,2426.0,-477777.966558,...,0.987814,79.0,0.150305,3,1.245492,0.062706,0.643747,80.265957,45.0,0.033676
1039997,1039997_train_15,370.109563,False,True,True,True,947,3409.0,2417.0,-466589.983952,...,0.987833,102.0,0.142950,2,1.149456,0.062153,0.480104,1.000000,59.0,0.035684
1039998,1039998_train_15,98.258524,True,True,True,False,921,3400.0,3799.0,-143212.214717,...,0.989948,92.0,0.153135,5,1.192639,0.065264,0.483468,1.000000,97.0,0.034234


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# reloaded training frame.
df_train.describe()

Unnamed: 0,energy_label,tp0,ED,HWP,LQ80,PPR,SC,current_skewness,spectral_centroid_power,tail_charge_diff,...,late_over_early,tdrift99,tfr,peak_count,gbn,bpr,AvsE,GradAreaRatio,GradWidthMain,HFER
count,1040000.0,1040000.0,1040000.0,1039995.0,1040000.0,1039995.0,1040000.0,1040000.0,1040000.0,1040000.0,...,1040000.0,1039995.0,1040000.0,1040000.0,1040000.0,1040000.0,1040000.0,1040000.0,1040000.0,1040000.0
mean,638.1633,951.6434,3406.036,2423.961,1.583331e+124,0.7254552,0.03572992,1.906792,114.5299,-0.5468692,...,0.989365,113.1437,0.1495163,5.104236,1.777303,0.06049704,0.4964875,409133700000.0,67.92718,0.03643588
std,670.4271,18.95332,7.735526,392.52,1.614687e+127,0.02832694,0.006330801,0.5312883,106.9477,1.561871,...,0.1121635,72.87391,0.03987192,7.352031,1.174284,0.02152323,0.09173632,1401446000000.0,24.42059,0.008420477
min,0.0,71.0,3074.0,2.0,-5949322.0,-7.703542,0.02676319,-1.809541,89.86718,-8.692308,...,0.1302615,9.0,0.01968922,1.0,0.6992497,0.0363653,0.03150645,1.0,5.0,0.02650746
25%,238.3612,942.0,3404.0,2191.0,-777411.7,0.7088948,0.03452545,1.584701,106.7439,-0.6236,...,0.9873562,86.0,0.140839,1.0,1.116426,0.05533778,0.4353313,1.0,51.0,0.03454584
50%,341.1866,954.0,3407.0,2325.0,-427956.0,0.7211736,0.03480813,1.93916,108.6075,-0.5968707,...,0.9879051,102.0,0.1457415,2.0,1.32193,0.06133205,0.5268103,1.0,65.0,0.03505329
75%,630.0128,964.0,3410.0,2510.0,-306392.1,0.7361495,0.0352403,2.253204,110.3433,-0.5640222,...,0.988553,121.0,0.1509235,5.0,1.908089,0.06423424,0.5653386,65.88437,82.0,0.03580735
max,4998.508,1352.0,3790.0,3799.0,1.646664e+130,0.9768917,0.2028693,5.229496,16374.42,311.25,...,46.40625,2919.0,3.990051,55.0,36.67323,2.569411,0.9706884,7968905000000.0,391.0,0.2743541


## Combine Test Files

In [None]:
# Test-split input files, keyed by the keyword argument of
# process_and_merge_features that each one feeds.
# TODO: eunice_path still ends in a literal "..." placeholder; replace it
# with the real file name before running this cell.
paths_test = dict(
    labels_path="labels_test.csv",
    eunice_path="eunice_csv_files/test_csv/...",
    nomin_path="nomin_csv_files/combined_test_n.csv.gz",
    prithvi_path="prithvi_csv_files/test_2.csv.gz",
    jade_path="jade_csv_files/test_jade_features.csv",
)

# Join every feature table onto the labels and save the combined file.
merged_df_test = process_and_merge_features(
    output_filename="combined_test_with_labels.csv.gz",
    **paths_test,
)

# Preview the first rows of the merged test frame.
merged_df_test.head()

In [None]:
# Read the combined test CSV back from disk and display it.
df_test = pd.read_csv('combined_csv_files/combined_test_with_labels.csv.gz')
df_test

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# reloaded test frame.
df_test.describe()