In [1]:
import os
import pandas as pd

# Folder holding the per-feature CSVs. A relative value is resolved against the
# kernel's current working directory.
# NOTE(review): the recorded output shows this folder living under
# notebooks/finalcsveunice, so from the repo root the value is presumably
# "notebooks/finalcsveunice" -- confirm.
FEATURE_DIR = "."  # set to "finalcsveunice" if running from repo root

# Feature files to combine; all of them are joined on their shared "id" column.
train_feature_files = [
    "ED_train_all.csv",
    "HWP_train_all.csv",
    "LQ80_train_all.csv",
    "PPR_train_all.csv",
    "SCA_train_all.csv",
    "ND80_train_all.csv"
]

# Load first file
df_train = pd.read_csv(os.path.join(FEATURE_DIR, train_feature_files[0]))
print("Loaded:", train_feature_files[0], df_train.shape)

# Merge remaining
for fname in train_feature_files[1:]:
    path = os.path.join(FEATURE_DIR, fname)
    temp = pd.read_csv(path)
    print("Merging:", fname, temp.shape)
    rows_before = len(df_train)
    # validate="one_to_one" makes pandas raise MergeError when "id" is duplicated
    # on either side, instead of silently multiplying rows in the merged frame.
    df_train = df_train.merge(temp, on="id", how="inner", validate="one_to_one")
    # An inner join keeps only ids present in both frames; report any loss
    # instead of letting rows disappear unnoticed.
    lost_left = rows_before - len(df_train)
    lost_right = len(temp) - len(df_train)
    if lost_left or lost_right:
        print(
            f"WARNING: merging {fname} dropped {lost_left} accumulated row(s) "
            f"and {lost_right} row(s) from {fname} (ids without a match)"
        )

print("\nFinal merged TRAIN shape:", df_train.shape)
print(df_train.head())

# Save gzipped
train_out = os.path.join(FEATURE_DIR, "train_all_features.csv.gz")
df_train.to_csv(train_out, index=False, compression="gzip")

print("\nSaved gzipped TRAIN CSV to:")
print(os.path.abspath(train_out))


Loaded: ED_train_all.csv (1040000, 2)
Merging: HWP_train_all.csv (1040000, 2)
Merging: LQ80_train_all.csv (1040000, 2)
Merging: PPR_train_all.csv (1040000, 2)
Merging: SCA_train_all.csv (1040000, 2)
Merging: ND80_train_all.csv (1040000, 2)

Final merged TRAIN shape: (1040000, 7)
          id      ED     HWP           LQ80       PPR       SCA  ND80
0  0_train_0  3409.0  2120.0 -635333.796311  0.699672  0.034655   0.0
1  1_train_0  3404.0  2004.0 -289756.893085  0.687174  0.035314   0.0
2  2_train_0  3411.0  2125.0 -379843.029134  0.700985  0.034915   0.0
3  3_train_0  3408.0  2098.0 -252673.626844  0.697850  0.034752   0.0
4  4_train_0  3406.0  2037.0 -317761.453582  0.690585  0.035132   0.0

Saved gzipped TRAIN CSV to:
c:\Users\YooNi\OneDrive\Desktop\Majorana-Neutrino-Hunt\notebooks\finalcsveunice\train_all_features.csv.gz


In [2]:
import os
# Show the kernel's working directory; the relative FEATURE_DIR used by the
# other cells is resolved against it.
working_dir = os.getcwd()
print(working_dir)


c:\Users\YooNi\OneDrive\Desktop\Majorana-Neutrino-Hunt\notebooks\finalcsveunice


In [9]:
import os
import pandas as pd

# Folder holding the per-feature CSVs. A relative value is resolved against the
# kernel's current working directory.
# NOTE(review): the recorded output shows this folder living under
# notebooks/finalcsveunice, so from the repo root the value is presumably
# "notebooks/finalcsveunice" -- confirm.
FEATURE_DIR = "."  # set to "finalcsveunice" if running from repo root

# Feature files to combine; all of them are joined on their shared "id" column.
test_feature_files = [
    "ED_test_all.csv",
    "HWP_test_all.csv",
    "LQ80_test_all.csv",
    "PPR_test_all.csv",
    "SCA_test_all.csv",
    "ND80_test_all.csv"
]

# Load first file
df_test = pd.read_csv(os.path.join(FEATURE_DIR, test_feature_files[0]))
print("Loaded:", test_feature_files[0], df_test.shape)

# Merge remaining
for fname in test_feature_files[1:]:
    path = os.path.join(FEATURE_DIR, fname)
    temp = pd.read_csv(path)
    print("Merging:", fname, temp.shape)
    rows_before = len(df_test)
    # validate="one_to_one" makes pandas raise MergeError when "id" is duplicated
    # on either side, instead of silently multiplying rows in the merged frame.
    df_test = df_test.merge(temp, on="id", how="inner", validate="one_to_one")
    # An inner join keeps only ids present in both frames; report any loss
    # instead of letting rows disappear unnoticed.
    lost_left = rows_before - len(df_test)
    lost_right = len(temp) - len(df_test)
    if lost_left or lost_right:
        print(
            f"WARNING: merging {fname} dropped {lost_left} accumulated row(s) "
            f"and {lost_right} row(s) from {fname} (ids without a match)"
        )

print("\nFinal merged TEST shape:", df_test.shape)
print(df_test.head())

# Save gzipped
test_out = os.path.join(FEATURE_DIR, "test_all_features.csv.gz")
df_test.to_csv(test_out, index=False, compression="gzip")

print("\nSaved gzipped TEST CSV to:")
print(os.path.abspath(test_out))


Loaded: ED_test_all.csv (390000, 2)
Merging: HWP_test_all.csv (390000, 2)
Merging: LQ80_test_all.csv (390000, 2)
Merging: PPR_test_all.csv (390000, 2)
Merging: SCA_test_all.csv (390000, 2)
Merging: ND80_test_all.csv (390000, 2)

Final merged TEST shape: (390000, 7)
               id      ED     HWP          LQ80       PPR       SCA  ND80
0  2395098_test_0  3407.0  2036.0 -1.300536e+06  0.692435  0.034359   0.0
1  2395099_test_0  3405.0  2019.0 -9.729822e+05  0.690517  0.034267   0.0
2  2395100_test_0  3412.0  2107.0 -6.390870e+05  0.700524  0.034659   0.0
3  2395101_test_0  3408.0  2053.0 -2.760460e+05  0.690450  0.034947   0.0
4  2395102_test_0  3406.0  1939.0 -7.611188e+04  0.677887  0.040895   0.0

Saved gzipped TEST CSV to:
c:\Users\YooNi\OneDrive\Desktop\Majorana-Neutrino-Hunt\notebooks\finalcsveunice\test_all_features.csv.gz


In [10]:
print("\nMISSING VALUE CHECK")


def _report_missing(label, frame):
    """Print a missing-value summary for one dataset and return the counts.

    Parameters
    ----------
    label : str
        Dataset name used in the printed messages (e.g. "TRAIN").
    frame : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    tuple
        ``(total_missing, missing_by_col)`` where ``missing_by_col`` is the
        per-column NaN count and ``total_missing`` is its sum.
    """
    print(f"\n{label} dataset")
    # Scan the frame once; the grand total is just the sum of the per-column
    # counts, so a second isna() pass over the data is unnecessary.
    missing_by_col = frame.isna().sum()
    total_missing = missing_by_col.sum()
    print("Total missing values:", total_missing)

    print(f"\nMissing values per column ({label}):")
    # Only list the columns that actually contain missing values.
    print(missing_by_col[missing_by_col > 0])

    if total_missing == 0:
        print(f"\n{label}: no missing values")
    else:
        print(f"\n{label}: missing values detected")
    return total_missing, missing_by_col


# ---- TRAIN ----
total_missing_train, missing_by_col_train = _report_missing("TRAIN", df_train)

# ---- TEST ----
total_missing_test, missing_by_col_test = _report_missing("TEST", df_test)



MISSING VALUE CHECK

TRAIN dataset
Total missing values: 0

Missing values per column (TRAIN):
Series([], dtype: int64)

TRAIN: no missing values

TEST dataset
Total missing values: 0

Missing values per column (TEST):
Series([], dtype: int64)

TEST: no missing values
