In [5]:
from pathlib import Path
import pandas as pd
import re

# --- Input/output locations: edit these to point at your per-file CSVs ---
# All three paths are relative, so they resolve against the kernel's current
# working directory at the time this cell runs.
TRAIN_DIR, TEST_DIR = Path("data/params_train"), Path("data/params_test")

# Destination folder for the merged CSVs; created (with parents) if it is missing.
OUT_DIR = Path("extracted_features_csv_files/prithvi_csv_files")
OUT_DIR.mkdir(exist_ok=True, parents=True)

def file_no_from_name(path: Path) -> int:
    """Return the first underscore-prefixed integer found in the file's stem.

    Handles names such as ``MJD_Train_0_myparams.csv``, ``Train_0.csv`` or any
    ``*_0_*.csv``. Only ``path.stem`` is searched, so parent directories and the
    extension are ignored. Falls back to ``0`` when the stem has no ``_<digits>``
    run.
    """
    match = re.search(r"_(\d+)", path.stem)
    if match is None:
        return 0
    return int(match.group(1))

def merge_split(split_dir: Path, split_label: str, out_path: Path):
    """Merge every per-file CSV in ``split_dir`` into a single CSV at ``out_path``.

    Files are processed in ascending file-number order (as extracted by
    ``file_no_from_name``), with the file name as a tie-breaker so the row order
    of the merged file is reproducible. Each row's ``id`` is rewritten to
    ``"<original id>_<split_label>_<file number>"`` and any ``file`` /
    ``filename`` columns are dropped before concatenation.

    Args:
        split_dir: Directory searched (non-recursively) for ``*.csv`` files.
        split_label: Label embedded in every rewritten id, e.g. ``"train"``.
        out_path: Destination of the merged CSV (written without the index).

    Raises:
        FileNotFoundError: If no CSV files are found in ``split_dir``.
        KeyError: If one of the CSV files has no ``id`` column.
    """
    # Sorting by (number, name) instead of number alone keeps the order stable
    # when two files yield the same number (e.g. names without any "_<digits>").
    files = sorted(
        split_dir.glob("*.csv"),
        key=lambda p: (file_no_from_name(p), p.name),
    )
    if not files:
        raise FileNotFoundError(f"No CSV files found in {split_dir.resolve()}")

    frames = []
    for f in files:
        fileno = file_no_from_name(f)
        df = pd.read_csv(f)
        if "id" not in df.columns:
            # Same exception type pandas would raise, but naming the offending file.
            raise KeyError(f"'id' column missing in {f}")

        # Tag each id with the split and file number; ids from different files
        # stay distinct as long as their file numbers differ.
        original_id = df["id"].astype(str).str.strip()
        df["id"] = original_id + f"_{split_label}_{fileno}"

        # Remove any old per-file identifier columns (absent columns are skipped).
        df = df.drop(columns=["file", "filename"], errors="ignore")
        frames.append(df)

    out_df = pd.concat(frames, ignore_index=True)
    out_df.to_csv(out_path, index=False)
    print(f"Saved {split_label} -> {out_path} | rows={len(out_df)} cols={len(out_df.columns)}")

# Build one merged CSV per split inside OUT_DIR.
merge_split(split_dir=TRAIN_DIR, split_label="train", out_path=OUT_DIR / "train_2.csv")
merge_split(split_dir=TEST_DIR, split_label="test", out_path=OUT_DIR / "test_2.csv")


Saved train -> extracted_features_csv_files/prithvi_csv_files/train_2.csv | rows=1040000 cols=6
Saved test -> extracted_features_csv_files/prithvi_csv_files/test_2.csv | rows=390000 cols=6


In [2]:
import os
# Display the kernel's current working directory; relative paths resolve against it.
os.getcwd()


'/Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt/notebooks'

In [3]:
import os
from pathlib import Path

# Switch to the repository root so relative paths such as "data/params_train"
# resolve. The root is derived from the current location rather than a
# hard-coded, machine-specific absolute path.
# Assumes this notebook is launched from <repo>/notebooks; if the kernel starts
# elsewhere, no directory change is made. The name check also makes re-running
# this cell a no-op once we are already at the root.
if Path.cwd().name == "notebooks":
    os.chdir(Path.cwd().parent)


In [7]:
# Re-check the current working directory.
os.getcwd()



'/Users/prithvikochhar/Documents/GitHub/Majorana-Neutrino-Hunt'