# Convert calibrated ROOT files to parquet

In [1]:
import sys
from pathlib import Path

import dotenv
import pandas as pd
from pandarallel import pandarallel

# The project's `convert` module is imported from the sibling "magicdl"
# directory (relative to the notebook's working directory), so that directory
# has to be on sys.path before the import below.
magicdl_dir = Path.cwd().parent / "magicdl"
sys.path.append(str(magicdl_dir))
import convert

# Load environment settings through python-dotenv, then start the pandarallel
# worker pool used by the `parallel_apply` calls further down.
dotenv.load_dotenv()
pandarallel.initialize(progress_bar=True, nb_workers=60)


INFO: Pandarallel will run on 60 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Gammas

In [2]:
# Input location of the calibrated gamma ROOT files; the M1 and M2 files are
# kept in separate sub-directories.
gamma_dir = Path("/mnt/scratch/jgreen/MAGIC_MC/root/gammas/ST0307/za05to35/weak").absolute()
gamma_m1 = gamma_dir / "Calibrated_M1"
gamma_m2 = gamma_dir / "Calibrated_M2"

# Destination directory for the converted gamma files; created if missing.
output_dir = Path("/mnt/scratch/jgreen/MAGIC_MC/DHBW-parquet/gammas").absolute()
output_dir.mkdir(exist_ok=True, parents=True)

In [3]:
# Collect the ROOT files of both telescopes. The lists are sorted by path and
# then paired positionally: the i-th M1 file goes with the i-th M2 file.
gamma_m1_files = sorted(gamma_m1.glob("*.root"))
gamma_m2_files = sorted(gamma_m2.glob("*.root"))

# Fail early with a readable message. An empty glob usually means a wrong
# directory name, and unequal counts would otherwise surface as a generic
# "arrays must be of the same length" error from the DataFrame constructor.
if not gamma_m1_files:
    raise FileNotFoundError(f"No *.root files found in {gamma_m1}")
if len(gamma_m1_files) != len(gamma_m2_files):
    raise ValueError(
        f"Number of M1 and M2 files differs: "
        f"{len(gamma_m1_files)} in {gamma_m1} vs {len(gamma_m2_files)} in {gamma_m2}"
    )

# One row per M1/M2 file pair.
gamma_df = pd.DataFrame({
    "m1_file": gamma_m1_files,
    "m2_file": gamma_m2_files,
})

# The run number is parsed from the fifth "_"-separated field of the file stem.
gamma_runs_m1 = gamma_df["m1_file"].apply(lambda f: int(f.stem.split("_")[4]))
gamma_runs_m2 = gamma_df["m2_file"].apply(lambda f: int(f.stem.split("_")[4]))

# Positional pairing is only valid if both files of every row belong to the
# same run; report the offending pairs instead of a bare assertion failure.
gamma_mismatched = gamma_df.loc[gamma_runs_m1 != gamma_runs_m2, ["m1_file", "m2_file"]]
assert gamma_mismatched.empty, (
    f"{len(gamma_mismatched)} M1/M2 file pair(s) have different run numbers, e.g.\n"
    f"{gamma_mismatched.head().to_string()}"
)

# Keep a single run-number column (columns: m1_file, m2_file, run_number).
gamma_df["run_number"] = gamma_runs_m1

In [4]:
# Run convert.process_row on every row of gamma_df (one M1/M2 file pair per
# row) using the pandarallel worker pool. The column names tell process_row
# where to find the run number and the two file paths in the row; it is
# presumably responsible for writing its result into output_dir.
gamma_columns = {
    "run_number_col": "run_number",
    "m1_filepath_col": "m1_file",
    "m2_filepath_col": "m2_file",
}
results = gamma_df.parallel_apply(
    lambda row: convert.process_row(row, output_dir, **gamma_columns),
    axis=1,
)

# Short summary of the conversion run
print(f"Processed {len(results)} file pairs")
print(f"Output files saved to: {output_dir}")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=84), Label(value='0 / 84'))), HBox…

Processed 4999 file pairs
Output files saved to: /mnt/scratch/jgreen/MAGIC_MC/DHBW-parquet/gammas


In [5]:
# Read everything under the gamma output directory into one frame and write it
# back as a single parquet file placed next to that directory.
gamma_filepath = "/mnt/scratch/jgreen/MAGIC_MC/DHBW-parquet/gammas"
gamma_merged_path = Path(gamma_filepath).parent / "MAGIC_gammas.parquet"

g = pd.read_parquet(gamma_filepath)
g.to_parquet(gamma_merged_path)
print(f"Saved {len(g)} rows to {gamma_merged_path}")

## Protons

In [19]:
# Input location of the calibrated proton ROOT files; the M1 and M2 files are
# kept in separate sub-directories.
proton_dir = Path("/mnt/scratch/jgreen/MAGIC_MC/root/protons").absolute()
proton_m1 = proton_dir / "calibrated_M1"
proton_m2 = proton_dir / "calibrated_M2"

# Destination directory for the converted proton files; created if missing.
proton_output_dir = Path("/mnt/scratch/jgreen/MAGIC_MC/DHBW-parquet/protons").absolute()
proton_output_dir.mkdir(exist_ok=True, parents=True)

# Collect the ROOT files of both telescopes. The lists are sorted by path and
# then paired positionally: the i-th M1 file goes with the i-th M2 file.
proton_m1_files = sorted(proton_m1.glob("*.root"))
proton_m2_files = sorted(proton_m2.glob("*.root"))

# Fail early with a readable message. An empty glob usually means a wrong
# directory name, and unequal counts would otherwise surface as a generic
# "arrays must be of the same length" error from the DataFrame constructor.
if not proton_m1_files:
    raise FileNotFoundError(f"No *.root files found in {proton_m1}")
if len(proton_m1_files) != len(proton_m2_files):
    raise ValueError(
        f"Number of M1 and M2 files differs: "
        f"{len(proton_m1_files)} in {proton_m1} vs {len(proton_m2_files)} in {proton_m2}"
    )

# One row per M1/M2 file pair.
proton_df = pd.DataFrame({
    "m1_file": proton_m1_files,
    "m2_file": proton_m2_files,
})

# The run number is parsed from the fifth "_"-separated field of the file stem.
proton_runs_m1 = proton_df["m1_file"].apply(lambda f: int(f.stem.split("_")[4]))
proton_runs_m2 = proton_df["m2_file"].apply(lambda f: int(f.stem.split("_")[4]))

# Positional pairing is only valid if both files of every row belong to the
# same run; report the offending pairs instead of a bare assertion failure.
proton_mismatched = proton_df.loc[proton_runs_m1 != proton_runs_m2, ["m1_file", "m2_file"]]
assert proton_mismatched.empty, (
    f"{len(proton_mismatched)} M1/M2 file pair(s) have different run numbers, e.g.\n"
    f"{proton_mismatched.head().to_string()}"
)

# Keep a single run-number column (columns: m1_file, m2_file, run_number).
proton_df["run_number"] = proton_runs_m1



In [21]:
# Run convert.process_row on every row of proton_df (one M1/M2 file pair per
# row) using the pandarallel worker pool. The column names tell process_row
# where to find the run number and the two file paths in the row; it is
# presumably responsible for writing its result into proton_output_dir.
proton_columns = {
    "run_number_col": "run_number",
    "m1_filepath_col": "m1_file",
    "m2_filepath_col": "m2_file",
}
proton_results = proton_df.parallel_apply(
    lambda row: convert.process_row(row, proton_output_dir, **proton_columns),
    axis=1,
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=167), Label(value='0 / 167'))), HB…

In [None]:
# Read everything under the proton output directory into one frame and write
# it back as a single parquet file placed next to that directory.
proton_filepath = "/mnt/scratch/jgreen/MAGIC_MC/DHBW-parquet/protons"
proton_merged_path = Path(proton_filepath).parent / "MAGIC_protons.parquet"

p = pd.read_parquet(proton_filepath)
p.to_parquet(proton_merged_path)
print(f"Saved {len(p)} rows to {proton_merged_path}")

# Extract info + stats

In [2]:
# Merged gamma / proton parquet files used for the statistics below.
# NOTE(review): location and file names differ from the MAGIC_gammas.parquet /
# MAGIC_protons.parquet files written by the conversion cells -- presumably
# the merged files were copied and renamed afterwards; confirm.
gamma_file = "/remote/lstdata01/jgreen/MAGIC_MC/DHBW-parquet/magic-gammas.parquet"
proton_file = "/remote/lstdata01/jgreen/MAGIC_MC/DHBW-parquet/magic-protons.parquet"

# Load both files (gammas first, then protons) and report their sizes.
g, p = map(pd.read_parquet, (gamma_file, proton_file))

print(f"Gamma file has {len(g)} rows")
print(f"Proton file has {len(p)} rows")

Gamma file has 374069 rows
Proton file has 105174 rows


In [7]:
# Show the first gamma row column by column: values that have a length are
# summarised by that length, everything else is printed as-is.
ex = g.iloc[0]

for col, value in ex.items():
    if not hasattr(value, "__len__"):
        print(f"- {col:40}: {value}")
        continue
    print(f"- {col:40}: len[{len(value)}]")


- event_number                            : 8
- run_number                              : 821319
- true_energy                             : 256.48870849609375
- true_theta                              : 0.4074477553367615
- true_phi                                : 1.9105716943740845
- true_telescope_theta                    : 0.40294599533081055
- true_telescope_phi                      : 1.8970308303833008
- true_first_interaction_height           : 6084142.5
- hillas_length_m1                        : 38.546714782714844
- hillas_width_m1                         : 14.875171661376953
- hillas_delta_m1                         : 0.9979684352874756
- hillas_size_m1                          : 100.140625
- hillas_cog_x_m1                         : -49.29415512084961
- hillas_cog_y_m1                         : -127.39347839355469
- hillas_sin_delta_m1                     : 0.8403716087341309
- hillas_cos_delta_m1                     : 0.5420106649398804
- hillas_length_m2                  