We want to look into the data structure and how everything is stored. We have 495 parquet files, each separated into "mc_truth" and "photons". Each file contains 200 neutrino injection events.

We look at "photons" first

In [None]:
import pandas as pd
import pprint

# First hits file of the simulation; used here only to eyeball the layout.
file_path = "/ceph/work/SATORI/alex/sim_new/sim_3/output/hits/photon_000.parquet"
df = pd.read_parquet(file_path)

# Wide, non-compact printer so each nested record is laid out readably.
pp = pprint.PrettyPrinter(width=200, compact=False)

# Walk the "photons" column in row order and dump every event's record.
for i, event_photons in enumerate(df["photons"]):
    print(f"\n===== EVENT {i} =====")
    pp.pprint(event_photons)


Now we look at "mc_truth". We can see true energy in here. That will be our y

In [None]:
import pandas as pd
import pprint
                                                                                                                                                                    
# Same file as the photon inspection; this time the "mc_truth" column is shown.
file_path = "/ceph/work/SATORI/alex/sim_new/sim_3/output/hits/photon_000.parquet"
df = pd.read_parquet(file_path)

# Wide, non-compact printer so each nested record is laid out readably.
pp = pprint.PrettyPrinter(width=200, compact=False)

# Walk the "mc_truth" column in row order and dump every event's record.
for i, event_truth in enumerate(df["mc_truth"]):
    print(f"\n===== EVENT {i} =====")
    pp.pprint(event_truth)


We want to know how many hits we have and what's the spread of the hits. Good info to have. 

In [None]:
#!/usr/bin/env python3
import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm

# Directory searched for hit files, and the glob pattern selecting them
# (both are consumed by count_hits below).
DATA_DIR = "/ceph/work/SATORI/alex/sim_new/sim_3/output/hits"
PATTERN = "photon_*.parquet"

def count_hits():
    """Print summary statistics of photon hits per event over all hit files.

    Every file matching ``PATTERN`` under ``DATA_DIR`` is read (only its
    "photons" column), and the length of each event's ``"t"`` entry is taken
    as that event's hit count. The function prints:

    * total number of events, number and fraction of events with >= 1 hit,
    * total hits, and mean / median hits over events that have hits,
    * a coarse histogram of hits per event, plus an overflow line when any
      event exceeds the last histogram edge.

    Returns:
        None. All results are written to stdout. If no files match, or the
        files contain no events, a message is printed and the function
        returns early.
    """
    files = sorted(glob.glob(os.path.join(DATA_DIR, PATTERN)))
    if not files:
        print("No parquet files found.")
        return

    hits_per_event = []
    for f in tqdm(files, desc="Processing files"):
        df = pd.read_parquet(f, columns=["photons"])
        hits_per_event.extend(len(photons["t"]) for photons in df["photons"])

    # Explicit integer dtype so an empty list does not become a float array.
    hits_per_event = np.array(hits_per_event, dtype=np.int64)
    total_events = len(hits_per_event)
    if total_events == 0:
        # Files exist but hold no rows; avoid dividing by zero below.
        print("No events found in parquet files.")
        return

    nonzero_hits = hits_per_event[hits_per_event > 0]
    events_with_hits = len(nonzero_hits)
    total_hits = int(nonzero_hits.sum())

    print("\n===== SUMMARY =====")
    print("Total events:", total_events)
    # "\u2265" is the greater-than-or-equal sign, followed by the digit 1.
    print("Events with \u22651 hit:", events_with_hits)
    print("Fraction with hits:", events_with_hits / total_events)
    print("Total photon hits:", total_hits)
    # Mean and median are taken over events with at least one hit only.
    print("Average hits per event:", nonzero_hits.mean() if events_with_hits else 0)
    print("Median hits per event:", np.median(nonzero_hits) if events_with_hits else 0)

    # Coarse histogram. Bins are half-open [start, end) except the last one,
    # which also includes its right edge.
    bins = [0, 1, 10, 50, 100, 500, 1000, 5000, 10000]
    hist, bin_edges = np.histogram(hits_per_event, bins=bins)
    print("\nHits per event histogram (bin counts):")
    for b_start, b_end, count in zip(bin_edges[:-1], bin_edges[1:], hist):
        print(f"{int(b_start):5d} - {int(b_end):5d}: {count}")

    # np.histogram silently drops values beyond the last edge; report them
    # so the bin counts visibly add up to the total number of events.
    overflow = int((hits_per_event > bins[-1]).sum())
    if overflow:
        print(f"      > {bins[-1]:5d}: {overflow}")

# Run the hit-count summary only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    count_hits()


To build the CNN, we need to decide the size of our tensor. Let's look at the time spread of our hits to determine this.

In [None]:
import numpy as np
import pandas as pd
import glob, os
from tqdm import tqdm

# Directory holding the hit files; every "photon_*.parquet" file is scanned.
DATA_DIR = "/ceph/work/SATORI/alex/sim_new/sim_3/output/hits"
files = sorted(glob.glob(os.path.join(DATA_DIR, "photon_*.parquet")))

# Per-event time spread (latest hit time minus earliest hit time),
# collected only for events that have at least one hit.
durations = []

# Iterate over all files rather than a hard-coded slice, so the statistics
# stay complete if the number of files changes.
for f in tqdm(files):
    df = pd.read_parquet(f, columns=["photons"])
    for ph in df["photons"]:
        # Coerce to an array so .max()/.min() work for list-like input too.
        t = np.asarray(ph["t"])
        if t.size > 0:
            durations.append(t.max() - t.min())

durations = np.array(durations)
print("Number of events with hits:", len(durations))
if durations.size == 0:
    # min/max/percentile raise on an empty array; report it instead.
    print("No events with hits found; duration statistics are unavailable.")
else:
    print("Min duration:", durations.min())
    print("Max duration:", durations.max())
    print("Mean duration:", durations.mean())
    print("Median duration:", np.median(durations))
    print("95th percentile duration:", np.percentile(durations, 95))
    print("99th percentile duration:", np.percentile(durations, 99))


We see that the 99th percentile is 3348; everything greater is an extreme case. We choose 3500 as T_max.