In [1]:
import json
from collections import defaultdict
import ROOT

OBJ: TStyle	ildStyle	ILD Style : 0 at: 0x7799a90


In [2]:
# Switch on ROOT's implicit multi-threading, requesting 12 threads.
# NOTE(review): the thread count is hard-coded; confirm it suits the machine this runs on.
ROOT.EnableImplicitMT(12)

In [3]:
def process_path(path: str) -> tuple[str, str, str]:
    """Extract process name and beam polarisations from an mc-2020 file path.

    The mc-2020 file names follow a fixed, dot-separated naming scheme, e.g.
    rv02-02.sv02-02.mILD_l5_o1_v02.E250-SetA.I500102.P4f_sze_sl.eL.pR.n024.d_dstm_15180_122_mini-DST.edm4hep.root
    <reco v>.<sim v>.<det model>.<machine setting>.<some id>.<process name>.<e pol>.<p pol>.<... other stuff

    Args:
        path: file path; only the part after the last "/" is inspected.

    Returns:
        Tuple ``(process_name, e_pol, p_pol)``, where ``process_name`` has the
        single leading "P" marker removed (e.g. ``("4f_sze_sl", "eL", "pR")``).

    Raises:
        IndexError: if the file name has fewer than eight dot-separated fields.
    """
    _, _, fname = path.rpartition("/")
    parts = fname.split(".")
    # removeprefix drops exactly one "P"; lstrip("P") would also eat any
    # further leading "P" characters that belong to the process name itself
    process_name = parts[5].removeprefix("P")
    e_pol = parts[6]
    p_pol = parts[7]
    return process_name, e_pol, p_pol

In [4]:
from collections import namedtuple

# Per-sample bookkeeping record with two fields: the integrated luminosity
# (int_lumi) and the number of events (n_events).
Stats = namedtuple("Stats", "int_lumi n_events")


class Dataset:
    """Initialise a dataset from a list of file paths in ILD mc-2020 production notation.

    The input is a text file with one file path per line. Paths are grouped as
    ``{process_name: {polarisation: [path1, path2, ...]}}`` where the
    polarisation key is the concatenation of the electron and positron
    polarisation fields of the file name (e.g. ``"eLpR"``).
    """

    # TODO use better typed input
    def __init__(self, input_path: str):
        """Read ``input_path`` and register every non-empty line as a file path.

        Raises:
            OSError: if ``input_path`` cannot be opened.
            IndexError: if a line does not follow the naming scheme expected
                by ``process_path``.
        """
        # we want to have something like {"process": {"pol: [path1, path2]"}}
        # This has to live on the instance: a class-level container would be
        # shared by (and accumulate the files of) every Dataset ever created.
        self._dataset = defaultdict(lambda: defaultdict(list))
        with open(input_path) as file:
            for line in file:
                path = line.strip()
                # tolerate blank lines (e.g. a trailing newline at the end of the list)
                if not path:
                    continue
                self.add_file(path)

    def get_dataset(self):
        """Return the grouping as plain nested dicts.

        Both nesting levels are converted back to regular dicts, so looking up
        an unknown process or polarisation raises ``KeyError`` instead of
        silently creating an empty entry. The path lists themselves are not
        copied.
        """
        return {process_name: dict(by_pol) for process_name, by_pol in self._dataset.items()}

    # you probably do not want to call this yourself
    # I will probably put in stuff soon that assumes to only be called once the Dataset is complete
    # but no explicit checks...
    def add_file(self, path: str):
        """Parse ``path`` with ``process_path`` and file it under its process and polarisation."""
        process_name, e_pol, p_pol = process_path(path)
        # the nested defaultdicts create the polarisation dict and the path
        # list on first use
        self._dataset[process_name][e_pol + p_pol].append(path)

    def get_files(self, process_name: str, pol: str) -> list[str]:
        """Return the paths registered for ``process_name`` and ``pol``.

        Raises:
            KeyError: if the process or the polarisation is unknown.
        """
        # use the getter to not have a defaultdict anymore and just let the dict throw
        return self.get_dataset()[process_name][pol]

    def get_keys(self):
        """Return all ``(process_name, pol)`` pairs in insertion order."""
        return [
            (process_name, pol)
            for process_name, by_pol in self._dataset.items()
            for pol in by_pol
        ]

    def get_merged_keys(self):
        """Return all keys as ``"<process_name>_<pol>"`` strings in insertion order."""
        return [
            f"{process_name}_{pol}"
            for process_name, by_pol in self._dataset.items()
            for pol in by_pol
        ]

class Analysis:
    """Holds a Dataset, all the data frames and possibly also all the histograms.

    One ``ROOT.RDataFrame`` on the "events" tree is created per
    (process, polarisation) entry of the dataset and stored under the key
    ``"<process>_<pol>"``.
    """

    def __init__(self, dataset: "Dataset"):
        """Create one data frame for every (process, polarisation) pair of ``dataset``."""
        self._dataset = dataset
        # Both containers are per instance; as class attributes they would be
        # shared between all Analysis objects and leak frames/statistics
        # from one analysis into the next.
        # _df: dict[str, ROOT.RDataFrame]
        self._df = {}
        self._stats: "dict[str, Stats]" = {}
        for process, pol in dataset.get_keys():
            files = dataset.get_files(process, pol)
            self._df[f"{process}_{pol}"] = ROOT.RDataFrame("events", files)

    def Define(self, *args):
        """Forward ``Define(*args)`` to every data frame and keep the returned frames."""
        for key, frame in self._df.items():
            # only values of existing keys are replaced, so iterating is safe
            self._df[key] = frame.Define(*args)

    def init_parameters(self, params: list[tuple[str, str]]):
        """Inits the podio generic parameters supplied as a list of (name, c++ typename).

        Defines a ``Parameters`` column and, for each requested parameter, a
        column ``params_<name>`` (dots in the name replaced by underscores).
        """
        ROOT.gInterpreter.Declare("#include <podio/GenericParameters.h>")
        self.Define("Parameters", "podio::GenericParameters par; par.loadFrom(GPDoubleKeys, GPDoubleValues); par.loadFrom(GPFloatKeys, GPFloatValues); par.loadFrom(GPIntKeys, GPIntValues); par.loadFrom(GPStringKeys, GPStringValues); return par;")
        for p_name, p_type in params:
            self.Define(f"params_{p_name.replace('.', '_')}", f"Parameters.get<{p_type}>(\"{p_name}\").value()")

    # TODO: needs initParameters to be called first and with ("crossSection", "float") in the parameter list
    def book_statistics(self):
        """Book the summed ``lumi`` column and the event count for every data frame.

        The booked results are stored as ``Stats`` entries; their values are
        only retrieved (via ``GetValue()``) in ``print_stats``.
        """
        self.Define("lumi", "1.0 / params_crossSection")
        # loop over all df, maybe also wrap this in the future
        for key, frame in self._df.items():
            self._stats[key] = Stats(int_lumi=frame.Sum("lumi"), n_events=frame.Count())

    def print_stats(self):
        """Print event count and integrated luminosity of every booked sample."""
        for k, stats in self._stats.items():
            print(f"process: {k} events: {stats.n_events.GetValue()} int lumi: {stats.int_lumi.GetValue()}")

    def get_keys(self):
        """Return the merged ``"<process>_<pol>"`` keys of the underlying dataset."""
        return self._dataset.get_merged_keys()

    # TODO: Add a method to store selected histograms to a root file
    # TODO: Add a method to snapshot parts of the data frames?





In [5]:
# Build the dataset from the text file that lists the input file paths.
dataset = Dataset("data/locations/miniDSTs/processed-no-higgs.txt")

# Show every (process name, polarisation) pair in the dataset.
print(dataset.get_keys())

# Dump the full process -> polarisation -> file list mapping as indented JSON.
print(json.dumps(dataset.get_dataset(), indent=2))

# print(dataset.get_files("4f_sw_sl", "eLpR"))

[('2f_z_eehiq', 'eLpL'), ('2f_z_eehiq', 'eLpR'), ('2f_z_eehiq', 'eRpR'), ('2f_z_eehiq', 'eRpL'), ('2f_z_h', 'eLpR'), ('2f_z_h', 'eRpL'), ('2f_z_l', 'eLpR'), ('2f_z_l', 'eRpL'), ('ea_3f_z_l', 'eLpB'), ('ea_3f_z_l', 'eRpB'), ('ea_3f_z_nu', 'eLpB'), ('ea_3f_z_nu', 'eRpB'), ('ea_3f_w_l', 'eLpB'), ('ea_3f_z_h', 'eLpB'), ('ea_3f_z_h', 'eRpB'), ('ea_3f_w_h', 'eLpB'), ('ae_3f_z_l', 'eBpL'), ('ae_3f_z_l', 'eBpR'), ('ae_3f_z_nu', 'eBpL'), ('ae_3f_z_nu', 'eBpR'), ('ae_3f_w_l', 'eBpR'), ('ae_3f_z_h', 'eBpL'), ('ae_3f_z_h', 'eBpR'), ('ae_3f_w_h', 'eBpR'), ('4f_zz_h', 'eLpR'), ('4f_zz_h', 'eRpL'), ('4f_zz_sl', 'eLpR'), ('4f_zz_sl', 'eRpL'), ('4f_zz_l', 'eLpR'), ('4f_zz_l', 'eRpL'), ('4f_zzorww_h', 'eLpR'), ('4f_zzorww_h', 'eRpL'), ('4f_zzorww_l', 'eLpR'), ('4f_zzorww_l', 'eRpL'), ('4f_zznu_sl', 'eLpR'), ('4f_zznu_sl', 'eRpL'), ('4f_zznu_l', 'eLpR'), ('4f_zznu_l', 'eRpL'), ('4f_sznu_l', 'eLpR'), ('4f_sznu_l', 'eRpL'), ('4f_sze_sl', 'eLpL'), ('4f_sze_sl', 'eLpR'), ('4f_sze_sl', 'eRpR'), ('4f_sze_sl', 

In [6]:
# Create the data frames for all (process, polarisation) entries of the dataset.
analysis = Analysis(dataset)

# book_statistics() uses the params_crossSection column, so the crossSection
# parameter has to be initialised before it is called.
analysis.init_parameters([("crossSection", "float")])
analysis.book_statistics()

In [7]:
# Print event count and integrated luminosity for every booked sample.
analysis.print_stats()

process: 2f_z_eehiq_eLpL events: 302400 int lumi: 1.5179331024255822
process: 2f_z_eehiq_eLpR events: 378000 int lumi: 1.9574107805175487
process: 2f_z_eehiq_eRpR events: 259200 int lumi: 1.3010855163647848
process: 2f_z_eehiq_eRpL events: 302400 int lumi: 1.5872568053176253
process: 2f_z_h_eLpR events: 243200 int lumi: 1.9005117833244647
process: 2f_z_h_eRpL events: 116800 int lumi: 1.6586964459246696
process: 2f_z_l_eLpR events: 392000 int lumi: 18.478361643699913
process: 2f_z_l_eRpL events: 311000 int lumi: 19.006244779405954
process: ea_3f_z_l_eLpB events: 132000 int lumi: 1.3510390670665835
process: ea_3f_z_l_eRpB events: 184000 int lumi: 1.8856540414100902
process: ea_3f_z_nu_eLpB events: 40000 int lumi: 1051.8161195562932
process: ea_3f_z_nu_eRpB events: 30000 int lumi: 1426.843801705448
process: ea_3f_w_l_eLpB events: 40000 int lumi: 1265.0720875952777
process: ea_3f_z_h_eLpB events: 98400 int lumi: 11.286431093601514
process: ea_3f_z_h_eRpB events: 119000 int lumi: 13.7515447