Skip to content

Commit

Permalink
Limit parquet file size
Browse files (browse the repository at this point in the history)
  • Loading branch information
mchiusi committed Sep 27, 2023
1 parent 2d169dd commit 44e1f6d
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 5 deletions.
3 changes: 1 addition & 2 deletions bye_splits/data_handle/data_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,7 @@ def EventDataParticle(particles, PU, tag, reprocess, debug=False, logger=None):
indata.adir = cfg["io"][PU][particles]["dir"]
indata.tree = cfg["io"][PU][particles]["tree"]

tag = particles + "_" + tag
tag = particles + "_" + PU + "_" + tag
tag += "_debug" * debug


return EventData(indata, tag, defevents, reprocess, logger)
12 changes: 11 additions & 1 deletion bye_splits/data_handle/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,17 @@ def select(self):
with up.open(self.indata.path, array_cache='550 MB', num_workers=8) as f:
tree = f[self.indata.tree_path]
allvars = set([y for x in self.var.values() for y in x.values()])
data = tree.arrays(filter_name='/' + '|'.join(allvars) + '/', entry_stop=200, library='ak')

threshold_size = 0.1
threshold_size_bytes = threshold_size * 8e+9
data = ak.Array([])
for array in tree.iterate(filter_name='/' + '|'.join(allvars) + '/', step_size='20 MB', library='ak'):
if (data.layout.nbytes + array.layout.nbytes) <= threshold_size_bytes:
data = ak.concatenate([data, array], axis=0)
else:
break

#data = tree.arrays(filter_name='/' + '|'.join(allvars) + '/', entry_stop=5000, library='ak')
# data[self.var.v] = data.waferv
# data[self.newvar.vs] = -1 * data.waferv
# data[self.newvar.c] = "#8a2be2"
Expand Down
2 changes: 1 addition & 1 deletion bye_splits/plot/display_plotly/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def get_data(self, pars, particles_str, event = ''):
self.list_events = [key.split('ev_')[1] for key in file.keys()]

event = event or self.random_event(self.filename)
if event not in self.list_events:
if str(event) not in self.list_events:
dict_ev, gen_info = run_radii_chain(pars, particles, PU, self.coefs, event)
self.process_event(dict_ev, gen_info)

Expand Down
2 changes: 1 addition & 1 deletion bye_splits/scripts/run_radii_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import pandas as pd

def run_radii_chain(pars, particles, PU, coefs, event=None):
df_gen, df_cl, df_tc = get_data_reco_chain_start(nevents=30, reprocess=True, particles=particles, PU=PU, event=event)
df_gen, df_cl, df_tc = get_data_reco_chain_start(nevents=30, reprocess=False, particles=particles, PU=PU, event=event)

fill_d = params.read_task_params("fill")
tasks.fill.fill(pars, df_gen, df_cl, df_tc, **fill_d)
Expand Down

0 comments on commit 44e1f6d

Please sign in to comment.