In [1]:
import sys
import urllib.request
from pathlib import Path
from tqdm import tqdm
import numpy as np
import uproot
import awkward as ak
import h5py

In [2]:
# Prepare the directory that will receive the converted raw data.
cwd = Path.cwd()
data_raw_dir = cwd.joinpath("test_data/raw")
data_raw_dir.mkdir(parents=True, exist_ok=True)

# Index files listing the ROOT files of each split.
# NOTE: the train/test URLs were previously swapped (trainList_url pointed at
# the *_test_* index and vice versa); each name now matches the file it loads.
trainList_url = "http://opendata.cern.ch/record/12102/files/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC_train_root_file_index.txt"
testList_url = "http://opendata.cern.ch/record/12102/files/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC_test_root_file_index.txt"


def _read_url_list(index_url):
    """Download a text index and return its non-empty lines, stripped."""
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(index_url) as response:
        text = response.read().decode('utf-8')
    # splitlines() + strip() tolerate both LF and CRLF line endings and drop
    # the empty trailing entry left by a final newline.
    return [line.strip() for line in text.splitlines() if line.strip()]


trainList = _read_url_list(trainList_url)
testList = _read_url_list(testList_url)


In [3]:
# Use the first entry of testList as the ROOT file to convert below.
url = testList[0]

In [4]:
# Convert every branch of one remote ROOT tree into a dataset of an HDF5 file
# named after the ROOT file and stored under data_raw_dir.
h5_name = url.rsplit('/', 1)[1].split('.')[0]+'.h5'
h5_path = data_raw_dir.joinpath(h5_name)

# Upper bounds on the number of objects kept per event, keyed by the keyword
# that identifies the object family in a branch name.
ncuts = {
    'pfcand': 100,
    'track': 60,
    'sv': 5,
}

# Open the remote ROOT file first so a failed download does not leave an empty
# HDF5 file behind; both context managers guarantee the files are closed even
# if an unexpected error escapes the loop.
with uproot.open(url) as root_f, h5py.File(str(h5_path), 'w') as h5_f:
    tree = root_f['deepntuplizer/tree']
    keys = tree.keys()

    # fill branches into datasets
    for k in keys:
        try:
            arr = tree[k].array(library='ak')
            if arr.ndim == 1:
                # Flat per-event branch: store as-is.
                h5_f.create_dataset(k, data=arr)
                continue

            # Jagged branch: pad every event to a common length. Default to
            # the longest event in the file ...
            maxobj = ak.max(ak.num(arr, axis=1))
            # ... unless the branch name contains one of the cut keywords
            # (the test was previously inverted as `k in sk`, so the caps
            # were effectively never applied).
            for sk, cut in ncuts.items():
                if sk in k:
                    maxobj = cut

            # Zero padding (and clipping to maxobj), then store as float16.
            arr = ak.pad_none(arr, target=maxobj, clip=True)
            arr = ak.fill_none(arr, 0).to_numpy().astype(np.float16)
            h5_f.create_dataset(k, data=arr)
        except Exception as e:
            # Report and move on to the next branch. `arr` is deliberately
            # not inspected here: it is unbound if the very first read failed
            # and would otherwise describe the previous branch.
            print(f"{k} has an error {e}")

Plugin No such file or directory loading sec.protocol libXrdSeckrb5-5.so
