In [1]:
import sys
import urllib.request
from pathlib import Path
from tqdm import tqdm
import numpy as np
import uproot
import awkward as ak
import h5py

In [2]:
# prepare the dir for raw data
cwd = Path.cwd()
data_raw_dir = cwd.joinpath("test_data/raw")
data_raw_dir.mkdir(parents=True, exist_ok=True)

# get the list of root files
trainList_url = "http://opendata.cern.ch/record/12102/files/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC_test_root_file_index.txt"
testList_url = "http://opendata.cern.ch/record/12102/files/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC_train_root_file_index.txt"
trainList = urllib.request.urlopen(trainList_url).read().decode('utf-8').split("\n")
testList = urllib.request.urlopen(testList_url).read().decode('utf-8').split("\n")
# rm empty entry
trainList = [url for url in trainList if url]
testList = [url for url in testList if url]


In [3]:
url = testList[0]

In [4]:
# # prepare target file
# h5_name = url.rsplit('/', 1)[1].split('.')[0]+'.h5'
# h5_path = data_raw_dir.joinpath(h5_name)  
# h5_f = h5py.File(str(h5_path), 'w')

# # open and read root files online
# root_f = uproot.open(url)
# tree = root_f['deepntuplizer/tree']
# keys = tree.keys()

# # prepare number cuts on pfcand, track, and sv
# ncuts = {}
# ncuts['pfcand']=100
# ncuts['track']=60
# ncuts['sv']=5

# # fill branches into dataset
# for k in keys:
#     try:
#         # get maximum number
#         arr = tree[k].array(library='ak')
#         if arr.ndim == 1:
#             h5_f.create_dataset(k, data=arr)
#         else:
#             maxobj = ak.max(ak.num(arr, axis=1))
#             special_keys = ncuts.keys()
#             for sk in special_keys:
#                 if k in sk:
#                     maxobj = ncuts[sk]
#             # zero padding
#             arr = ak.pad_none(arr, target=maxobj, clip=True)
#             arr = ak.fill_none(arr, 0).to_numpy().astype(np.float16)
#             h5_f.create_dataset(k, data=arr)
#     except Exception as e:
#         print(arr.ndim)
#         print(arr.dtype)
#         print(f"{k} has an error {e}")
        
# # close the file
# h5_f.close()

New design after adopting the subgroups

In [5]:
ftrs_0 = ['fj_jetNTracks',
                          'fj_nSV',
                          'fj_tau0_trackEtaRel_0',
                          'fj_tau0_trackEtaRel_1',
                          'fj_tau0_trackEtaRel_2',
                          'fj_tau1_trackEtaRel_0',
                          'fj_tau1_trackEtaRel_1',
                          'fj_tau1_trackEtaRel_2',
                          'fj_tau_flightDistance2dSig_0',
                          'fj_tau_flightDistance2dSig_1',
                          'fj_tau_vertexDeltaR_0',
                          'fj_tau_vertexEnergyRatio_0',
                          'fj_tau_vertexEnergyRatio_1',
                          'fj_tau_vertexMass_0',
                          'fj_tau_vertexMass_1',
                          'fj_trackSip2dSigAboveBottom_0',
                          'fj_trackSip2dSigAboveBottom_1',
                          'fj_trackSip2dSigAboveCharm_0',
                          'fj_trackSipdSig_0',
                          'fj_trackSipdSig_0_0',
                          'fj_trackSipdSig_0_1',
                          'fj_trackSipdSig_1',
                          'fj_trackSipdSig_1_0',
                          'fj_trackSipdSig_1_1',
                          'fj_trackSipdSig_2',
                          'fj_trackSipdSig_3',
                          'fj_z_ratio'
                          ]
ftrs_1 = ['pfcand_ptrel',
                          'pfcand_erel',
                          'pfcand_phirel',
                          'pfcand_etarel',
                          'pfcand_deltaR',
                          'pfcand_puppiw',
                          'pfcand_drminsv',
                          'pfcand_drsubjet1',
                          'pfcand_drsubjet2',
                          'pfcand_hcalFrac']
ftrs_2 = ['track_ptrel',     
                          'track_erel',     
                          'track_phirel',     
                          'track_etarel',     
                          'track_deltaR',
                          'track_drminsv',     
                          'track_drsubjet1',     
                          'track_drsubjet2',
                          'track_dz',     
                          'track_dzsig',     
                          'track_dxy',     
                          'track_dxysig',     
                          'track_normchi2',     
                          'track_quality',     
                          'track_dptdpt',     
                          'track_detadeta',     
                          'track_dphidphi',     
                          'track_dxydxy',     
                          'track_dzdz',     
                          'track_dxydz',     
                          'track_dphidxy',     
                          'track_dlambdadz',     
                          'trackBTag_EtaRel',     
                          'trackBTag_PtRatio',     
                          'trackBTag_PParRatio',     
                          'trackBTag_Sip2dVal',     
                          'trackBTag_Sip2dSig',     
                          'trackBTag_Sip3dVal',     
                          'trackBTag_Sip3dSig',     
                          'trackBTag_JetDistVal']
ftrs_3 = ['sv_ptrel',
                          'sv_erel',
                          'sv_phirel',
                          'sv_etarel',
                          'sv_deltaR',
                          'sv_pt',
                          'sv_mass',
                          'sv_ntracks',
                          'sv_normchi2',
                          'sv_dxy',
                          'sv_dxysig',
                          'sv_d3d',
                          'sv_d3dsig',
                          'sv_costhetasvpv']

In [6]:
spectators = ['fj_pt',
                          'fj_eta',
                          'fj_sdmass',
                          'fj_n_sdsubjets',
                          'fj_doubleb',
                          'fj_tau21',
                          'fj_tau32',
                          'npv',
                          'npfcands',
                          'ntracks',
                          'nsv'
                      ]

In [7]:
targets = ['sample_isQCD','fj_isQCD','fj_isH']

In [8]:
test_dir = data_raw_dir.joinpath('test')
test_dir.mkdir(exist_ok=True, parents=True)
h5_path = test_dir.joinpath('test.h5')
h5_f = h5py.File(h5_path, 'w')
training_data = h5_f.create_group("training_subgroup")
target_data = h5_f.create_group("target_subgroup")
spec_data = h5_f.create_group("spectator_subgroup")

In [9]:
url = testList[0]
# open and read root files online
root_f = uproot.open(url)
tree = root_f['deepntuplizer/tree']
tree.show()

name                 | typename                 | interpretation                
---------------------+--------------------------+-------------------------------
Delta_gen_pt         | float                    | AsDtype('>f4')
event_no             | uint32_t                 | AsDtype('>u4')
gen_pt               | float                    | AsDtype('>f4')
isB                  | int32_t                  | AsDtype('>i4')
isBB                 | int32_t                  | AsDtype('>i4')
isC                  | int32_t                  | AsDtype('>i4')
isG                  | int32_t                  | AsDtype('>i4')
isLeptonicB          | int32_t                  | AsDtype('>i4')
isLeptonicB_C        | int32_t                  | AsDtype('>i4')
isS                  | int32_t                  | AsDtype('>i4')
isUD                 | int32_t                  | AsDtype('>i4')
isUndefined          | int32_t                  | AsDtype('>i4')
jet_corr_pt          | float                    | AsDtype(

Creating dataset for training feature group 0

In [10]:
ftrs_0_arr = tree.arrays(ftrs_0)
ftrs_0_arr = ak.values_astype(ftrs_0_arr, float)
training_data.create_dataset("training_0", data=ftrs_0_arr)

<HDF5 dataset "training_0": shape (200000,), type "|V216">

Creating dataset for training feature group 1

In [11]:
ftrs_1_ak = tree.arrays(ftrs_1, library='ak')
ftrs_1_ak = ak.pad_none(ftrs_1_ak, axis=1, target=100, clip=True)
ftrs_1_ak = ak.fill_none(ftrs_1_ak, axis=1, value=0.0)

ftrs_1_arr = np.zeros(shape=(200000, len(ftrs_1), 100), dtype=float)
for i, field in enumerate(ftrs_1_ak.fields):
    ftrs_1_arr[:,i] = ftrs_1_ak[field].to_numpy()

In [12]:
training_data.create_dataset("training_1", data=ftrs_1_arr)

<HDF5 dataset "training_1": shape (200000, 10, 100), type "<f8">

Creating dataset for training feature group 2

In [13]:
ftrs_2_ak = tree.arrays(ftrs_2, library='ak')
ftrs_2_ak = ak.pad_none(ftrs_2_ak, axis=1, target=60, clip=True)
ftrs_2_ak = ak.fill_none(ftrs_2_ak, axis=1, value=0.0)

ftrs_2_arr = np.zeros(shape=(200000, len(ftrs_2), 60), dtype=float)
for i, field in enumerate(ftrs_2_ak.fields):
    ftrs_2_arr[:,i] = ftrs_2_ak[field].to_numpy()

OSError: XRootD error: [ERROR] Operation expired
in file root://eospublic.cern.ch//eos/opendata/cms/datascience/HiggsToBBNtupleProducerTool/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC/train/ntuple_merged_10.root

In [None]:
training_data.create_dataset("training_2", data=ftrs_2_arr)

Creating dataset for training feature group 3

In [None]:
ftrs_3_ak = tree.arrays(ftrs_3, library='ak')
ftrs_3_ak = ak.pad_none(ftrs_3_ak, axis=1, target=5, clip=True)
ftrs_3_ak = ak.fill_none(ftrs_3_ak, axis=1, value=0.0)

ftrs_3_arr = np.zeros(shape=(200000, len(ftrs_3), 5), dtype=float)
for i, field in enumerate(ftrs_3_ak.fields):
    ftrs_3_arr[:,i] = ftrs_3_ak[field].to_numpy()

In [None]:
training_data.create_dataset("training_3", data=ftrs_3_arr)

Creating dataset for spectator

In [None]:
specs = ['fj_pt',
                          'fj_eta',
                          'fj_sdmass',
                          'fj_n_sdsubjets',
                          'fj_doubleb',
                          'fj_tau21',
                          'fj_tau32',
                          'npv',
                          'npfcands',
                          'ntracks',
                          'nsv'
                      ]

In [None]:
specs_arr = tree.arrays(specs)
specs_arr = ak.values_astype(specs_arr, float)
spec_data.create_dataset("spectators", data=specs_arr)

Creating dataset for target