In [1]:
import os
import glob
import h5py

def is_empty_event(file_path):
    with h5py.File(file_path, 'r') as h5_file:
        event_vars_group = h5_file.get('EventVars')
        if event_vars_group and 'normweight' in event_vars_group and len(event_vars_group['normweight']) == 0:
            return True
    return False
    
def copy_items(name, obj, destination_group):
    if isinstance(obj, h5py.Group):
        if name not in destination_group:
            new_group = destination_group.create_group(name)
            obj.visititems(lambda subname, subobj: copy_items(subname, subobj, new_group))
    elif isinstance(obj, h5py.Dataset):

        if name in destination_group:
            existing_dataset = destination_group[name]
            existing_shape = existing_dataset.shape
            existing_dataset.resize((existing_shape[0] + obj.shape[0],) + existing_shape[1:])
            existing_dataset[-obj.shape[0]:] = obj[:]
        else:
            chunks = (1,) + obj.shape[1:]
            destination_group.create_dataset(name, shape=obj.shape, dtype=obj.dtype, chunks=chunks, maxshape=(None,) + obj.shape[1:], data=obj[:])

def concatenate_h5_in_folder(output_file, folder='unsupervised-search/QCD_HT_h5s'):
    files_to_concatenate = glob.glob(os.path.join(folder, '*.h5'))


    with h5py.File(output_file, 'w') as output_h5:
        for file_path in files_to_concatenate:
            print(file_path)
            #if not is_empty_event(file_path):
            with h5py.File(file_path, 'r') as input_h5:
                input_normweight = input_h5.get('EventVars/normweight')
                if input_normweight is not None and len(input_normweight) > 0:
                    input_h5.visititems(lambda name, obj: copy_items(name, obj, output_h5))

output_file_path = 'unsupervised-search/combined_QCD.h5'
concatenate_h5_in_folder(output_file_path)


unsupervised-search/QCD_HT_h5s/combined_out_QCD_HT700to1000.h5
unsupervised-search/QCD_HT_h5s/combined_out_QCD_HT1000to1500.h5
unsupervised-search/QCD_HT_h5s/combined_out_QCD_HT1500to2000.h5
unsupervised-search/QCD_HT_h5s/combined_out_QCD_HT2000toInf.h5


In [8]:
import os
import glob
import h5py

def is_empty_event(file_path):
    with h5py.File(file_path, 'r') as h5_file:
        event_vars_group = h5_file.get('EventVars')
        if event_vars_group and 'normweight' in event_vars_group and len(event_vars_group['normweight']) == 0:
            return True
    return False
    
def copy_items(name, obj, destination_group):
    if isinstance(obj, h5py.Group):
        if name not in destination_group:
            new_group = destination_group.create_group(name)
            obj.visititems(lambda subname, subobj: copy_items(subname, subobj, new_group))
    elif isinstance(obj, h5py.Dataset):
        if name in destination_group:
            existing_dataset = destination_group[name]
            existing_shape = existing_dataset.shape
            existing_dataset.resize((existing_shape[0] + obj.shape[0],) + existing_shape[1:])
            existing_dataset[-obj.shape[0]:] = obj[:]
        else:
            chunks = (1,) + obj.shape[1:]
            destination_group.create_dataset(name, shape=obj.shape, dtype=obj.dtype, chunks=chunks, maxshape=(None,) + obj.shape[1:], data=obj[:])

def has_pt_above_threshold(pt_dataset, threshold, min_jets):
    above_threshold_indices = pt_dataset[:] > threshold
    return above_threshold_indices.sum() >= min_jets

def concatenate_h5_in_folder(output_file, folder='unsupervised-search/QCD_HT_h5s', threshold=2, min_jets=4):
    files_to_concatenate = glob.glob(os.path.join(folder, '*.h5'))

    with h5py.File(output_file, 'w') as output_h5:
        for file_path in files_to_concatenate:
            print(file_path)
            with h5py.File(file_path, 'r') as input_h5:
                input_pt_dataset = input_h5.get('source/pt')
                if input_pt_dataset is not None and has_pt_above_threshold(input_pt_dataset, threshold, min_jets):
                    input_h5.visititems(lambda name, obj: copy_items(name, obj, output_h5))

output_file_path = 'unsupervised-search/combined_QCD_400k.h5'
concatenate_h5_in_folder(output_file_path)


unsupervised-search/QCD_HT_h5s/combined_out_QCD_HT700to1000.h5


TypeError: Accessing a group is done with bytes or str, not <class 'numpy.ndarray'>

In [9]:
import os
import glob
import h5py

def is_empty_event(file_path):
    with h5py.File(file_path, 'r') as h5_file:
        event_vars_group = h5_file.get('EventVars')
        if event_vars_group and 'normweight' in event_vars_group and len(event_vars_group['normweight']) == 0:
            return True
    return False
    
def copy_items(name, obj, destination_group):
    if isinstance(obj, h5py.Group):
        if name not in destination_group:
            new_group = destination_group.create_group(name)
            obj.visititems(lambda subname, subobj: copy_items(subname, subobj, new_group, max_entries))
    elif isinstance(obj, h5py.Dataset):
        if name in destination_group:
            existing_dataset = destination_group[name]
            existing_shape = existing_dataset.shape
            new_shape = (existing_shape[0] + min(obj.shape[0], max_entries),) + existing_shape[1:]
            existing_dataset.resize(new_shape)
            existing_dataset[-min(obj.shape[0], max_entries):] = obj[:min(obj.shape[0], max_entries)]
        else:
            chunks = (1,) + obj.shape[1:]
            new_shape = (min(obj.shape[0], max_entries),) + obj.shape[1:]
            destination_group.create_dataset(name, shape=new_shape, dtype=obj.dtype, chunks=chunks, maxshape=(None,) + obj.shape[1:], data=obj[:min(obj.shape[0], max_entries)])

def has_pt_above_threshold(pt_dataset, threshold, min_jets):
    above_threshold_indices = pt_dataset[:] > threshold
    return above_threshold_indices.sum() >= min_jets

def concatenate_h5_in_folder(output_file, folder='slimmed_ntuples/signal1500_h5/', threshold=2, min_jets=4, num_files=100):
    files_to_concatenate = glob.glob(os.path.join(folder, '*.h5'))

    with h5py.File(output_file, 'w') as output_h5:
        for file_path in files_to_concatenate:
            print(file_path)
            with h5py.File(file_path, 'r') as input_h5:
                # input_pt_dataset = input_h5.get('source/pt')
                # if input_pt_dataset is not None and has_pt_above_threshold(input_pt_dataset, threshold, min_jets):
                input_h5.visititems(lambda name, obj: copy_items(name, obj, output_h5, max_entries))

output_file_path = 'slimmed_ntuples/signal1500.h5'
concatenate_h5_in_folder(output_file_path)


slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_0.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_100.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_101.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_102.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_103.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_104.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_105.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_106.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_107.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_108.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_109.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_10.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_110.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_111.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_112.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_113.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_114.h5
slimmed_ntuples/signal1500_h5/slimmedNtup_gl1500_11

In [8]:
import os
import glob
import h5py

def is_empty_event(file_path):
    with h5py.File(file_path, 'r') as h5_file:
        event_vars_group = h5_file.get('EventVars')
        if event_vars_group and 'normweight' in event_vars_group and len(event_vars_group['normweight']) == 0:
            return True
    return False
    
def copy_items(name, obj, destination_group):
    if isinstance(obj, h5py.Group):
        if name not in destination_group:
            new_group = destination_group.create_group(name)
            obj.visititems(lambda subname, subobj: copy_items(subname, subobj, new_group))
    elif isinstance(obj, h5py.Dataset):
        if name in destination_group:
            existing_dataset = destination_group[name]
            existing_shape = existing_dataset.shape
            new_shape = (existing_shape[0] + obj.shape[0],) + existing_shape[1:]
            existing_dataset.resize(new_shape)
            existing_dataset[-obj.shape[0]:] = obj[:]
        else:
            chunks = (1,) + obj.shape[1:]
            destination_group.create_dataset(name, data=obj[:], chunks=chunks, maxshape=(None,) + obj.shape[1:])

def has_pt_above_threshold(pt_dataset, threshold, min_jets):
    above_threshold_indices = pt_dataset[:] > threshold
    return above_threshold_indices.sum() >= min_jets

def concatenate_h5_in_folder(output_file, folder='slimmed_ntuples/signal700_h5/', threshold=2, min_jets=4, num_files=8):
    files_to_concatenate = glob.glob(os.path.join(folder, '*.h5'))
    files_to_concatenate = files_to_concatenate[:num_files]  # Limit the number of files to concatenate
    
    with h5py.File(output_file, 'w') as output_h5:
        for file_path in files_to_concatenate:
            print(file_path)
            with h5py.File(file_path, 'r') as input_h5:
                # input_pt_dataset = input_h5.get('source/pt')
                # if input_pt_dataset is not None and has_pt_above_threshold(input_pt_dataset, threshold, min_jets):
                input_h5.visititems(lambda name, obj: copy_items(name, obj, output_h5))

output_file_path = 'slimmed_ntuples/signal700_10files.h5'
concatenate_h5_in_folder(output_file_path)


slimmed_ntuples/signal700_h5/slimmedNtup_gl700_136.h5
slimmed_ntuples/signal700_h5/slimmedNtup_gl700_139.h5
slimmed_ntuples/signal700_h5/slimmedNtup_gl700_152.h5
slimmed_ntuples/signal700_h5/slimmedNtup_gl700_25.h5
slimmed_ntuples/signal700_h5/slimmedNtup_gl700_396.h5


TypeError: Can't broadcast (721, 12) -> (721,)