In [1]:
import os, glob, re
import shutil
import random
import json
import pyarrow.parquet as pq
import numpy as np
import h5py
import matplotlib.pyplot as plt
import argparse
import time
import cupy as cp
from multiprocessing import Pool
import argparse
from tqdm import tqdm

In [2]:
def alphanum_key(s):
    """Split a string into alternating text and integer chunks for natural sorting.

    Example: "z23a" -> ["z", 23, "a"]

    The split pattern has a capture group, so ``re.split`` returns the text
    pieces at even positions and the captured ASCII digit runs at odd
    positions. Chunks are converted by position rather than with
    ``str.isdigit``: ``isdigit`` is also true for non-ASCII digit characters
    (for example "²") that can only appear in a text piece, and ``int`` either
    rejects them or would put an ``int`` into a text slot.

    Args:
        s: String to split.

    Returns:
        list: ``str`` chunks at even indices (possibly empty) and ``int``
        chunks at odd indices.
    """
    chunks = re.split('([0-9]+)', s)
    return [int(chunk) if position % 2 else chunk for position, chunk in enumerate(chunks)]

In [3]:
def create_new_hdf5_file(filename, max_rows_per_file):
    """Create an HDF5 file with one pre-sized dataset per stored quantity.

    The file is opened in ``'w'`` mode, so an existing file with the same name
    is overwritten. Five float32, lzf-compressed datasets are created:
    ``all_jet`` with shape ``(max_rows_per_file, 13, 125, 125)`` and ``am``,
    ``ieta``, ``iphi``, ``m0`` with shape ``(max_rows_per_file, 1)``.

    Args:
        filename: Path of the HDF5 file to create.
        max_rows_per_file: Number of rows allocated in every dataset.

    Returns:
        The open ``h5py.File``; the caller is responsible for closing it.

    Raises:
        Whatever ``h5py`` raises while opening the file or creating a dataset.
        If dataset creation fails, the file is closed before re-raising.
    """
    image_shape = (max_rows_per_file, 13, 125, 125)
    scalar_shape = (max_rows_per_file, 1)

    hdf5_file = h5py.File(filename, 'w')
    try:
        for name in ('all_jet', 'am', 'ieta', 'iphi', 'm0'):
            hdf5_file.create_dataset(
                name,
                image_shape if 'jet' in name else scalar_shape,
                dtype='float32',
                compression='lzf',
            )
    except Exception:
        # Do not leave a half-initialised file handle open behind a failure.
        hdf5_file.close()
        raise
    return hdf5_file

In [7]:
def append_data_to_hdf5(hdf5_file, start_index, end_index, df):
    """Write one batch of rows from ``df`` into ``hdf5_file``.

    The ``X_jet`` column goes to rows ``start_index:end_index`` of the
    ``all_jet`` dataset; the ``am``, ``ieta``, ``iphi`` and ``m0`` columns go
    to the same rows of the dataset with the same name, each as a single
    column.

    Args:
        hdf5_file: Mapping from dataset name to a sliceable, assignable dataset.
        start_index: First destination row (inclusive).
        end_index: Last destination row (exclusive).
        df: DataFrame holding the columns listed above.

    Returns:
        The ``hdf5_file`` object that was passed in.
    """
    print("Writing to file", hdf5_file)

    # Look up every column position first, so a missing column raises before
    # anything is converted or written.
    positions = {
        label: df.columns.get_loc(label)
        for label in ('X_jet', 'am', 'ieta', 'iphi', 'm0')
    }

    # The image column goes through repeated array -> list round trips before
    # the final array is built (presumably to unwrap nested object arrays -
    # TODO confirm against the parquet schema).
    jet_images = df.iloc[:, positions['X_jet']].tolist()
    for _ in range(2):
        jet_images = np.array(jet_images).tolist()
    jet_images = np.array(jet_images)

    scalars = {
        label: np.array(df.iloc[:, positions[label]])
        for label in ('am', 'ieta', 'iphi', 'm0')
    }

    hdf5_file["all_jet"][start_index:end_index, :, :, :] = jet_images
    for label, values in scalars.items():
        # One value per row, stored as a column vector.
        hdf5_file[label][start_index:end_index, :] = values.reshape(df.shape[0], 1).tolist()

    return hdf5_file

In [8]:
#

In [9]:
def process_files(args):
    """Convert one Parquet file into one HDF5 file, batch by batch.

    The arguments are packed into a single sequence so the function can be
    handed directly to ``Pool.map``.

    Args:
        args: Sequence ``(parquet_path, hdf5_path)``. An optional third item
            sets the number of rows read per batch (default 6096).

    Returns:
        int: Number of rows written to the HDF5 file.
    """
    file_path = args[0]
    h5py_file = args[1]
    batch_size = args[2] if len(args) > 2 else 6096

    print("------Processing file------")
    parquet = pq.ParquetFile(file_path)
    print("file ------>   ", file_path)
    # The datasets need one slot per row. ``num_row_groups`` only equals the
    # row count when every row group holds exactly one row, so take the real
    # row count from the file metadata instead.
    total_samples = parquet.metadata.num_rows
    total = total_samples
    print("Number of row --------> ", total)

    hdf5_file = create_new_hdf5_file(h5py_file, total_samples)
    start_index = 0
    try:
        batch_iter = parquet.iter_batches(batch_size, use_threads=True)
        for bat, batch in enumerate(batch_iter):
            df = batch.to_pandas(use_threads=True)
            end_index = start_index + df.shape[0]
            print("total----->",total , " Batch no.", bat, "Data frame shape", df.shape, " Start idx:", start_index, " end idx:", end_index)

            if end_index <= total_samples:
                append_data_to_hdf5(hdf5_file, start_index, end_index, df)
                start_index = end_index
            else:
                # Should not happen now that the datasets are sized from the
                # row count, but never drop data without saying so.
                print("Skipping batch", bat, "- rows", start_index, "to", end_index, "exceed the allocated", total_samples)
    finally:
        # Always close the output so written data is flushed, even when a
        # batch fails or the worker process is about to be torn down.
        hdf5_file.close()

    return start_index


In [5]:
# # parquet_dir = '/pscratch/sd/r/rchudasa/E2E_samples/ParquetFiles_correctTrackerLayerHits_SecVtxInfoAdded/'
# parquet_dir = '/pscratch/sd/b/bbbam/'
# h5_dir = '/pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m3p6To14p8_dataset_2_unbaised_v2_train_hd5/'
# if not os.path.exists(h5_dir):
#     # Create the directory if it doesn't exist
#     os.makedirs(h5_dir)
# h5_name = 'tau_threads.h5'
# batch_size = 4096
# # batch_size = 10
# # signal_files = [os.path.join(parquet_dir + 'signal/', f) for f in os.listdir(parquet_dir + 'signal/')]
# # bkg_files = [os.path.join(parquet_dir + 'background/', f) for f in os.listdir(parquet_dir + 'background/')]

# signal_files = [os.path.join(parquet_dir + 'IMG_aToTauTau_Hadronic_tauDR0p4_m3p6To14p8_dataset_2_unbaised_v2_all/', f) for f in os.listdir(parquet_dir + 'IMG_aToTauTau_Hadronic_tauDR0p4_m3p6To14p8_dataset_2_unbaised_v2_all/')]

# combined_files = signal_files
# # random.shuffle(combined_files)
# # print("type(combined_files)  :-----", type(combined_files))

# counter = 0

# inputfile_list = []
# outputfile_list = []

# for f in combined_files:
#     opFile       = f.split("/")[-1].split(".")[0]
#     # proceessName = opFile.split("_")[0]
#     # processID    = opFile.split("_")[-1]
#     # print("opFile-----", opFile)
#     h5_file = h5_dir+opFile+".h5"
#     # print("h5_file----", h5_file)
#     #process_files(f, h5_file, batch_size)
#     inputfile_list.append(f)
#     outputfile_list.append(h5_file)
#     tic = time.time()
#     #tic = time.time()
#     #process_files(f,h5_file)
#     #toc = time.time()
#     #print("It took {} minutes to run {} file".format((toc-tic)/60,f))
#     #counter =+ 1
#     #if counter >=10:
#     #break

# #print(inputfile_list[0:10])
# #print(outputfile_list[0:10])
# #args = list(zip(inputfile_list[0:10],outputfile_list[0:10]))
# args = list(zip(inputfile_list,outputfile_list)) 
# print("----------------------------------------")
# print("arg --------", args)

# with Pool(12) as p:
#     # print("**************",args)
#     p.map(process_files,args)
# toc = time.time()


# print("It took ", toc-tic)

In [6]:

# h5_dir = '/pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To3p6_dataset_2_unbaised_v2_train_hd5/'
# if not os.path.exists(h5_dir):
#     # Create the directory if it doesn't exist
#     os.makedirs(h5_dir)
# h5_name = 'tau_threads.h5'
# batch_size = 4096


# parquet_files = glob.glob("/pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To3p6_m14p8To17p2_dataset_2_unbaised_v2_train/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To3p6_dataset_2_unbaised_unphysical_0009*")

# inputfile_list = []
# outputfile_list = []

# for f in parquet_files:
#     opFile       = f.split("/")[-1].split(".")[0]
#     # proceessName = opFile.split("_")[0]
#     # processID    = opFile.split("_")[-1]
#     # print("opFile-----", opFile)
#     h5_file = h5_dir+opFile+".h5"
#     # print("h5_file----", h5_file)
#     #process_files(f, h5_file, batch_size)
#     inputfile_list.append(f)
#     outputfile_list.append(h5_file)
#     tic = time.time()
#     #tic = time.time()
#     #process_files(f,h5_file)
#     #toc = time.time()
#     #print("It took {} minutes to run {} file".format((toc-tic)/60,f))
#     #counter =+ 1
#     #if counter >=10:
#     #break

# #print(inputfile_list[0:10])
# #print(outputfile_list[0:10])
# #args = list(zip(inputfile_list[0:10],outputfile_list[0:10]))
# args = list(zip(inputfile_list,outputfile_list)) 
# print("----------------------------------------")
# print("arg --------", args)

# with Pool(len(parquet_files)) as p:
#     # print("**************",args)
#     p.map(process_files,args)
# toc = time.time()


# print("It took ", toc-tic)

In [None]:
# Convert every file in the signal directory to HDF5, one worker process per
# file up to the number of available CPUs.
h5_dir = '/pscratch/sd/b/bbbam/signal_hd5/'
# exist_ok makes re-running this cell safe when the directory already exists.
os.makedirs(h5_dir, exist_ok=True)

batch_size = 4096  # NOTE(review): not passed on below - process_files picks its own batch size


parquet_files = glob.glob("/pscratch/sd/b/bbbam/signal/*")

inputfile_list = []
outputfile_list = []

for f in parquet_files:
    # Output name = input file name without its final extension. Stripping
    # only the last extension keeps dotted names distinct, so two inputs
    # cannot end up writing to the same output file.
    opFile = os.path.splitext(os.path.basename(f))[0]
    h5_file = os.path.join(h5_dir, opFile + ".h5")
    inputfile_list.append(f)
    outputfile_list.append(h5_file)

args = list(zip(inputfile_list, outputfile_list))
print("----------------------------------------")
print("arg --------", args)

if args:
    # Pool() rejects zero processes, and one process per file is unbounded;
    # cap the pool at the CPU count.
    n_workers = min(len(args), os.cpu_count() or 1)
    tic = time.time()
    with Pool(n_workers) as p:
        p.map(process_files, args)
    toc = time.time()

    print("It took ", toc-tic)
else:
    print("No input files found - nothing to convert")

----------------------------------------
arg -------- [('/pscratch/sd/b/bbbam/signal/IMG_H_AATo4Tau_Hadronic_tauDR0p4_M8_signal_v2.parquet', '/pscratch/sd/b/bbbam/signal_hd5/IMG_H_AATo4Tau_Hadronic_tauDR0p4_M8_signal_v2.h5'), ('/pscratch/sd/b/bbbam/signal/IMG_H_AATo4Tau_Hadronic_tauDR0p4_M4_signal_v2_1.parquet', '/pscratch/sd/b/bbbam/signal_hd5/IMG_H_AATo4Tau_Hadronic_tauDR0p4_M4_signal_v2_1.h5'), ('/pscratch/sd/b/bbbam/signal/IMG_H_AATo4Tau_Hadronic_tauDR0p4_M14_signal_v2.parquet', '/pscratch/sd/b/bbbam/signal_hd5/IMG_H_AATo4Tau_Hadronic_tauDR0p4_M14_signal_v2.h5'), ('/pscratch/sd/b/bbbam/signal/IMG_H_AATo4Tau_Hadronic_tauDR0p4_M3p7_signal_v2_1.parquet', '/pscratch/sd/b/bbbam/signal_hd5/IMG_H_AATo4Tau_Hadronic_tauDR0p4_M3p7_signal_v2_1.h5'), ('/pscratch/sd/b/bbbam/signal/IMG_H_AATo4Tau_Hadronic_tauDR0p4_M5_signal_v2_1.parquet', '/pscratch/sd/b/bbbam/signal_hd5/IMG_H_AATo4Tau_Hadronic_tauDR0p4_M5_signal_v2_1.h5'), ('/pscratch/sd/b/bbbam/signal/IMG_H_AATo4Tau_Hadronic_tauDR0p4_M10_signa

/pscratch/sd/b/bbbam/signal/IMG_H_AATo4Tau_Hadronic_tauDR0p4_M5_signal_v2_1.parquet


'IMG_H_AATo4Tau_Hadronic_tauDR0p4_M5_signal_v2_1'