In [1]:
import os, glob, re
import shutil
import random
import json
import pyarrow.parquet as pq
import numpy as np
import h5py
import matplotlib.pyplot as plt
import argparse
import time
import cupy as cp
from multiprocessing import Pool
import argparse

In [2]:
def estimate_population_parameters(all_sample_sizes, all_sample_means, all_sample_stds):
    population_means = []
    population_stds = []
    for j in range(len(all_sample_means)):
        sample_means = all_sample_means[j]
        sample_stds = all_sample_stds[j]
        sample_sizes = all_sample_sizes[j]
        sample_means = sample_means[sample_sizes != 0]
        sample_stds = sample_stds[sample_sizes != 0]
        sample_sizes = sample_sizes[sample_sizes != 0]
        weighted_sum_of_variances = sum((n - 1) * s**2 for n, s in zip(sample_sizes, sample_stds))
        total_degrees_of_freedom = sum(n - 1 for n in sample_sizes)
        combined_variance = weighted_sum_of_variances / total_degrees_of_freedom
        population_std = np.sqrt(combined_variance)
        weighted_sum_of_means = sum(n * mean for n, mean in zip(sample_sizes, sample_means))
        total_observations = sum(sample_sizes)
        population_mean = weighted_sum_of_means / total_observations
        population_stds.append(population_std)
        population_means.append(population_mean)

    return population_means, population_stds

In [11]:


def mean_std(start_index, end_index, df):
    #df = df[start_index:end_index]
    size_ = []
    mean_ = []
    std_ = []
    
    xj = df.columns.get_loc('X_jet')
    

    im_all = np.array(np.array(np.array(df.iloc[:, xj].tolist()).tolist()).tolist())
   
    
    # hdf5_file["all_jet"][start_index:end_index, :, :, :] = im
    # print("        im = im_all[3,:,:,:])----------",      im_all[3,:,:,:])
    

    for i in range(start_index, end_index):
        im = im_all[i,:,:,:]
        im[im < 1.e-5] = 0
        size_channel = []
        mean_channel = []
        std_channel = []
        bad_channel = False
        for j in range(13):
            if not bad_channel:
                image = im[j,:,:]
                image = image[image != 0]
                if len(image) < 2:
                    bad_channel = True
                    continue
                size_channel.append(len(image))
                mean_channel.append(image.mean())
                std_channel.append(image.std(ddof=1))
        if not bad_channel:
            size_.append(size_channel)
            mean_.append(mean_channel)
            std_.append(std_channel)
        if i > 9: break
    print("size_. ",size_)  
    orig_mean, orig_std = estimate_population_parameters(size_, mean_, std_)
    
    return orig_mean, orig_std

In [12]:
def process_files(args):
    file_path = args[0]
    h5py_file = args[1]
    # batch_size = 4096
    batch_size = 10
    
    print("------Processing file------")
    parquet = pq.ParquetFile(file_path)
    print("file ------>   ", file_path)
    print("Number of row --------> ", parquet.num_row_groups)
    total_samples = parquet.num_row_groups
    batch_iter = parquet.iter_batches(batch_size,use_threads=True)

    start_index = 0
    bat = 0
    for batch in batch_iter:
        df = batch.to_pandas(use_threads=True)
        end_index = start_index + df.shape[0]
        print("File----->",file_path , " Batch no.", bat, "Data frame shape", df.shape, " Start idx:", start_index, " end idx:", end_index)

        if end_index<=total_samples:
            #print("Image shape going in append", im.shape, " ", start_index, " ", end_index)
            mean_std(start_index, end_index, df)
            start_index += df.shape[0]
            break

        # bat +=1


In [13]:

parquet_dir = '/pscratch/sd/b/bbbam/'
h5_dir = '/pscratch/sd/b/bbbam/mean_std_from_parquet/'
if not os.path.exists(h5_dir):
    # Create the directory if it doesn't exist
    os.makedirs(h5_dir)


signal_files = [os.path.join(parquet_dir + 'IMG_aToTauTau_Hadronic_tauDR0p4_m3p6To14p8_dataset_2_unbaised_v2_all/', f) for f in os.listdir(parquet_dir + 'IMG_aToTauTau_Hadronic_tauDR0p4_m3p6To14p8_dataset_2_unbaised_v2_all/')]

combined_files = signal_files




inputfile_list = []
outputfile_list = []

for f in combined_files:
    opFile = 'mean_ste_number_dataset_' + f.split('_')[-2]

    # print("opFile-----", opFile)
    h5_file = h5_dir+opFile+".json"
    print("h5_file----", h5_file)
    #process_files(f, h5_file, batch_size)
    inputfile_list.append(f)
    outputfile_list.append(h5_file)
    tic = time.time()
    break

args = list(zip(inputfile_list,outputfile_list)) 
print("----------------------------------------")
print("arg --------", args)

with Pool(12) as p:
    p.map(process_files,args)
toc = time.time()


print("It took ", toc-tic)

h5_file---- /pscratch/sd/b/bbbam/mean_std_from_parquet/mean_ste_number_dataset_0005.json
----------------------------------------
arg -------- [('/pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m3p6To14p8_dataset_2_unbaised_v2_all/IMG_aToTauTau_Hadronic_tauDR0p4_m3p6To14p8_dataset_2_unbaised_v2_0005_train.parquet', '/pscratch/sd/b/bbbam/mean_std_from_parquet/mean_ste_number_dataset_0005.json')]
------Processing file------
file ------>    /pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m3p6To14p8_dataset_2_unbaised_v2_all/IMG_aToTauTau_Hadronic_tauDR0p4_m3p6To14p8_dataset_2_unbaised_v2_0005_train.parquet
Number of row -------->  428918
File-----> /pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m3p6To14p8_dataset_2_unbaised_v2_all/IMG_aToTauTau_Hadronic_tauDR0p4_m3p6To14p8_dataset_2_unbaised_v2_0005_train.parquet  Batch no. 0 Data frame shape (10, 7)  Start idx: 0  end idx: 10
size_.  [[56, 24, 29, 211, 7475, 224, 195, 189, 178, 608, 446, 424, 354], [30, 23, 14, 45, 3800, 

TypeError: 'int' object is not iterable