In [1]:
import os, glob, re
import shutil
import random
import json
import pyarrow.parquet as pq
import numpy as np
import h5py
import math
import time
from tqdm import tqdm
from multiprocessing import Pool

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import mplhep as hep
plt.style.use([hep.style.ROOT, hep.style.firamath])
minimum_nonzero_pixels = 3

def estimate_population_parameters(all_sample_sizes, all_sample_means, all_sample_stds):
    population_means = []
    population_stds = []
    for j in range(len(all_sample_means)):
        sample_means = all_sample_means[j]
        sample_stds = all_sample_stds[j]
        sample_sizes = all_sample_sizes[j]
        sample_means = sample_means[sample_sizes != 0]
        sample_stds = sample_stds[sample_sizes != 0]
        sample_sizes = sample_sizes[sample_sizes != 0]
        weighted_sum_of_variances = sum((n - 1) * s**2 for n, s in zip(sample_sizes, sample_stds))
        total_degrees_of_freedom = sum(n - 1 for n in sample_sizes)
        combined_variance = weighted_sum_of_variances / total_degrees_of_freedom
        population_std = np.sqrt(combined_variance)
        weighted_sum_of_means = sum(n * mean for n, mean in zip(sample_sizes, sample_means))
        total_observations = sum(sample_sizes)
        population_mean = weighted_sum_of_means / total_observations
        population_stds.append(population_std)
        population_means.append(population_mean)

    return population_means, population_stds

def alphanum_key(s):
    """ Turn a string into a list of string and number chunks.
        "z23a" -> ["z", 23, "a"]
    """
    return [int(c) if c.isdigit() else c for c in re.split('([0-9]+)',s)]

In [48]:
def combined_mean_std(size, mean, std):
    mean_ = np.dot(size, mean)/np.sum(size)
    std_ = np.sqrt((np.dot((np.array(size)-1), np.square(std)) + np.dot(size,np.square(mean-mean_)))/(np.sum(size)-1))
    return mean_, std_

# mean_ = []
# std_ = []
# size_ = []
# file_path = np.sort(glob.glob("mean_std_record_original_dataset/*"))
# for file in file_path:
#     with open(file, 'r') as file:
#         data = json.load(file)
#     mean_.append(data['original_mean'])
#     std_.append(data['original_std'])
#     size_.append(data['number_of_jets'])
# mean = np.array(mean_)
# std = np.array(std_)
# orig_size = np.array(size_)


# orig_mean, orig_std = combined_mean_std(orig_size, mean, std)
# print("original mean  :" , orig_mean,"\n")
# print("original std  :" , orig_std,"\n")
# print("totoal samples  :" , orign_size,"\n")


### Calculate combined mean and std for data after outlier Run this before converting to normalised h5-----------------------------------------------------------------------------------------------------------



mean_ = []
std_ = []
size_ = []
file_path = np.sort(glob.glob("mean_std_record_after_outlier/*"))
for file in file_path:
    with open(file, 'r') as file:
        data = json.load(file)
    mean_.append(data['after_outlier_mean'])
    std_.append(data['after_outlier_std'])
    size_.append(data['number_of_selected_jets'])
mean = np.array(mean_)
std = np.array(std_)
size = np.array(size_)


after_outlier_mean, after_outlier_std = combined_mean_std(size, mean, std)
nan_replace = - after_outlier_mean/after_outlier_std

dim = (125, 125)

# Generate the desired array
nan_replace_array = np.array([np.full(dim, v) for v in nan_replace])

print("after outlier mean  : " , after_outlier_mean,"\n")
print("after outlier std  : " , after_outlier_std,"\n")
print("total selected jets : " , size, "\n")
print("Nan repalced by:   ",nan_replace, "\n")
print(nan_replace_array.shape)

after outlier mean  :  [ 1.95973739 -0.91428634  0.41695268  0.4351373   0.02550794  1.03056946
  1.02679871  1.03097382  1.03844135  1.62629992  1.6815035   1.68042818
  1.68519924] 

after outlier std  :  [2.64603079e+01 2.85947850e+02 2.78975093e+01 2.07958377e+00
 8.02803342e-02 1.82661149e-01 1.69144090e-01 1.82877912e-01
 2.07325558e-01 9.95635728e-01 1.09017309e+00 1.07802985e+00
 1.12664562e+00] 

total selected jets :  [426826 416817 425975 416835 416735 425962 435157 416918 416843 428532] 

Nan repalced by:    [-7.40632876e-02  3.19738840e-03 -1.49458747e-02 -2.09242499e-01
 -3.17735883e-01 -5.64197402e+00 -6.07055627e+00 -5.63749773e+00
 -5.00874738e+00 -1.63342865e+00 -1.54241883e+00 -1.55879560e+00
 -1.49576691e+00] 

(13, 125, 125)


In [41]:
 
    
file = '/pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To17p2_dataset_2_unbaised_v2_normalised_combined.hd5'
data = h5py.File(f'{file}', 'r')
num_images = data["all_jet"].shape[0]
num_images = 6400
batch_size = 3200
print(f"processing file ---> {file}\n")
outdir = '/pscratch/sd/b/bbbam/'
outfile = f'IMG_aToTauTau_Hadronic_tauDR0p4_m3p6To14p8_dataset_2_unbaised_v2_normalized_NAN_removed_train.h5'

with h5py.File(f'{outdir}/{outfile}', 'w') as proper_data:
    dataset_names = ['all_jet', 'am', 'ieta', 'iphi', 'm0']
    datasets = {
        name: proper_data.create_dataset(
            name,
            (size,13, 125, 125) if 'jet' in name else (size, 1),
            dtype='float32',  # Specify an appropriate data type
            compression='lzf',
            chunks=(batch_size, 13, 125, 125) if 'jet' in name else (1, 1),
        ) for name in dataset_names
    }

    start_idx_, end_idx_ = 0, 0
    for start_idx in tqdm(range(0, num_images, batch_size)):
        end_idx = min(start_idx + batch_size, num_images)
        images_batch = data["all_jet"][start_idx:end_idx, :, :, :]
        am_batch = data["am"][start_idx:end_idx, :]
        ieta_batch = data["ieta"][start_idx:end_idx, :]
        iphi_batch = data["iphi"][start_idx:end_idx, :]
        m0_batch = data["m0"][start_idx:end_idx, :]

        # images_batch = (images_batch - after_outlier_mean.reshape(1, 13, 1, 1)) / after_outlier_std.reshape(1, 13, 1, 1)
        images_batch[np.isnan(images_batch)] = nan_replace_array


        start_idx_ = min(start_idx, end_idx_)
        end_idx_   = min(start_idx_ + images_batch.shape[0], num_images)



        proper_data['all_jet'][start_idx_:end_idx_,:,:,:] = images_batch
        proper_data['am'][start_idx_:end_idx_] = am_batch
        proper_data['ieta'][start_idx_:end_idx_] = ieta_batch
        proper_data['iphi'][start_idx_:end_idx_] = iphi_batch
        proper_data['m0'][start_idx_:end_idx_] = m0_batch


print(">>>>>>>>>>>>>>>DONE>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")

processing file ---> /pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To17p2_dataset_2_unbaised_v2_normalised_combined.hd5



ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [49]:
import h5py
import numpy as np
from tqdm import tqdm
file = '/pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To17p2_dataset_2_unbaised_v2_normalised_combined.hd5'
data = h5py.File(f'{file}', 'r')
num_images = data["all_jet"].shape[0]
# num_images = 5000  # Adjusted number of images for processing
batch_size = 4000

print(f"Processing file ---> {file}\n")

outdir = '/pscratch/sd/b/bbbam/'
outfile = 'IMG_aToTauTau_Hadronic_tauDR0p4_m3p6To14p8_dataset_2_unbaised_v2_normalized_NAN_removed_train.h5'

with h5py.File(f'{outdir}/{outfile}', 'w') as proper_data:
    dataset_names = ['all_jet', 'am', 'ieta', 'iphi', 'm0']
    datasets = {
        name: proper_data.create_dataset(
            name,
            (num_images, 13, 125, 125) if 'jet' in name else (num_images, 1),
            dtype='float32',
            compression='lzf',
            chunks=(batch_size, 13, 125, 125) if 'jet' in name else (batch_size, 1),
        ) for name in dataset_names
    }

    for start_idx in tqdm(range(0, num_images, batch_size)):
        end_idx = min(start_idx + batch_size, num_images)
        images_batch = data["all_jet"][start_idx:end_idx, :, :, :]
        am_batch = data["am"][start_idx:end_idx, :]
        ieta_batch = data["ieta"][start_idx:end_idx, :]
        iphi_batch = data["iphi"][start_idx:end_idx, :]
        m0_batch = data["m0"][start_idx:end_idx, :]

        # Replace NaN values in images_batch with the specified transformation
        nan_mask = np.isnan(images_batch)
        images_batch[nan_mask] =  np.tile(nan_replace_array, (end_idx-start_idx, 1, 1, 1))[nan_mask]
        # Write the processed batch to the new HDF5 file
        proper_data['all_jet'][start_idx:end_idx, :, :, :] = images_batch
        proper_data['am'][start_idx:end_idx, :] = am_batch
        proper_data['ieta'][start_idx:end_idx, :] = ieta_batch
        proper_data['iphi'][start_idx:end_idx, :] = iphi_batch
        proper_data['m0'][start_idx:end_idx, :] = m0_batch
data.close()
print(">>>>>>>>>>>>>>> DONE >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")


Processing file ---> /pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To17p2_dataset_2_unbaised_v2_normalised_combined.hd5



100%|██████████| 1446/1446 [7:03:53<00:00, 17.59s/it]  

>>>>>>>>>>>>>>> DONE >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>





In [47]:
file = '/pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m3p6To14p8_dataset_2_unbaised_v2_normalized_NAN_removed_train.h5'
data = h5py.File(f'{file}', 'r')
for start_idx in tqdm(range(0, num_images, 4000)):
    end_idx = min(start_idx + batch_size, num_images)
    images_batch = data["all_jet"][start_idx:end_idx, :, :, :]
    am_batch = data["am"][start_idx:end_idx, :]
    nan = np.isnan(images_batch)
    print("NaN.shape", nan.shape)
    print("nan:  ",np.any(nan))
    

 50%|█████     | 1/2 [00:03<00:03,  3.96s/it]

NaN.shape (3200, 13, 125, 125)
nan:   False


100%|██████████| 2/2 [00:09<00:00,  4.57s/it]

NaN.shape (1000, 13, 125, 125)
nan:   False



