## Cluster Studies (Jan)

Here is an updated version of the ClusterStudies.ipynb notebook, using uproot4 and newer versions of our plotting utilities.

This notebook is designed to use ntuples produced by the image-based version of MLTree (i.e. not the latest version of MLTree, which produces graph-friendly data in a different format).

In [9]:
# --- Run configuration ---
# Suffix appended to the input path to build the HDF5 file name.
h5_name_suffix = 'jdata'

# Forwarded as `n_max` to the data-preparation helper.
n_max = 600_000

# Selection cuts, given as index-aligned lists (one entry per cut):
# the cut type, the cut values (presumably (low, high) bounds for a
# 'window' cut -- TODO confirm), and the distribution the cut acts on.
cut_types = ['window']
cut_values = [(-0.7, 0.7)]
cut_distributions = ['clusterEta']

In [3]:
#import libraries and some constants
import os, sys, pathlib
import numpy as np
import pandas as pd
import uproot as ur

# Parent of the current working directory; it is added to sys.path below so
# that the local `util` package can be imported.
path_prefix = os.getcwd() + '/../'

# `Plots/` directory inside the current working directory
# (presumably where figures are written -- confirm against the plotting cells).
plotpath = '{}/Plots/'.format(os.getcwd())

# exist_ok=True tolerates a pre-existing directory while still raising on
# genuine failures (e.g. permission errors), which a bare `except: pass` hid.
os.makedirs(plotpath, exist_ok=True)

if(path_prefix not in sys.path): sys.path.append(path_prefix)
from util import ml_util as mu
from util import qol_util as qu

# some of our classification/regression-specific utils have useful functions
from util.classification import plot_util as cpu

# metadata: per-layer geometry; every list below is index-aligned with `layers`
layers = ["EMB1", "EMB2", "EMB3", "TileBar0", "TileBar1", "TileBar2"]

# cell size along phi and eta for each layer
cell_size_phi = [0.098, 0.0245, 0.0245, 0.1, 0.1, 0.1]
cell_size_eta = [0.0031, 0.025, 0.05, 0.1, 0.1, 0.2]

# extent along phi and eta for each layer
# (presumably the number of cells per image axis -- TODO confirm)
len_phi = [4, 16, 16, 4, 4, 4]
len_eta = [128, 16, 8, 4, 4, 2]

# layer name -> (len_eta, len_phi)
cell_shapes = dict(zip(layers, zip(len_eta, len_phi)))
# layer name -> (cell_size_eta, cell_size_phi)
cell_widths = dict(zip(layers, zip(cell_size_eta, cell_size_phi)))

Welcome to JupyROOT 6.24/02


2021-08-04 10:01:50.124149: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [None]:
# Input ROOT files, keyed by sample label; each value is a glob-style pattern.
# NOTE(review): `inputpath` is not defined in any cell visible in this
# notebook -- confirm it is assigned before this cell runs on a fresh kernel.
rootfiles = dict(jet=inputpath + '**/*.root')

# Branch names handed to the data-preparation helper via `branches=`.
branches = [
    'clusterE',
    'clusterECalib',
    'clusterPt',
    'clusterEta',
    'clusterPhi',
    'cluster_nCells',
    'cluster_sumCellE',
    'cluster_ENG_CALIB_TOT',
    'cluster_EM_PROBABILITY',
]

In [10]:
# fancy display names for each sample type (pions and jets)
pi_text = dict(
    p0='pi0',      # label printed for the 'p0' sample
    pp='pi +/-',   # label printed for the 'pp' sample
    jet='jet',     # label printed for the 'jet' sample
)

In [11]:
# Prepare data
# Base file name for the cached HDF5 output/input of the helper below.
# NOTE(review): `inputpath` is not defined in any cell visible in this
# notebook -- confirm it is assigned before this cell runs on a fresh kernel.
h5_name = inputpath + h5_name_suffix

# `pdata` is used below as a mapping of sample key -> frame (it supports
# `.items()` and `len(frame)`). The recorded output suggests that with
# load=True the helper reads `<filename>_frame.h5` and `<filename>_images.h5`;
# the exact semantics of the remaining options live in util.ml_util.
pdata, pcells = mu.setupPionData(
    rootfiles,
    branches=branches,
    layers=layers,
    balance_data=True,
    n_max=n_max,
    verbose=True,
    load=True,
    save=True,
    filename=h5_name,
    match_distribution='cluster_ENG_CALIB_TOT',
    match_binning=(20000, 0., 2000.),
    cut_distributions=cut_distributions,
    cut_values=cut_values,
    cut_types=cut_types
)

# Total event count over all samples. It was previously never assigned in
# this notebook, so the summary below only worked with leftover kernel state.
total = sum(len(frame) for frame in pdata.values())

for key, frame in pdata.items():
    n = len(frame)
    # Guard against division by zero when every frame is empty.
    percent = 100. * n / total if total else 0.
    print("Number of {a:<7} events: {b:>10}\t({c:.1f}%)".format(a=pi_text[key], b=n, c=percent))
print("Total: {}".format(total))

Loading pandas DataFrame and calo images from /local/home/jano/ml4pions/LCStudies/clusters/../data/pion/tdata_60GeV_central_frame.h5 and /local/home/jano/ml4pions/LCStudies/clusters/../data/pion/tdata_60GeV_central_images.h5.
Number of pi0     events:     600000	(50.0%)
Number of pi +/-  events:     600000	(50.0%)
Total: 1200000
