# Energy Evaluation

This notebook just evaluates our classification + energy regressions on our jet data. The same processes are done in `JetClustering.ipynb`, but here we leave out all the stuff with jet clustering (which is only interesting once we have good performance on the topo-clusters).

### Setup

In [1]:
import os, uuid
# Repository root, expressed relative to the directory the notebook is run from.
path_prefix = ''.join((os.getcwd(), '/../'))

# Set to True to reuse the local data files prepared by an earlier run.
skip_scores = False

# Debug mode: process a single input file only, which is much faster.
debug = True

# Classifier score above this value -> charged; below -> neutral.
classification_threshold = 0.6

# Directory holding the classifier (only one classification is deployed at a time, for now).
classification_dir = ''.join((path_prefix, 'classifier/Models/pion'))

# Directories of the regression models to evaluate -- several may be tried side by side.
regression_dirs = [
    path_prefix + 'regression/Models/' + regression_name
    for regression_name in ('pion', 'pion_reweighted')
]

In [2]:
# Imports - generic stuff
import numpy as np
#import pandas as pd
import ROOT as rt
import uproot as ur # uproot for accessing ROOT files quickly (and in a Pythonic way)
import sys, os, glob, uuid # glob for searching for files, uuid for random strings to name ROOT objects and avoid collisions
import subprocess as sub
from numba import jit
from pathlib import Path
from IPython.utils import io # For suppressing some print statements from functions.

if(path_prefix not in sys.path): sys.path.append(path_prefix)
from util import ml_util as mu # for passing calo images to regression networks
from util import qol_util as qu # for progress bar
from util import jet_util as ju

Welcome to JupyROOT 6.22/02


In [3]:
# To display our plots, let's get a dark style that will look nice in presentations (and JupyterLab in dark mode).
# Both a dark and a light style object are created; only the one bound to
# `plot_style` is activated below. Point `plot_style` at `light_style` to switch.
dark_style = qu.PlotStyle('dark')
light_style = qu.PlotStyle('light')
plot_style = dark_style
plot_style.SetStyle() # sets style for plots - still need to adjust legends, paves

Now we import `tensorflow` (and some of its `keras` stuff), as well as some stuff from `sklearn` for neural network I/O scaling.

In [4]:
# TensorFlow / Keras setup.
# The log level is set before TensorFlow is imported so that only errors are printed.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

# Request `ngpu` GPU devices, then record how many replicas the strategy actually uses.
ngpu = 1
gpu_list = ['/gpu:%d' % device_index for device_index in range(ngpu)]
strategy = tf.distribute.MirroredStrategy(devices=gpu_list)
ngpu = strategy.num_replicas_in_sync
print('Number of devices: ' + str(ngpu))

# All neural network models to be evaluated, keyed by name.
network_models = dict()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1


Setting up a whole bunch of paths. We use `source = pion` by default, whereby we use networks trained on the single-pion data. We can alternatively use `source = jet` to use our "facsimile jet training data" -- a subset of jet data where we have tried to match topo-clusters to pions -- but this is not fully implemented yet. For example, this workflow explicitly re-derives the network scalers using the `pion` data, so to use the `jet` data we will have to modify that code.

In [5]:
# Input jet data, the fastjet install location, and the output directory for plots.
data_dir = path_prefix + 'data/jet'
fj_dir   = path_prefix + '/setup/fastjet/fastjet-install/lib/python3.8/site-packages'
plot_dir = path_prefix + 'jets/clusterPlots/'

# Create the plot directory if it does not exist yet. An already-existing
# directory is fine; any other failure (e.g. missing permissions) is raised
# here instead of being silently ignored and surfacing later when saving plots.
os.makedirs(plot_dir, exist_ok=True)

In [6]:
# ----- Calorimeter meta-data -----
# One entry per calorimeter layer in each of the lists below, in the order given by `layers`.
layers = ["EMB1", "EMB2", "EMB3", "TileBar0", "TileBar1", "TileBar2"]
nlayers = len(layers)
cell_size_phi = [0.098, 0.0245, 0.0245, 0.1, 0.1, 0.1]
cell_size_eta = [0.0031, 0.025, 0.05, 0.1, 0.1, 0.2]
len_phi = [4, 16, 16, 4, 4, 4]
len_eta = [128, 16, 8, 4, 4, 2]

# Every per-layer list must line up with `layers`; otherwise the zip below
# would silently drop layers.
assert all(len(per_layer) == nlayers for per_layer in (cell_size_phi, cell_size_eta, len_phi, len_eta))

# layer name -> {'cell_size': (eta, phi), 'dimensions': (eta, phi)}
meta_data = {
    layer:{
        'cell_size':(size_eta, size_phi),
        'dimensions':(n_eta, n_phi)
    }
    for layer, size_eta, size_phi, n_eta, n_phi in zip(layers, cell_size_eta, cell_size_phi, len_eta, len_phi)
}

In [7]:
# flat classifiers -- every .h5 file found in the classifier's "flat" sub-directory
print('Loading flat classification models... ')
flat_model_files = sorted(glob.glob(classification_dir + '/flat/' + '*.h5'))
flat_model_names = []
for model in flat_model_files:
    # keep only the part of the file name between "model_" and "_flat"
    model_name = model.split('model_')[-1].split('_flat')[0]
    print('\tLoading ' + model_name + '... ',end='')
    flat_model_names.append(model_name)
    network_models[model_name] = tf.keras.models.load_model(model)
    print('Done.')

# combo classifier
print('Loading simple combo classification model... ',end='')
combo_model_file = classification_dir + '/simple/' + 'model_simple.h5'
network_models['combo'] = tf.keras.models.load_model(combo_model_file)
print('Done.')

# energy regression networks
# regression directory -> key under which its charged / neutral model is stored in network_models
charged_keys = {}
neutral_keys = {}

n_regressions = len(regression_dirs)
print('Loading {} sets of energy regression models.'.format(n_regressions))

for i, regression_dir in enumerate(regression_dirs):
    charged_key = 'e_charged' + '_' + str(i).zfill(2)
    neutral_key = 'e_neutral' + '_' + str(i).zfill(2)

    charged_keys[regression_dir] = charged_key
    neutral_keys[regression_dir] = neutral_key

    # we will look for h5 files -- one with "charged" in the name, one with "neutral" in the name.
    # Sorted so that the choice is deterministic if several files match.
    modelfiles = sorted(glob.glob(regression_dir + '/*.h5'))
    charged_model_file = None
    neutral_model_file = None
    for model_file in modelfiles:
        # Match on the file name only, so that a directory whose path happens
        # to contain "charged" or "neutral" cannot cause a mis-assignment.
        model_basename = os.path.basename(model_file)
        if 'charged' in model_basename: charged_model_file = model_file
        elif 'neutral' in model_basename: neutral_model_file = model_file
    if charged_model_file is None:
        raise FileNotFoundError('No "charged" .h5 regression model found in ' + regression_dir)
    if neutral_model_file is None:
        raise FileNotFoundError('No "neutral" .h5 regression model found in ' + regression_dir)

    print('\tLoading charged-pion energy regression model #{}... '.format(i+1),end='')
    network_models[charged_key] = tf.keras.models.load_model(charged_model_file)
    print('Done.')

    print('\tLoading neutral-pion energy regression model #{}... '.format(i+1),end='')
    network_models[neutral_key] = tf.keras.models.load_model(neutral_model_file)
    print('Done.')

Loading flat classification models... 
	Loading EMB1... Done.
	Loading EMB2... Done.
	Loading EMB3... Done.
	Loading TileBar0... Done.
	Loading TileBar1... Done.
	Loading TileBar2... Done.
Loading simple combo classification model... Done.
Loading 1 sets of energy regression models.
	Loading charged-pion energy regression model #1... Done.
	Loading neutral-pion energy regression model #1... Done.


Now we make a "local" copy of the jet data. We will only copy over certain branches, and we will skip any files that don't contain both an `EventTree` and a `ClusterTree`.

In [8]:
# our "local" data dir, where we create modified data files
jet_data_dir = path_prefix + 'jets/cluster_data'
Path(jet_data_dir).mkdir(parents=True, exist_ok=True)

if(skip_scores):
    # Re-use the local copies made by an earlier run (sorted, so the debug choice is reproducible).
    data_filenames = sorted(glob.glob(jet_data_dir + '/*.root'))
    if(len(data_filenames) == 0):
        raise FileNotFoundError('No local data files found in ' + jet_data_dir + ' -- run once with skip_scores = False.')

    # debugging - take only one file, for speed
    if(debug): data_filenames = data_filenames[:1]

else:
    data_filenames = glob.glob(data_dir + '/' + '*.root')

    # debugging - lets us use a single file to speed stuff up a lot.
    if(debug): data_filenames = [data_dir + '/' + 'user.angerami.21685345.OutputStream._000062.root']

    # Get the original data.
    files = {name:rt.TFile(name,'READ') for name in data_filenames}

    # Some data files might be missing an EventTree or a ClusterTree.
    # For now, we will skip these because our methods count on both trees being present.
    delete_keys = []
    for key, val in files.items():
        file_keys = [x.GetName() for x in val.GetListOfKeys()]
        if('ClusterTree' not in file_keys or 'EventTree' not in file_keys):
            delete_keys.append(key)

    for key in delete_keys: 
        print('Ignoring file:',key,'(no EventTree/ClusterTree found).')
        files[key].Close() # release the file handle before dropping it
        del files[key]

    if(len(files) == 0):
        raise FileNotFoundError('No usable input files found in ' + data_dir + '.')

    # now we make a local copy of the files in the jet_data_dir, keeping only certain branches
    active_branches = {}
    active_branches['cluster'] = [
        'runNumber',
        'eventNumber',
        'truthE',
        'truthPt',
        'truthEta',
        'truthPhi',
        'clusterIndex',
        'nCluster',
        'clusterE',
        'clusterECalib',
        'clusterPt',
        'clusterEta',
        'clusterPhi',
        'cluster_nCells',
        'cluster_ENG_CALIB_TOT',
        'EMB1',
        'EMB2',
        'EMB3',
        'TileBar0',
        'TileBar1',
        'TileBar2'
    ]

    tree_names = {'cluster':'ClusterTree'}
    data_filenames = []

    n_files = len(files)
    n_copied = 0
    qu.printProgressBarColor(n_copied, n_files, prefix='Copying data files:', suffix='Complete', length=50)

    for path, tfile in files.items():
        filename_new = jet_data_dir + '/' + path.split('/')[-1]
        old_trees = {x:tfile.Get(tree_names[x]) for x in tree_names.keys()}

        # Switch every branch off, then re-enable just the ones we want copied.
        for key, tree in old_trees.items():
            tree.SetBranchStatus('*',0)
            for bname in active_branches[key]: tree.SetBranchStatus(bname,1)

        # The clones are made after the output file is opened, and end up in it on Write().
        tfile_new = rt.TFile(filename_new,'RECREATE')
        new_trees = {x:old_trees[x].CloneTree() for x in old_trees.keys()}
        tfile_new.Write()
        data_filenames.append(filename_new)

        # Drop our references to the trees, then close both files so that the
        # copy is complete on disk before it is re-opened (by uproot, and later
        # by ROOT in UPDATE mode) and no file handles are left dangling.
        del old_trees
        del new_trees
        tfile_new.Close()
        tfile.Close()

        n_copied += 1
        qu.printProgressBarColor(n_copied, n_files, prefix='Copying data files:', suffix='Complete', length=50)

Copying data files: |[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m[32m█[0m| 100.0% Complete


In [9]:
# Open every local data file with uproot and pick out the trees we need.
# Result: {file path: {tree label: uproot tree}}
tree_names = {'cluster':'ClusterTree'}
ur_trees = {
    data_path: {
        label: ur.open(data_path)[name_in_file]
        for label, name_in_file in tree_names.items()
    }
    for data_path in data_filenames
}

Besides our models and the data, we also need the *scalers* associated with the regression models. We will apply these to the data.

In [10]:
import joblib as jl

# Load the scalers saved next to each regression model.
# NOTE(review): joblib.load unpickles the file, so only load scaler files from a trusted source.
scaler_file = 'scalers.save' # name is always the same (for now)
scalers = {rd:jl.load(rd + '/' + scaler_file) for rd in regression_dirs}

# Re-label the per-particle entries: 'pp' -> 'charged', 'p0' -> 'neutral'.
keys = ['e','cal','eta']
for key, scaler_dict in scalers.items():
    for key2 in keys:
        per_particle = scaler_dict[key2]
        per_particle['charged'] = per_particle.pop('pp')
        per_particle['neutral'] = per_particle.pop('p0')

### Getting network outputs for all clusters

Now we will loop over our data files, and get network scores (classification and predicted energies) for all clusters. Note that the latter involves *scaling* of the data, which we will achieve using the scalers that we loaded above (they are stored alongside the regression models).

This isn't the most notebook-esque code, as we're preparing a bunch of inputs *within* the big for loop below (and not saving them or printing them) but it should avoid "out of memory" issues: As we are dealing with a large amount of data, preparing all the data in memory before operating on it will result in very high memory usage.

In [17]:
# Branch buffer for filling our score trees: one length-1 double array per branch.
# ROOT reads the value to store from these arrays each time Fill() is called.
branch_buffer = {'charged_likelihood_combo': np.zeros(1,dtype=np.dtype('f8'))}

for key in charged_keys.keys():
    branch_buffer[charged_keys[key]] = np.zeros(1,dtype=np.dtype('f8'))
    branch_buffer[neutral_keys[key]] = np.zeros(1,dtype=np.dtype('f8'))

# Name for the tree that will contain network scores.
tree_name = 'ScoreTree'

for dfile, trees in ur_trees.items():
    
    if(skip_scores): 
        # Explicitly check if ScoreTree is present, otherwise we recompute.
        # Useful if score computation was previously interrupted.
        file_keys = [str(x,'utf-8') for x in list(ur.open(dfile).keys())]
        if(any(tree_name in fkey for fkey in file_keys)): continue
        
    print ('File:',dfile)
    # Prepare the calo images.
    print('\tPrepping calo images...')
    calo_images = {}
    for layer in layers:
        calo_images[layer] = mu.setupCells(trees['cluster'],layer)
    combined_images = np.concatenate(tuple([calo_images[layer] for layer in layers]), axis=1)

    # Prepare some extra combined input for the energy regressions.
    print('\tPrepping extra inputs...')
    
    e = trees['cluster'].array('clusterE')
    e_calib = trees['cluster'].array('cluster_ENG_CALIB_TOT')
    eta = trees['cluster'].array('clusterEta')
    
    # cleaning for e_calib (empirically needed for e_calib to remove values that are too small)
    # NOTE(review): e_calib is not used by anything further down in this cell.
    epsilon = 1.0e-12 #1.0e-12 # TODO: Should I set this to energy_cut as defined in regression training? Would that make sense?
    e_calib = np.where(e_calib < epsilon, epsilon, e_calib)
    
    s_combined,scaler_combined = mu.standardCells(combined_images, layers) # Note: scaler_combined is unused
    
    # find network scores
    print('\tCalculating network outputs...')
    model_scores = {}
    
    print('\t\tClassification... ', end='')
    # 1) flat networks
    for layer in flat_model_names:
        model = network_models[layer]
        model_scores[layer] = model.predict(calo_images[layer])[:,1] # [:,1] based on Max's code, this is input to combo network. Likelihood of being charged (vs. neutral)
    
    # 2) combo network -- takes the per-layer flat scores as its input columns
    name = 'combo'
    model = network_models[name]
    input_scores = np.column_stack([model_scores[layer] for layer in layers])
    model_scores[name] = model.predict(input_scores)[:,1] # likelihood of being charged pion (versus neutral pion)
    print('Done.')
    
    print('\t\tRegression... ', end='')
    # 3) energy regression networks
    
    for i, reg in enumerate(charged_keys.keys()):
        print('\t\t\tLoading regression #{}'.format(i+1))
        # The inputs are scaled separately for the charged and neutral models,
        # since each has its own scalers.
        regression_input = {}
        for key in scalers[reg]['cal'].keys():
            s_logE = scalers[reg]['e'  ][key].transform(np.log(e).reshape(-1,1))
            s_eta  = scalers[reg]['eta'][key].transform(eta.reshape(-1,1))
            regression_input[key] = np.column_stack((s_logE, s_eta,s_combined))
        
        # The network output is un-scaled and exponentiated to get an energy
        # (the input energy is passed as a log, see above). The result is
        # flattened to 1D so that each entry is a plain scalar when it is
        # copied into the branch buffer below.
        # charged model
        name = charged_keys[reg]
        model = network_models[name]
        model_scores[name] = np.exp(scalers[reg]['cal']['charged'].inverse_transform(model.predict(regression_input['charged']))).ravel()
        
        # neutral model
        name = neutral_keys[reg]
        model = network_models[name]
        model_scores[name] = np.exp(scalers[reg]['cal']['neutral'].inverse_transform(model.predict(regression_input['neutral']))).ravel()
    
    # Now we should save these scores to a new tree.
    f = rt.TFile(dfile, 'UPDATE')
    t = rt.TTree(tree_name, tree_name)
    
    print('Saving network scores to tree ' + tree_name + '... ',end='')    
    # --- Setup the branches using our buffer. This is a rather general/flexible code block. ---
    # The leaf descriptor has the form "name[dim0][dim1].../T", with T the ROOT type code.
    branches = {}
    for bname, val in branch_buffer.items():
        descriptor = bname
        bshape = val.shape
        if(bshape != (1,)):
            for dim in bshape:
                descriptor += '[' + str(dim) + ']'
        descriptor += '/'
        if(val.dtype == np.dtype('i2')): descriptor += 'S'
        elif(val.dtype == np.dtype('i4')): descriptor += 'I'
        elif(val.dtype == np.dtype('i8')): descriptor += 'L'
        elif(val.dtype == np.dtype('f4')): descriptor += 'F'
        elif(val.dtype == np.dtype('f8')): descriptor += 'D'
        else:
            # report the branch being set up (not a stale loop variable)
            print('Warning, setup issue for branch: ', bname, '. Skipping.')
            continue
        branches[bname] = t.Branch(bname,val,descriptor)
    
    # Fill the model score tree, and save it to the local data file.
    nentries = model_scores['combo'].shape[0]
    for i in range(nentries):
        branch_buffer['charged_likelihood_combo'][0] = model_scores['combo'][i]
        for key in charged_keys.keys():
            branch_buffer[charged_keys[key]][0] = model_scores[charged_keys[key]][i]
            branch_buffer[neutral_keys[key]][0] = model_scores[neutral_keys[key]][i]
        t.Fill()
    
    t.Write(tree_name, rt.TObject.kOverwrite)
    f.Close()
    print('Done.')
    
# Re-open the files with uproot so that the new score tree is available next to the cluster tree.
tree_names['score'] = tree_name
ur_trees = {file:{tree_key:ur.open(file)[tree_name] for tree_key,tree_name in tree_names.items()} for file in data_filenames}

File: /local/home/jano/ml4pions/LCStudies/jets/../jets/cluster_data/user.angerami.21685345.OutputStream._000062.root
	Prepping calo images...
	Prepping extra inputs...
	Calculating network outputs...
		Classification... Done.
		Regression... 			Loading regression #1


  model_scores[name] = np.exp(scalers[reg]['cal']['neutral'].inverse_transform(model.predict(regression_input['neutral'])))


Saving network scores to tree ScoreTree... Done.


Before moving on to jet clustering, we can already check to see if our energy regressions seem sensible. Let's make distributions of:
- The classification score
- Each regressed energy, for **all** clusters (i.e. charged and neutral energy regressions for all clusters regardless of their classifications)
- Regressed energy / `cluster_ENG_CALIB_TOT` (the calibration-hit energy), where we choose the regressed energy for each cluster based on its classification score

In [None]:
def EnergyRatioHist(ur_trees, charged_key, neutral_key, classification_threshold = 0.6, nbins = 10000, xmin = 1.0e-3, xmax = 1.0e2, color=rt.kViolet-6):
    """Histogram predicted energy / cluster_ENG_CALIB_TOT over all clusters.

    For each cluster the charged-model energy is used when its
    'charged_likelihood_combo' score is above classification_threshold, and
    the neutral-model energy otherwise. Clusters whose cluster_ENG_CALIB_TOT
    is exactly zero are skipped.

    Parameters:
        ur_trees: {file: {'score': tree, 'cluster': tree}}, each tree providing .array(branch_name).
        charged_key, neutral_key: names of the score-tree branches holding the regressed energies.
        classification_threshold: score above which a cluster is treated as charged.
        nbins, xmin, xmax: binning of the histogram.
        color: ROOT colour used for both fill and line.

    Returns:
        A ROOT TH1F with a unique (random) name.
    """
    ratio_title = 'Predicted Energy / E_{CALIB}^{TOT};E_{pred} / E_{CALIB}^{TOT};Count'
    energy_ratio_hist = rt.TH1F(str(uuid.uuid4()), ratio_title, nbins, xmin, xmax)

    for dfile, trees in ur_trees.items():
        score_tree = trees['score']
        scores = score_tree.array('charged_likelihood_combo')
        charged_e = score_tree.array(charged_key)
        neutral_e = score_tree.array(neutral_key)
        true_e = trees['cluster'].array('cluster_ENG_CALIB_TOT')

        for idx in range(len(scores)):
            denominator = true_e[idx]
            if(denominator != 0.):
                # pick the regression that matches the cluster's classification
                if(scores[idx] > classification_threshold):
                    predicted = charged_e[idx]
                else:
                    predicted = neutral_e[idx]
                energy_ratio_hist.Fill(predicted / denominator)

    energy_ratio_hist.SetFillColorAlpha(color,1.)
    energy_ratio_hist.SetLineColorAlpha(color,1.)
    return energy_ratio_hist

def EnergyRatioHist2D(ur_trees, charged_key, neutral_key, class_min = 0., class_max = 1., nsteps = 20, nbins = 10000, xmin = 1.0e-3, xmax = 1.0e2):
    """Energy-ratio distributions as a function of the classification threshold.

    One 1D energy-ratio histogram (see EnergyRatioHist) is made for each of
    nsteps thresholds, evenly spaced from class_min to class_max inclusive,
    and copied into one row of a 2D histogram.

    The y-axis is laid out so that each row's bin *centre* is the threshold
    used for that row. This means the axis extends half a step below
    class_min and half a step above class_max.

    Parameters:
        ur_trees, charged_key, neutral_key: passed through to EnergyRatioHist.
        class_min, class_max: first and last classification threshold.
        nsteps: number of thresholds (rows of the 2D histogram).
        nbins, xmin, xmax: binning of the energy-ratio (x) axis.

    Returns:
        A ROOT TH2F with a unique (random) name.
    """
    # to use our existing infrastructure, we start off by making a bunch of 1D histograms, that correspond with "rows" in our 2D histogram
    classifier_thresholds = np.linspace(class_min, class_max, nsteps)
    hists = [EnergyRatioHist(ur_trees, charged_key, neutral_key, x, nbins, xmin, xmax) for x in classifier_thresholds]

    # Half the spacing between consecutive thresholds. Fall back to an
    # arbitrary non-zero width when there is only one threshold (or a
    # zero-width range), so that the axis never has zero extent.
    if(nsteps > 1 and class_max != class_min):
        half_step = 0.5 * (class_max - class_min) / (nsteps - 1)
    else:
        half_step = 0.5
    y_low = classifier_thresholds[0] - half_step
    y_high = classifier_thresholds[-1] + half_step

    # now construct a 2D histogram, and fill it from the 1D histograms
    title = 'Classifier Threshold vs. Predicted Energy / E_{CALIB}^{TOT};E_{pred} / E_{CALIB}^{TOT};Classifier Threshold;Count'
    h = rt.TH2F(str(uuid.uuid4()), title, nbins, xmin, xmax, nsteps, y_low, y_high)

    for i in range(nsteps):
        for j in range(nbins):
            h.SetBinContent(j+1, i+1, hists[i].GetBinContent(j+1)) # note the 1-indexing for histogram objects
    return h

In [None]:
from energy_ratio import EnergyRatioHist, EnergyRatioHist2D
rt.gStyle.SetOptStat(0)
alpha=0.5

# parameters for our energy ratio histograms (used for both the 1D and the 2D version)
nbins = 10000
xmin = 1.0e-3
xmax = 1.0e2
nsteps = 25
class_min = 0.3
class_max = 0.8

# one tall canvas, with the plots stacked vertically
plot_size = (1000,800)
n_plots = 3
c = rt.TCanvas(str(uuid.uuid4()),'network checks',plot_size[0],n_plots * plot_size[1])
c.Divide(1,n_plots)

# classification scores
class_hist = rt.TH1F(str(uuid.uuid4()), 'Classification score (charged likelihood);Score;Count',100,0.,1.)
for dfile, trees in ur_trees.items():
    for score in trees['score'].array('charged_likelihood_combo'): class_hist.Fill(score)
class_hist.SetFillColorAlpha(rt.kGreen,alpha)
class_hist.SetLineColorAlpha(rt.kGreen,alpha)
c.cd(1)
class_hist.Draw('HIST')
rt.gPad.SetLogy()

# regressed energy (most likely) / calibration hits
# The binning is passed explicitly so that this plot always matches the 2D plot below.
energy_ratio_hist = EnergyRatioHist(ur_trees, charged_key ='e_charged_00',neutral_key ='e_neutral_00',classification_threshold=classification_threshold, nbins=nbins, xmin=xmin, xmax=xmax)
c.cd(2)
energy_ratio_hist.Draw('HIST')
rt.gPad.SetLogx()
rt.gPad.SetLogy()

# regressed energy / calibration hits, as a function of our classification threshold
c.cd(3)
energy_ratio_2d = EnergyRatioHist2D(ur_trees, charged_key='e_charged_00', neutral_key = 'e_neutral_00', class_min=class_min, class_max=class_max, nsteps=nsteps, nbins=nbins, xmin=xmin, xmax=xmax)
energy_ratio_2d.SetBarWidth(0.4)
energy_ratio_2d.SetLineColor(plot_style.curve)
energy_ratio_2d.SetFillColor(plot_style.main)
energy_ratio_2d.Draw('CANDLEY3')
rt.gPad.SetLogx()
rt.gPad.SetRightMargin(0.2)

# Let's keep track of how many clusters have cluster_ENG_CALIB_TOT = 0.
# (These clusters are left out of the energy ratio histograms.)
n_tot = 0
zero_energies = 0
for dfile, trees in ur_trees.items():
    eng_calib_tot = trees['cluster'].array('cluster_ENG_CALIB_TOT')
    n_tot += len(eng_calib_tot)
    zero_energies += np.sum(eng_calib_tot == 0.)

if(n_tot > 0):
    print('Number of clusters with ENG_CALIB_TOT == 0: {val1:.2e} ({val2:.2f}% of clusters)'.format(val1 = zero_energies, val2 = 100. * zero_energies / n_tot))
else:
    # avoid dividing by zero when no clusters were loaded
    print('No clusters found -- cannot compute the fraction with ENG_CALIB_TOT == 0.')

c.SaveAs(plot_dir + '/' + 'cluster_plots.png')
c.Draw()

For comparing performance between different network configurations, it will be helpful to save the histograms of predicted/true energy to `ROOT` files. This way, we can easily combine a bunch of them later for a rigorous comparison.