# Boost and Rotate

In [1]:
import uproot as ur
import uproot_methods as urm
import numpy as np
import awkward
import matplotlib.pyplot as plt
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Dense
from keras.layers import Dropout
from keras.utils.np_utils import to_categorical   
from sklearn.metrics import roc_curve

import tools

# Input ntuple. This is a bare file name, so it is resolved relative to the
# working directory the notebook is run from.
filename = 'user.jagrundy.20736236._000001.MiniNTuple.root'

Using TensorFlow backend.


Load the ntuple and extract the data we need from the file.

In [2]:
# Open the ntuple through the project helper, asking it to sort by the "tag" field.
sort_key = "tag"
s_table = tools.open_file(filename, sort_by=sort_key)

sorting data by tag


Filters

In [3]:
# Event selection: keep only events where nbjets is 3 or 4 and nbtags is exactly 3.
# Events with more than 4 are deliberately ignored because they are rare.
n_bjets = s_table.nbjets
nb4 = (n_bjets == 3) | (n_bjets == 4)  # nbjets is 3 or 4
nt3 = s_table.nbtags == 3              # exactly 3 b-tags
nb4nt3 = nb4 & nt3                     # both conditions at once
events = s_table[nb4nt3]

Get jet locations, tag, truth

In [4]:
# Per-jet kinematics come from the "resolved_lv" field of the selected events.
resolved_lv = events["resolved_lv"]
pt, eta, phi, E = resolved_lv.pt, resolved_lv.eta, resolved_lv.phi, resolved_lv.E

# Per-jet flags. Per the original note, each tag[i] looks like
# [1, 1, 1, 0, 0, ...] with len >= 4.
tag = events["tag"]
truth = events["truth"]

Pad arrays

In [5]:
cutoff = 10       # number of jet slots kept per event
padding_val = 0   # filler written into slots beyond an event's last jet

# Pad and truncate in a single step. Passing maxlen=cutoff makes every result
# exactly `cutoff` columns wide; padding to the longest event and slicing
# [:, :cutoff] afterwards comes out narrower whenever no event has `cutoff`
# jets, which would break the later broadcasts that assume `cutoff` columns.
# truncating='post' drops entries from the end, i.e. keeps the first `cutoff`
# entries of long events, exactly like the slice did.
pad_kwargs = dict(maxlen=cutoff, padding='post', truncating='post')

# truth/tag use pad_sequences' default dtype and fill value
padded_true = pad_sequences(truth, **pad_kwargs)
padded_tag = pad_sequences(tag, **pad_kwargs)

# kinematics are stored as float32 and filled with padding_val
padded_pt = pad_sequences(pt, dtype='float32', value=padding_val, **pad_kwargs)
padded_eta = pad_sequences(eta, dtype='float32', value=padding_val, **pad_kwargs)
padded_phi = pad_sequences(phi, dtype='float32', value=padding_val, **pad_kwargs)
padded_E = pad_sequences(E, dtype='float32', value=padding_val, **pad_kwargs)

Filter a bit more for events where we have 3 b-jets correctly tagged (plus maybe one more untagged b-jet)

In [6]:
# untagged[i, j] = 1 where truth and tag disagree for jet slot j of event i
# (XOR flags any mismatch between the two), 0 where they agree.
untagged = np.logical_xor(padded_true, padded_tag).astype(int)

# n_untagged[i] = number of mismatching slots in the i-th event
n_untagged = np.count_nonzero(untagged, axis=1)

# Keep events with at most one mismatch; "_u" marks this selection.
at_most_one = n_untagged <= 1
untagged_u = untagged[at_most_one]
pt_u = padded_pt[at_most_one]
eta_u = padded_eta[at_most_one]
phi_u = padded_phi[at_most_one]
E_u = padded_E[at_most_one]
truth_u = padded_true[at_most_one]
tag_u = padded_tag[at_most_one]

# Drop "weird" events whose first three truth flags are not all 1.
# A boolean mask is used instead of popping from an index list while
# enumerating it: pop(i) removes by position, so after the first removal every
# later pop would discard the wrong event.
is_weird = ~np.all(truth_u[:, :3] == 1, axis=1)

# Show the offending events (truth row, then tag row) before removing them.
for weird_truth, weird_tag in zip(truth_u[is_weird], tag_u[is_weird]):
    print(weird_truth)
    print(weird_tag)

# Integer positions of the events that are kept (an int array even when empty).
non_weird_indices = np.flatnonzero(~is_weird)

untagged_u = untagged_u[non_weird_indices]
pt_u = pt_u[non_weird_indices]
eta_u = eta_u[non_weird_indices]
phi_u = phi_u[non_weird_indices]
E_u = E_u[non_weird_indices]
truth_u = truth_u[non_weird_indices]
tag_u = tag_u[non_weird_indices]


[1 1 0 0 0 0 0 0 0 0]
[1 1 1 0 0 0 0 0 0 0]


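Rotation time! Boost every jet into the rest frame of the event's summed four-vector, then rotate about z so that the sum of the first three jets has phi = 0.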

In [7]:
def _row_sum_broadcast(component, n_cols):
    """Sum `component` along axis 1 and repeat each row's total across `n_cols` columns."""
    totals = np.sum(component, axis=1).reshape(-1, 1)
    return np.repeat(totals, n_cols, axis=1)


# Four-vectors built from the padded (pt, eta, phi, E) arrays.
vectors = urm.TLorentzVectorArray.from_ptetaphie(pt_u, eta_u, phi_u, E_u)

# Per-event sum over all jet slots, repeated to `cutoff` columns so it lines
# up element-wise with `vectors`.
x_sum = _row_sum_broadcast(vectors.x, cutoff)
y_sum = _row_sum_broadcast(vectors.y, cutoff)
z_sum = _row_sum_broadcast(vectors.z, cutoff)
t_sum = _row_sum_broadcast(vectors.t, cutoff)
v_sum = urm.TLorentzVectorArray(x_sum, y_sum, z_sum, t_sum)

# "_b" = boosted: apply the negated boost vector of the summed four-vector
# to the jets and to the sum itself.
to_sum_frame = -v_sum.boostp3
vectors_b = vectors.boost(to_sum_frame)
v_sum_b = v_sum.boost(to_sum_frame)

# Filler slots come out with NaN eta (per the original note); set those to 0
# and rebuild the vectors from the cleaned components.
eta = np.nan_to_num(vectors_b.eta, nan=0.0)
vectors_b = urm.TLorentzVectorArray.from_ptetaphie(
    vectors_b.pt, eta, vectors_b.phi, vectors_b.E
)

# Sum of only the first 3 jets of each boosted event, broadcast as before.
lead_x = vectors_b.x[:, :3]
lead_y = vectors_b.y[:, :3]
lead_z = vectors_b.z[:, :3]
lead_t = vectors_b.t[:, :3]
x_sum3 = _row_sum_broadcast(lead_x, cutoff)
y_sum3 = _row_sum_broadcast(lead_y, cutoff)
z_sum3 = _row_sum_broadcast(lead_z, cutoff)
t_sum3 = _row_sum_broadcast(lead_t, cutoff)
v_sum3 = urm.TLorentzVectorArray(x_sum3, y_sum3, z_sum3, t_sum3)

# Rotate about z by minus the phi of the 3-jet sum, so that sum ends at phi = 0.
z_angle = -v_sum3.phi
vectors_r = vectors_b.rotatez(z_angle)
v_sum3 = v_sum3.rotatez(z_angle)

# Same NaN -> 0 clean-up of eta after the rotation.
eta = np.nan_to_num(vectors_r.eta, nan=0.0)
vectors_final = urm.TLorentzVectorArray.from_ptetaphie(
    vectors_r.pt, eta, vectors_r.phi, vectors_r.E
)

In [8]:
# Split every per-jet array into its first 3 columns ("_3") and the remaining
# columns ("_rest"). The earlier filtering kept only events whose first three
# truth flags are all 1, so only the remaining slots are still open.
lead = np.s_[:, :3]
rest = np.s_[:, 3:]

pt_final = vectors_final.pt
eta_final = vectors_final.eta
phi_final = vectors_final.phi

untagged_3 = untagged_u[lead]
untagged_rest = untagged_u[rest]
pt_3 = pt_final[lead]
pt_rest = pt_final[rest]
eta_3 = eta_final[lead]
eta_rest = eta_final[rest]
phi_3 = phi_final[lead]
phi_rest = phi_final[rest]

In [28]:
# Pick, for each event, the jet from _rest whose eta best cancels the summed
# eta of the first three jets (i.e. minimises |eta_rest + sum(eta_3)|).
n_lead = 3                  # jets already accounted for (the "_3" columns)
n_rest = cutoff - n_lead    # candidate slots in the "_rest" columns

eta_3_sums = np.repeat(np.sum(eta_3, axis=1).reshape(-1, 1), n_rest, axis=1)
eta_sums = np.abs(eta_rest + eta_3_sums)
lowest_vals = np.min(eta_sums, axis=1)
lowest_indices = np.argmin(eta_sums, axis=1)
# NOTE(review): padded (filler) slots had their eta set to 0 in the rotation
# step, so they take part in this minimum like real jets - confirm that this
# is intended.

# lowest values are typically less than 1, lower = more certainty.
# Use an arbitrary threshold: if the best value exceeds it, pick no jet.
thresh = 0.05

# "No pick" is encoded as the index one past the last _rest slot. It is derived
# from cutoff rather than hard-coded, so it stays correct if cutoff changes.
no_pick = n_rest
lowest_indices[lowest_vals > thresh] = no_pick

# One-hot encode the choice over the full jet axis. The extra trailing column
# absorbs the "no pick" index and is chopped off afterwards, so an event with
# no selection ends up as [0, ..., 0].
selection_index = lowest_indices + n_lead
selections = np.zeros((len(truth_u), cutoff + 1), dtype=int)
selections[np.arange(len(selection_index)), selection_index] = 1
selections = selections[:, :-1]

# compare to tag_u, truth_u
tools.evaluate_model(truth_u, tag_u, selections)

100%|██████████| 303925/303925 [00:02<00:00, 140628.73it/s]

    Total number of events: 303925
    Minus events ignored: 0, (0.00%)

    4th b-jet really exists:
        Correct 4th jet picked:         4.23%, 7234
        Incorrect 4th jet picked:       6.92%, 11819
        Event incorrectly ignored:      88.85%, 151818

    No 4th b-jet really exists:
        Correctly ignored event:        91.19%, 121333
        Incorrectly picked a 4th jet:   8.81%, 11721

    Or formatted in table form:
                    ____________________
                   |Truth-Matching      |
                   |____________________|
                   |4th exists  |No 4th |
     ______________|____________|_______|
    |4th |4th found|corr. 004.2%| 008.8%|
    |Jet |         |inco. 006.9%|       |
    |Reco|_________|____________|_______|
    |    |no 4th   |      088.8%| 091.2%|
    |____|_________|____________|_______|

    (columns add to 100% each)
    

