# Can I too be a youth?

In [1]:
import uproot as ur
import uproot_methods as urm

In [2]:
import numpy as np

First thing: open up the ntuple, make sure we have it loaded

In [3]:
# Open the MiniNTuple ROOT file with uproot and take the 'XhhMiniNtuple' tree from it.
# NOTE(review): the path is relative to the notebook's working directory. The traceback
# recorded for this cell names the file without the leading '../', so the saved output
# appears to come from an earlier version of this line -- re-run to confirm.
sm_file = ur.open("../user.jagrundy.20736236._000001.MiniNTuple.root")
sm_tree = sm_file['XhhMiniNtuple']

FileNotFoundError: [Errno 2] No such file or directory: 'user.jagrundy.20736236._000001.MiniNTuple.root'

In [4]:
# Print the tree object as a quick check that the lookup succeeded.
print(sm_tree)

NameError: name 'sm_tree' is not defined

What branches are available?

In [5]:
# Display the names of the branches stored in the tree.
sm_tree.keys()

NameError: name 'sm_tree' is not defined

For messing around, let's load some branches directly into memory

In [6]:
# Read the truth-label branch of the resolved jets into memory.
blabel = sm_tree.array('resolvedJets_HadronConeExclTruthLabelID')

NameError: name 'sm_tree' is not defined

In [7]:
# Read the DL1r FixedCutBEff_77 b-tag flag branch of the resolved jets into memory.
btag = sm_tree.array('resolvedJets_is_DL1r_FixedCutBEff_77')

NameError: name 'sm_tree' is not defined

In [8]:
# Read the four jet kinematic branches (pT, eta, phi, energy) into memory.
jet_branch_names = (
    'resolvedJets_pt',
    'resolvedJets_eta',
    'resolvedJets_phi',
    'resolvedJets_E',
)
pt, eta, phi, eng = [sm_tree.array(branch_name) for branch_name in jet_branch_names]

NameError: name 'sm_tree' is not defined

In [9]:
# Print the jet pT array that was just loaded.
print(pt)

NameError: name 'pt' is not defined

Let's check whether the jets in each event are ordered by pT. Verdict: they're not :( (this is fixed in a later version of this data, but I don't have it available yet)

In [10]:
# Walk through every event and report each adjacent pair of jets whose pT
# increases, i.e. every place where the jets are not in descending-pT order.
for ie, event in enumerate(pt):
    for ij in range(1, len(event)):
        jetpt, previous_pt = event[ij], event[ij - 1]
        if jetpt > previous_pt:
            print('jetpt {} {} greater than {} {} in event {}'.format(jetpt, ij, previous_pt, ij - 1, ie))

NameError: name 'pt' is not defined

A particular event that is not ordered: you can see that jet 0 and jet 1 are out of order

In [11]:
# Print the jet pT values of event 88774, the example of an unordered event.
print(pt[88774])

NameError: name 'pt' is not defined

Another way of loading the data: into a numpy array by name

In [12]:
# Load all jet branches used below in a single call; the result is keyed by
# branch name, decoded to str via namedecode.
branch_names = [
    'resolvedJets_pt',
    'resolvedJets_eta',
    'resolvedJets_phi',
    'resolvedJets_E',
    'resolvedJets_HadronConeExclTruthLabelID',
    'resolvedJets_is_DL1r_FixedCutBEff_77',
]
branches = sm_tree.arrays(branches=branch_names, namedecode='utf-8')

NameError: name 'sm_tree' is not defined

In [13]:
# Print the loaded branch collection.
print(branches)

NameError: name 'branches' is not defined

Now we can put them into an awkward.Table, which is like a pandas DataFrame

In [14]:
import awkward
# Wrap the loaded branches in an awkward.Table so columns can be addressed by name.
table = awkward.Table(branches)

NameError: name 'branches' is not defined

In [15]:
# Print the table built from the branches.
print(table)

NameError: name 'table' is not defined

This is a trick from uproot methods: you can make a TLorentzVector type object directly from the four branches that specify the pt/eta/phi/e like this

In [16]:
# Build the jet four-vector array from the (pT, eta, phi, E) columns of the table.
jet_kinematic_columns = [
    table[column_name]
    for column_name in ('resolvedJets_pt', 'resolvedJets_eta', 'resolvedJets_phi', 'resolvedJets_E')
]
lv = urm.TLorentzVectorArray.from_ptetaphie(*jet_kinematic_columns)

NameError: name 'table' is not defined

Now this is a trick to sort the 4-vectors. The .pt.argsort() call gives us a list of indices, per event, that sorts the jets. You can just apply this to a column of the table, and it'll sort the elements accordingly. So we're going to make a new table (s_table, for sorted) that copies over the relevant information and sorts it via the [indices] trick.

In [17]:
# Attach the four-vector array to the table as a new 'resolved_lv' column.
table['resolved_lv'] = lv

NameError: name 'lv' is not defined

In [18]:
# Per-event index arrays that order the jets by pT.
# NOTE(review): the sort direction is whatever argsort() defaults to for this array
# type -- presumably descending (leading jet first); confirm for the installed version.
indices = table['resolved_lv'].pt.argsort()

NameError: name 'table' is not defined

In [19]:
# Start an empty table that will receive the pT-sorted columns.
s_table = awkward.Table()

In [20]:
# Copy the four-vectors into the sorted table, reordered by the pT indices.
s_table['resolved_lv'] = table['resolved_lv'][indices]

NameError: name 'table' is not defined

In [21]:
# Copy the truth labels with the same reordering so they stay aligned with the jets.
s_table['resolvedJets_HadronConeExclTruthLabelID'] = table['resolvedJets_HadronConeExclTruthLabelID'][indices]

NameError: name 'table' is not defined

In [22]:
# Copy the b-tag flags with the same reordering so they stay aligned with the jets.
s_table['resolvedJets_is_DL1r_FixedCutBEff_77'] = table['resolvedJets_is_DL1r_FixedCutBEff_77'][indices]

NameError: name 'table' is not defined

And here we see that our old problem child event is properly sorted.

In [23]:
# Display the jet pT values of event 88774 as stored in the sorted table.
s_table[88774]['resolved_lv'].pt

IndexError: index 88774 out of bounds for length 0

Define some convenience columns here, for number of various things in the dataset

In [24]:
# Per-event counters: how many jets carry each truth label value (5, 4, 14, 15)
# and how many jets have the b-tag flag set.
count_nonzero = awkward.AwkwardArray.count_nonzero
truth_label = s_table['resolvedJets_HadronConeExclTruthLabelID']
tag_flag = s_table['resolvedJets_is_DL1r_FixedCutBEff_77']

s_table['nbjets'] = count_nonzero(truth_label == 5)
s_table['nbtags'] = count_nonzero(tag_flag == 1)
s_table['nfour'] = count_nonzero(truth_label == 4)
s_table['nfourteen'] = count_nonzero(truth_label == 14)
s_table['nfifteen'] = count_nonzero(truth_label == 15)


ValueError: no column named 'resolvedJets_HadronConeExclTruthLabelID'

Let's make some quick plots to understand what's going on.

In [25]:
import matplotlib as mpl
import matplotlib.pyplot as plt

In [26]:
# Histogram of the number of truth b-jets per event, with log-scaled counts.
# A fresh figure/axes pair is created directly: calling plt.cla()/plt.clf() first
# would implicitly create (and then display) an extra empty figure, and
# plt.legend() is omitted because no plotted artist carries a label.
fig, ax = plt.subplots()
fig.patch.set_facecolor('white')
ax.hist(s_table.nbjets, bins=8)
ax.set_yscale('log')
ax.set_xlabel('Number of b-jets')
plt.show()

AttributeError: no column named 'nbjets'

Define some filters. We can do this by logical requirements on the bjets and btags.

In [27]:
# Boolean per-event masks on the truth b-jet count and the b-tag count.
n_truth_b = s_table.nbjets
n_tagged = s_table.nbtags
nb4, nb3 = n_truth_b == 4, n_truth_b == 3
nt4, nt3 = n_tagged == 4, n_tagged == 3

# Events with exactly four truth b-jets, exactly three tags, and no jet
# carrying truth label 4, 14 or 15.
no_other_labels = (s_table.nfour == 0) & (s_table.nfourteen == 0) & (s_table.nfifteen == 0)
nb4nt3 = nb4 & nt3 & no_other_labels

AttributeError: no column named 'nbjets'

In [28]:
# Histogram of the number of b-tags in events with exactly four truth b-jets.
# A fresh figure/axes pair is created directly: calling plt.cla()/plt.clf() first
# would implicitly create (and then display) an extra empty figure, and
# plt.legend() is omitted because no plotted artist carries a label.
fig, ax = plt.subplots()
fig.patch.set_facecolor('white')
ax.hist(s_table[nb4].nbtags, bins=6)
ax.set_yscale('log')
ax.set_xlabel('Number of b-tags, with 4 b-jets')
plt.show()

NameError: name 'nb4' is not defined

In [29]:
# Number of events passing the nb4nt3 selection.
len(s_table[nb4nt3])

NameError: name 'nb4nt3' is not defined

In [30]:
# Truth labels of the jets in the events passing the nb4nt3 selection.
s_table['resolvedJets_HadronConeExclTruthLabelID'][nb4nt3]

ValueError: no column named 'resolvedJets_HadronConeExclTruthLabelID'

In [31]:
# b-tag flags of the jets in the events passing the nb4nt3 selection.
s_table['resolvedJets_is_DL1r_FixedCutBEff_77'][nb4nt3]

ValueError: no column named 'resolvedJets_is_DL1r_FixedCutBEff_77'

What's the maximum number of jets we have in an event? 18, wow

In [32]:
# Largest per-event entry count in the b-tag column, i.e. the maximum jet multiplicity.
np.max(awkward.AwkwardArray.count(s_table['resolvedJets_is_DL1r_FixedCutBEff_77']))

ValueError: no column named 'resolvedJets_is_DL1r_FixedCutBEff_77'

Let's just truncate this to the first 10 jets per event... the syntax is like this

In [33]:
# Keep at most the first 10 jets of every event. The leading ':' selects all events
# and ':10' slices within each event; a bare [:10] would instead return the first
# 10 events with all of their jets.
s_table['resolvedJets_HadronConeExclTruthLabelID'][:, :10]

ValueError: no column named 'resolvedJets_HadronConeExclTruthLabelID'

Here's a plot of the number of jets, justifying that truncation as maybe being ok

In [34]:
# Histogram of the jet multiplicity per event, with log-scaled counts.
# A fresh figure/axes pair is created directly: calling plt.cla()/plt.clf() first
# would implicitly create (and then display) an extra empty figure, and
# plt.legend() is omitted because no plotted artist carries a label.
fig, ax = plt.subplots()
fig.patch.set_facecolor('white')
ax.hist(awkward.AwkwardArray.count(s_table['resolvedJets_is_DL1r_FixedCutBEff_77']))
ax.set_yscale('log')
ax.set_xlabel('Number of jets')
plt.show()

ValueError: no column named 'resolvedJets_is_DL1r_FixedCutBEff_77'

truncate to 10? probably ok

# Here be ML dragons, where code remains a bit uglier than I would like!

In [35]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [36]:
# Turn the variable-length per-event jet lists of the selected events into
# fixed-width 2-D arrays with exactly MAX_JETS columns.
#
# maxlen/truncating are passed explicitly so the width is always MAX_JETS:
# padding to the longest event and slicing afterwards yields fewer columns when
# no selected event has MAX_JETS jets, which would not match the fixed input
# size of the network. truncating='post' keeps the leading jets (the Keras
# default, 'pre', would drop them instead).
MAX_JETS = 10
pad_options = dict(maxlen=MAX_JETS, padding='post', truncating='post')

# Select once and reuse for the three kinematic quantities.
selected_lv = s_table['resolved_lv'][nb4nt3]

# Integer columns are padded with 0 (the pad_sequences default).
padded_true = pad_sequences(s_table['resolvedJets_HadronConeExclTruthLabelID'][nb4nt3], **pad_options)
padded_tag  = pad_sequences(s_table['resolvedJets_is_DL1r_FixedCutBEff_77'][nb4nt3], **pad_options)

# Kinematic columns are padded with -10 as the "no jet" marker.
padded_pt   = pad_sequences(selected_lv.pt, dtype='float32', value=-10, **pad_options)
padded_eta  = pad_sequences(selected_lv.eta, dtype='float32', value=-10, **pad_options)
padded_phi  = pad_sequences(selected_lv.phi, dtype='float32', value=-10, **pad_options)


ValueError: no column named 'resolvedJets_HadronConeExclTruthLabelID'

In [37]:
# Histogram of every entry of the padded truth-label array, with log-scaled counts.
# A fresh figure/axes pair is created directly: calling plt.cla()/plt.clf() first
# would implicitly create (and then display) an extra empty figure, and
# plt.legend() is omitted because no plotted artist carries a label.
fig, ax = plt.subplots()
fig.patch.set_facecolor('white')
ax.hist(padded_true.flatten())
ax.set_yscale('log')
ax.set_xlabel('Padded truth label ID')
plt.show()

NameError: name 'padded_true' is not defined

In [38]:
# Print the first ten rows of the padded truth-label array.
print(padded_true[:10])

NameError: name 'padded_true' is not defined

In [39]:
# Print the first ten rows of the padded b-tag array.
print(padded_tag[:10])

NameError: name 'padded_tag' is not defined

In [40]:
# Per event, count the jet slots where exactly one of (truth label, b-tag flag) is non-zero,
# i.e. where truth and tag disagree.
true_tag_diff = np.count_nonzero(np.logical_xor(padded_true, padded_tag),axis=1)

NameError: name 'padded_true' is not defined

In [41]:
# Print truth labels and tag flags for the events with more than one truth/tag disagreement.
print(padded_true[true_tag_diff > 1])
print(padded_tag[true_tag_diff > 1])

NameError: name 'padded_true' is not defined

In [42]:
# Keep the kinematics of the events with exactly one truth/tag disagreement.
single_disagreement = true_tag_diff == 1
pc_pt, pc_eta, pc_phi = (
    padded_pt[single_disagreement],
    padded_eta[single_disagreement],
    padded_phi[single_disagreement],
)

NameError: name 'padded_pt' is not defined

In [43]:
# For the events with exactly one disagreement: a 0/1 row with a single 1 in the
# slot where truth label and b-tag flag differ.
pc_missedjet = np.logical_xor(padded_true, padded_tag)[true_tag_diff == 1].astype(int)

NameError: name 'padded_true' is not defined

I think missed jet is already one-hot encoded basically? nice.

In [44]:
from sklearn.preprocessing import StandardScaler

In [45]:
# One independent standardiser per kinematic quantity.
scaler_pt, scaler_eta, scaler_phi = StandardScaler(), StandardScaler(), StandardScaler()

In [46]:
# Fit each scaler on its padded array and standardise it in the same call.
# NOTE(review): the scalers are fitted on all selected events, before the
# train/val/test split is made, so held-out events influence the scaling.
# NOTE(review): the -10 padding entries are part of the fitted data as well,
# so they enter the statistics the scalers compute -- confirm this is intended.
s_pc_pt = scaler_pt.fit_transform(pc_pt)
s_pc_eta = scaler_eta.fit_transform(pc_eta)
s_pc_phi = scaler_phi.fit_transform(pc_phi)

NameError: name 'pc_pt' is not defined

In [47]:
from sklearn.model_selection import ShuffleSplit
def splitTVT(input, trainfrac = 0.8, testfrac = 0.2):
    """Split the samples of ``input`` into disjoint train / validation / test index sets.

    The validation fraction is whatever remains: ``1 - trainfrac - testfrac``.
    Both shuffles use ``random_state=0``, so the split is reproducible.

    Parameters
    ----------
    input : array-like
        Data to split; only its number of samples is used.
    trainfrac : float
        Fraction of samples assigned to the training set.
    testfrac : float
        Fraction of samples assigned to the test set.

    Returns
    -------
    train_index, val_index, test_index
        Indices into ``input``. ``val_index`` is an empty list when there is no
        validation fraction.
    """
    valfrac = 1.0 - trainfrac - testfrac
    # Floating-point subtraction can leave a residue of ~1e-17 (e.g. 1.0 - 0.7 - 0.3);
    # such a residue must not be mistaken for a request for a validation set.
    has_validation = valfrac > 1e-9

    # First split: training samples vs. everything held out (test + validation).
    train_split = ShuffleSplit(n_splits=1, test_size=testfrac + valfrac, random_state=0)
    # advance the generator once with the next function
    train_index, testval_index = next(train_split.split(input))

    if has_validation:
        testval_split = ShuffleSplit(
            n_splits=1, test_size=valfrac / (valfrac + testfrac), random_state=0)
        # split() returns positions *within* testval_index, not indices into
        # ``input``; map them back so test/val never point at training samples.
        test_pos, val_pos = next(testval_split.split(testval_index))
        test_index = testval_index[test_pos]
        val_index = testval_index[val_pos]
    else:
        test_index = testval_index
        val_index = []

    return train_index, val_index, test_index

In [48]:
# 70% of the events for training, 20% for testing; the remainder goes to validation.
train, val, test = splitTVT(pc_missedjet, trainfrac=0.7, testfrac=0.2)

NameError: name 'pc_missedjet' is not defined

In [49]:
# Place the scaled pt, eta and phi arrays side by side to form one feature row per event.
s_pc_in = np.column_stack((s_pc_pt, s_pc_eta, s_pc_phi))

NameError: name 's_pc_pt' is not defined

In [50]:
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Dense
from keras.layers import Dropout

In [51]:
# Fully connected classifier: 30 inputs -> 30 -> 30 -> 15 (all ReLU) -> 10-way softmax.
dense_stack = [
    Dense(30, input_dim=30, kernel_initializer='normal', activation='relu'),
    Dense(30, activation='relu'),
    Dense(15, activation='relu'),
    Dense(10, kernel_initializer='normal', activation='softmax'),
]
model = Sequential(dense_stack)

# Compile with Adam and categorical cross-entropy, tracking accuracy.
optimizer = Adam(lr=5e-5)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])

In [52]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 30)                930       
_________________________________________________________________
dense_2 (Dense)              (None, 30)                930       
_________________________________________________________________
dense_3 (Dense)              (None, 15)                465       
_________________________________________________________________
dense_4 (Dense)              (None, 10)                160       
Total params: 2,485
Trainable params: 2,485
Non-trainable params: 0
_________________________________________________________________


In [53]:
# Train on the training subset and monitor loss/accuracy on the validation subset.
training_inputs, training_targets = s_pc_in[train], pc_missedjet[train]
validation_pair = (s_pc_in[val], pc_missedjet[val])
history = model.fit(
    training_inputs,
    training_targets,
    validation_data=validation_pair,
    epochs=200,
    batch_size=200,
    verbose=1,
)

NameError: name 's_pc_in' is not defined

In [54]:
# Training curves: one figure for accuracy, one for loss, each comparing the
# training and validation values per epoch.
# Each figure is created directly with plt.subplots(); calling plt.cla()/plt.clf()
# beforehand would implicitly create (and then display) an extra empty figure.
for history_key, quantity in (('acc', 'accuracy'), ('loss', 'loss')):
    fig, ax = plt.subplots()
    fig.patch.set_facecolor('white')
    ax.plot(history.history[history_key])
    ax.plot(history.history['val_' + history_key])
    ax.set_title('model ' + quantity)
    ax.set_ylabel(quantity)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'val'], loc='upper left')
    plt.show()


NameError: name 'history' is not defined

In [55]:
# Network output for every event in s_pc_in (training, validation and test alike).
nn_score = model.predict(s_pc_in)

NameError: name 's_pc_in' is not defined

In [56]:
# Print the network scores of the first ten test events. The name `split` is not
# defined anywhere in this notebook; the test indices from splitTVT are used instead.
print(nn_score[test][:10])

NameError: name 'nn_score' is not defined

In [57]:
# Print the target rows of the first ten test events. The name `split` is not
# defined anywhere in this notebook; the test indices from splitTVT are used instead.
print(pc_missedjet[test][:10])

NameError: name 'pc_missedjet' is not defined

In [58]:
# Rank the jet slots of every test event by network score: argsort gives ascending
# order, and reversing each row puts the highest-scoring slot in column 0.
best_index_asc = nn_score[test].argsort(axis=1)
best_index = best_index_asc[:,::-1]

NameError: name 'nn_score' is not defined

In [59]:
# Print the per-event slot ranking.
print(best_index)

NameError: name 'best_index' is not defined

In [60]:
# Target rows of the test subset.
test_mj = pc_missedjet[test]

NameError: name 'pc_missedjet' is not defined

In [61]:
# Reorder each target row by the slot ranking, so column 0 holds the target value
# of the slot the network scored highest.
s_test_mj = np.take_along_axis(test_mj, best_index, axis=1)

NameError: name 'test_mj' is not defined

In [62]:
# Print the rank-ordered target rows.
print(s_test_mj)

NameError: name 's_test_mj' is not defined

In [63]:
# Network scores of the test subset.
test_nn_score = nn_score[test]

NameError: name 'nn_score' is not defined

In [64]:
# Reorder each score row by the slot ranking, so column 0 holds the highest score.
s_test_score = np.take_along_axis(test_nn_score, best_index, axis=1)

NameError: name 'test_nn_score' is not defined

In [65]:
from keras.utils.np_utils import to_categorical   


In [66]:
# One-hot encode (2 classes) the target value of the top-ranked slot.
# NOTE(review): the following cell reassigns `max_true` to the plain 0/1 column,
# so this one-hot result is never used downstream.
max_true = to_categorical(s_test_mj[:,0], num_classes = 2)

NameError: name 's_test_mj' is not defined

In [67]:
# Target value of the top-ranked slot for each test event: 1 when the slot the
# network scored highest is the one flagged in the target row, 0 otherwise.
max_true = s_test_mj[:,0]

NameError: name 's_test_mj' is not defined

In [68]:
# Print the top-slot target values.
print(max_true)

NameError: name 'max_true' is not defined

In [69]:
from sklearn.metrics import roc_curve

In [70]:
# Print the top-slot target values (same statement as the earlier print cell).
print(max_true)

NameError: name 'max_true' is not defined

In [71]:
# Highest network score of each test event.
max_val = s_test_score[:,0]

NameError: name 's_test_score' is not defined

In [72]:
# ROC curve with the top score as the discriminant and max_true == 1 as the positive class.
fpr, tpr, thresholds = roc_curve(max_true, max_val, pos_label=1)

NameError: name 'max_true' is not defined

In [73]:
# ROC curve of the network against the diagonal reference line.
# The figure is created directly with plt.subplots(); calling plt.cla()/plt.clf()
# beforehand would implicitly create (and then display) an extra empty figure.
fig, ax = plt.subplots()
fig.patch.set_facecolor('white')

ax.plot([0, 1], [0, 1], 'k--')
ax.plot(fpr, tpr, label='NN')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc='best')
plt.show()

NameError: name 'fpr' is not defined