## Training the Inclusive classifier with tf.keras using data in Parquet format with Petastorm

**tf.keras Inclusive classifier, GRU-based model** This notebook trains a neural network for the particle classifier using the Inclusive Classifier, using as input the full list of reconstructed particles and the High Level Features. Data is prepared in Parquet and ingested via Petastorm. TensorFlow data processing uses tf.data.  

Credits: this notebook is part of the work: 
- [Machine Learning Pipelines with Modern Big Data Tools for High Energy Physics Comput Softw Big Sci 4, 8 (2020)](https://rdcu.be/b4Wk9)  
- Code and data at: https://github.com/cerndb/SparkDLTrigger  

The model is a classifier implemented as the concatenation of a Dense Neural Network and a Recurrent Neural Network (GRU)
 - input: 14 high-level features and an array of 801 particles with 19 low-level features, described in [ Topology classification with deep learning to improve real-time event selection at the LHC](https://link.springer.com/epdf/10.1007/s41781-019-0028-1?author_access_token=eTrqfrCuFIP2vF4nDLnFfPe4RwlQNchNByi7wbcMAY7NPT1w8XxcX1ECT83E92HWx9dJzh9T9_y5Vfi9oc80ZXe7hp7PAj21GjdEF2hlNWXYAkFiNn--k5gFtNRj6avm0UukUt9M9hAH_j4UR7eR-g%3D%3D)
 - output: 3 classes, "W + jet", "QCD", "t tbar", see also [Machine Learning Pipelines with Modern Big Data Tools for High Energy Physics Comput Softw Big Sci 4, 8 (2020)](https://rdcu.be/b4Wk9)  
 - Open dataset: [download data](https://github.com/cerndb/SparkDLTrigger/tree/master/Data)


## Create the Keras model for the inclusive classifier

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential, Input, Model
from tensorflow.keras.layers import Masking, Dense, Activation, GRU, Dropout, concatenate

In [None]:
# Display the TensorFlow version used by this notebook
tf.version.VERSION

In [None]:
# Check that we have a GPU available: this lists the GPU devices visible to TensorFlow
# (an empty list means that no GPU was detected)
tf.config.list_physical_devices('GPU')

In [None]:
## GRU (recurrent) branch
# Input: a sequence of 801 particles, each described by 19 low-level features
gru_input = Input(shape=(801, 19), name='gru_input')
# Masking makes the downstream layers skip the timesteps whose features are all 0.
# (presumably the zero-padding of events with fewer than 801 particles - to be confirmed)
masked_particles = Masking(mask_value=0.)(gru_input)
sequence_summary = GRU(units=50, activation='tanh')(masked_particles)
gruBranch = Dropout(0.2)(sequence_summary)

In [None]:
## High Level Features (HLF) branch
# The 14 HLF values only go through a dropout layer before being merged with the GRU branch
hlf_input = Input(shape=(14,), name='hlf_input')
hlfBranch = Dropout(0.2)(hlf_input)

In [None]:
# Merge the two branches, then a small dense layer and a 3-way softmax output
merged_branches = concatenate([gruBranch, hlfBranch])
hidden = Dense(25, activation='relu')(merged_branches)
output = Dense(3, activation='softmax')(hidden)

In [None]:
# Functional model with two inputs (particle sequence, high level features) and one output
model = Model(inputs=[gru_input, hlf_input], outputs=output)

In [None]:
## Compile model
# The optimizer and the loss are selected by name, so Keras uses their default settings
optimizer = 'Adam'
loss = 'categorical_crossentropy'
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=["accuracy"],
)

In [None]:
# Print the summary of the model
model.summary()

## Load test and training data in Parquet format, using Petastorm

In [None]:
# Location of the training and test datasets.
# The datasets can be downloaded from
# ** https://github.com/cerndb/SparkDLTrigger/tree/master/Data **
#
# PATH is prepended as-is to the Parquet folder names, so it must end with "/"
# and have one of these forms:
# "file://<full_path>_on_filesystem/Parquet_folder/"
# "hdfs://<full_path_on_hdfs>/Parquet_folder/"
#
# For CERN users, data is already available on EOS
PATH = "file:///eos/project/s/sparkdltrigger/public/"

file_test_dataset = PATH + "testUndersampled_InclusiveClassifier.parquet"
file_train_dataset = PATH + "trainUndersampled_InclusiveClassifier.parquet"


In [None]:
# We use the petastorm library to load and feed the training and test data in Parquet format
# It makes use of the TensorFlow tf.data.Dataset API

import petastorm
from petastorm import make_batch_reader
from petastorm.tf_utils import make_petastorm_dataset

# Display the Petastorm version used by this notebook
petastorm.__version__

## Train the tf.keras model feeding data with Petastorm

In [None]:
#
# Train with TensorFlow using Petastorm to read Parquet files
# This performs a rebatching operation on the training dataset  to set explicitly the bach size,
# as otherwise Petastorm produces batches with Parquet rowgroup size, which is often too large. 
# 

batch_size = 128

with make_batch_reader(file_test_dataset, num_epochs = 1, shuffle_row_groups = False) as test_data:
    with make_batch_reader(file_train_dataset, num_epochs = 1, shuffle_row_groups = False) as train_data:
        # print("Number of training rows:", train_data.dataset.read().num_rows)
        #
        # Transform Parquet files into TensorFlow datasets (tf.data API)
        #
        test_dataset = ( make_petastorm_dataset(test_data)
                            .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
                            .map(lambda x: ((tf.reshape(x.GRU_input, [-1, 801, 19]), x.HLF_input), x.encoded_label))
                       )
        # use for debug
        # for record in test_dataset.take(1):
        #     print(record)
        train_dataset = ( make_petastorm_dataset(train_data)
                            .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
                            .map(lambda x: ((tf.reshape(x.GRU_input, [-1, 801, 19]), x.HLF_input), x.encoded_label))
                            .unbatch()  # change this for rebatch with tensorflow 2.11
                            .batch(batch_size)
                        )       
        #
        # Train the Keras model
        #
        num_epochs = 6
        %time history = model.fit(train_dataset, validation_data = test_dataset, \
                                  epochs = num_epochs, verbose=1)                               


In [None]:
# Save the model
# tf.keras.models.save_model(model, PATH+"mymodel" + ".tf", save_format='tf')

## Performance metrics

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt 
plt.style.use('seaborn-darkgrid')
# Graph with loss vs. epoch

plt.figure()
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(loc='upper right')
plt.title("HLF classifier loss")
plt.show()

In [None]:
# Graph with accuracy vs. epoch
%matplotlib notebook
plt.figure()
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.ylabel('Accuracy')
plt.xlabel('epoch')
plt.legend(loc='lower right')
plt.title("HLF classifier accuracy")
plt.show()

## Confusion Matrix

In [None]:
# model = tf.keras.models.load_model("./mymodel.tf")

In [None]:
import numpy as np

# Need to use workers_count=1 to avoid getting data potentially in different order at each execution
with make_batch_reader(file_test_dataset, num_epochs = 1, workers_count=1, shuffle_row_groups = False, shuffle_rows=False) as test_data:
    # The Petastorm reader yields raw Parquet batches: wrap it in a tf.data dataset and
    # apply the same reshape/mapping used for training, so that the model receives its
    # two inputs (GRU input, HLF input) in the expected shape
    test_dataset = ( make_petastorm_dataset(test_data)
                        .map(lambda x: ((tf.reshape(x.GRU_input, [-1, 801, 19]), x.HLF_input), x.encoded_label))
                   )
    # predict uses only the features of each (features, label) element
    y_pred = model.predict(test_dataset)

In [None]:
# Need to use workers_count=1 to avoid getting data potentially in different order at each execution
with make_batch_reader(file_test_dataset, num_epochs = 1, workers_count=1, shuffle_row_groups = False, shuffle_rows=False) as test_data:
    # The Petastorm reader yields one record per rowgroup with one field per column,
    # so it cannot be unpacked as (features, labels): extract the label column explicitly
    label_dataset = make_petastorm_dataset(test_data).map(lambda x: x.encoded_label)
    # Concatenate the per-batch label arrays into a single array, in reading order
    y_true = np.concatenate([labels.numpy() for labels in label_dataset])


In [None]:
from sklearn.metrics import accuracy_score

# Reduce the per-class arrays to class indices (position of the largest value in each row)
true_classes = np.argmax(y_true, axis=1)
predicted_classes = np.argmax(y_pred, axis=1)

test_accuracy = accuracy_score(true_classes, predicted_classes)
print('Accuracy of the classifier: {:.4f}'.format(test_accuracy))

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
labels_name = ['qcd', 'tt', 'wjets']
labels = [0,1,2]

# In the matrix returned by scikit-learn, rows are the true classes and columns the predicted classes
cm = confusion_matrix(np.argmax(y_true, axis=1), np.argmax(y_pred, axis=1), labels=labels)

## Normalize CM: divide each row by the number of true samples of that class.
## keepdims=True keeps the row sums as a column vector, so that the division is
## applied row by row (a flat vector of sums would be broadcast across the columns)
cm = cm / cm.sum(axis=1, keepdims=True).astype(float)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.xaxis.set_ticklabels(labels_name)
ax.yaxis.set_ticklabels(labels_name)
# The heatmap draws the matrix columns along x and the matrix rows along y
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
plt.show()

## ROC and AUC

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve (false/true positive rates) and area under the curve,
# computed for each of the 3 classes and stored by class index
fpr, tpr, roc_auc = {}, {}, {}

for class_idx in range(3):
    class_fpr, class_tpr, _ = roc_curve(y_true[:, class_idx], y_pred[:, class_idx])
    fpr[class_idx] = class_fpr
    tpr[class_idx] = class_tpr
    roc_auc[class_idx] = auc(class_fpr, class_tpr)

In [None]:
# Dictionary containing the ROC-AUC for the three classes
roc_auc

In [None]:
%matplotlib notebook

# Plot roc curve 
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')


plt.figure()
plt.plot(fpr[0], tpr[0], lw=2, 
         label='HLF classifier (AUC) = %0.4f' % roc_auc[0])
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Background Contamination (FPR)')
plt.ylabel('Signal Efficiency (TPR)')
plt.title('$tt$ selector')
plt.legend(loc="lower right")
plt.show()