In [1]:
#!/usr/bin/env python
# encoding: utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
# Import TensorFlow / Keras
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Flatten , Convolution2D, MaxPooling2D , Lambda, Conv2D, Activation,Concatenate
from tensorflow.keras.layers import ActivityRegularization
from tensorflow.keras.optimizers import Adam , SGD , Adagrad
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, CSVLogger, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers , initializers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import NumpyArrayIterator



from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold
# from xgboost import XGBClassifier
import tensorflow.keras.backend as K
from sklearn import metrics

# !pip3 install keras-tuner --upgrade
# !pip3 install autokeras
import kerastuner as kt
import autokeras as ak

# Import general-purpose libraries (numerics, plotting, stdlib helpers)
import numpy as np
import matplotlib.pyplot as plt
import time
import pandas as pd
import importlib
import os
import sys
import logging

# Reload the logging module before configuring it -- presumably so that
# basicConfig() is not ignored when a handler was already installed earlier
# in this kernel session.
importlib.reload(logging)
logging.basicConfig(level=logging.INFO)

# Thread caps exported for numexpr.
# NOTE(review): pandas is imported before this point; if numexpr reads these
# variables at import time they may be set too late to matter -- confirm.
os.environ.update({
    'NUMEXPR_MAX_THREADS': '64',
    'NUMEXPR_NUM_THREADS': '64',
})


# Restrict TensorFlow to the first physical GPU and expose it as a single
# logical device capped at 4000 MB of memory.
# The setup is skipped when no GPU is present, so the notebook still runs on
# a CPU-only machine instead of failing with an IndexError on gpus[0].
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        logging.info(e)
else:
    logging.info("No GPU detected; TensorFlow will run on CPU.")






logging.info("Tensorflow Version is {}".format(tf.__version__))
logging.info("Keras Version is {}".format(tf.keras.__version__))
from tensorflow.python.client import device_lib
logging.info(device_lib.list_local_devices())
tf.device('/device:XLA_GPU:0')

!nvidia-smi

INFO:root:Tensorflow Version is 2.4.1
INFO:root:Keras Version is 2.4.0
INFO:root:[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14115177251302629231
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 4194304000
locality {
  bus_id: 2
  numa_node: 1
  links {
  }
}
incarnation: 15681413099908745949
physical_device_desc: "device: 0, name: A100-SXM-80GB, pci bus id: 0000:48:00.0, compute capability: 8.0"
]


1 Physical GPUs, 1 Logical GPU
Sat Dec 11 03:36:03 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM-80GB       On   | 00000000:48:00.0 Off |                    0 |
| N/A   33C    P0    95W / 400W |  54723MiB / 81251MiB |     99%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+--------------------------------------------------------

Ref: https://keras.io/keras_tuner/#quick-introduction  
Ref: https://keras.io/guides/keras_tuner/getting_started/   
Ref: https://keras.io/api/keras_tuner/tuners/base_tuner/

In [3]:
# Load a previously saved network from disk. compile=False means the model is
# not compiled after loading, which is enough for summary() and predict().
tuned_model_path = "./DNN_Model_Hyper_Tuning/Universality/Tuning.h5"
model = load_model(tuned_model_path, compile=False)

In [4]:
# Print the layer-by-layer architecture and parameter counts of the loaded model.
model.summary()

Model: "Model_DNN_Pythia_Default"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 608)               4256      
_________________________________________________________________
dense_1 (Dense)              (None, 224)               136416    
_________________________________________________________________
dense_2 (Dense)              (None, 928)               208800    
_________________________________________________________________
dense_3 (Dense)              (None, 288)               267552    
_________________________________________________________________
dense_4 (Dense)              (None, 1024)              295936    
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)    

In [2]:
%%time

HOMEPATH = "/dicos_ui_home/alanchung/Universality_Boosetd_Higgs/"
# Data_High_Level_Features_path =  HOMEPATH + "Data_High_Level_Features/"
# savepath = HOMEPATH + "Data_ML/"

# Fixed seed so the row shuffle -- and therefore any split that is later taken
# by row position -- is reproducible across kernel restarts.
SHUFFLE_SEED = 42

# Pt Range Study: rows are kept when pt_min <= PTJ_0 < pt_max.
pt_min, pt_max = 300, 500
# Mass window: rows are kept when MJ_MIN <= MJ_0 < MJ_MAX.
MJ_MIN, MJ_MAX = 110, 160

# Samples to load; each name maps to HOMEPATH/Notebook/KFold/<name>.csv.
# Other samples that were toggled off in the original cell:
#   herwig_ang_train, pythia_vin_train, pythia_dip_train, sherpa_def_train
DATASET_NAMES = ["pythia_def_train"]

data_train = {}

try:
    # Load, filter and shuffle each sample.
    for element in DATASET_NAMES:
        tmp = pd.read_csv(HOMEPATH + "Notebook/KFold/" + str(element) + ".csv")
        in_pt_window = (tmp["PTJ_0"] >= pt_min) & (tmp["PTJ_0"] < pt_max)
        in_mass_window = (tmp["MJ_0"] >= MJ_MIN) & (tmp["MJ_0"] < MJ_MAX)
        tmp = tmp[in_pt_window & in_mass_window]
        data_train[element] = shuffle(tmp, random_state=SHUFFLE_SEED)

    logging.info("All Files are loaded!")

    logging.info("H jet : QCD jet = 1 : 1")

    # Per-sample class counts, read from the "PRO" column.
    for element, frame in data_train.items():
        process = frame["PRO"]
        logging.info("{}, # of H jet: {}".format(element, int((process == "H").sum())))
        logging.info("{}, # of QCD jet: {}".format(element, int((process == "QCD").sum())))

    logging.info("\r")

    # One column per sample, so each name lines up with its own row count
    # (the original paired the last sample's name with the first sample's count).
    train = [len(frame) for frame in data_train.values()]
    logging.info("{:^8}".format("") + "".join("{:^15}".format(str(name)) for name in data_train))
    logging.info("{:^8}".format("Train #") + "".join("{:^15}".format(count) for count in train))

    # Column names of the first loaded sample.
    total_list = next(iter(data_train.values())).columns

    logging.info("total_list: {}".format(total_list))

except Exception as err:
    # Keep the original ValueError contract, but log the traceback and chain
    # the underlying error so the real cause (missing file, missing column,
    # ...) is not hidden. KeyboardInterrupt is no longer swallowed.
    logging.exception("Please create training, test and validation datasets.")
    raise ValueError("Please create training, test and validation datasets.") from err
    
    
    

INFO:root:All Files are loaded!
INFO:root:H jet : QCD jet = 1 : 1
INFO:root:pythia_def_train, # of H jet: 153728
INFO:root:pythia_def_train, # of QCD jet: 153728
INFO:root:
INFO:root:        pythia_def_train
INFO:root:Train #     307456     
INFO:root:total_list: Index(['GEN', 'SHO', 'PRO', 'MJ_0', 'PTJ_0', 'eta_0', 'phi_0', 't21_0',
       'D21_0', 'D22_0', 'C21_0', 'C22_0', 'MJ', 'PTJ', 'eta', 'phi', 't21',
       'D21', 'D22', 'C21', 'C22', 'weight', 'eventindex', 'WEIGHT', 'index',
       'target'],
      dtype='object')


CPU times: user 939 ms, sys: 162 ms, total: 1.1 s
Wall time: 1.1 s


In [3]:
%%time

# Columns used as network inputs.
features = ["MJ_0","t21_0","D21_0","D22_0","C21_0","C22_0"]

# Split the sample by row position: first 8/10 for training, next 1/10 for
# validation, remainder for testing.
pythia_frame = data_train["pythia_def_train"]
length = len(pythia_frame)
train_end = int(length/10*8)
valid_end = int(length/10*9)

training_data = pythia_frame.iloc[:train_end]
validation_data = pythia_frame.iloc[train_end:valid_end]
test_data = pythia_frame.iloc[valid_end:]

n_training, n_validation, n_test = len(training_data), len(validation_data), len(test_data)
logging.info("training length: {}".format(n_training))
logging.info("validation length: {}".format(n_validation))
logging.info("test length: {}".format(n_test))
# The next two lines should report the same number: the sum of the three
# parts, then the size of the unsplit sample.
logging.info("Total length: {}".format(n_training + n_validation + n_test))
logging.info("Total length: {}".format(length))



INFO:root:training length: 245964
INFO:root:validation length: 30746
INFO:root:test length: 30746
INFO:root:Total length: 307456
INFO:root:Total length: 307456


CPU times: user 1.15 ms, sys: 1.6 ms, total: 2.75 ms
Wall time: 2.03 ms


In [26]:
def DNN_Model(hp):
    """Build and compile one candidate network for the hyperparameter search.

    Hyperparameters requested from ``hp`` (in this order):
      * ``num_layers`` -- number of hidden ReLU layers, 1 to 6.
      * ``units_<i>``  -- width of hidden layer ``i``, 32 to 1024 in steps of 32.
      * ``dropout``    -- whether a Dropout(rate=0.01) layer is inserted
                          before the output layer.
      * ``lr``         -- Adam learning rate, log-sampled in [1e-5, 1e-2].

    The input width is ``len(features)`` (``features`` is read from the
    notebook namespace). The output is a single sigmoid unit; the model is
    compiled with binary cross-entropy loss and the accuracy metric.

    Returns:
        The compiled ``Sequential`` model named "Model_DNN_Pythia_Default".
    """
    network = Sequential(name="Model_DNN_Pythia_Default")
    network.add(keras.Input(shape=(len(features),), name='input'))

    hidden_count = hp.Int("num_layers", 1, 6)
    for layer_index in range(hidden_count):
        # Tune number of units separately for every hidden layer.
        width = hp.Int(f"units_{layer_index}", min_value=32, max_value=1024, step=32)
        network.add(keras.layers.Dense(units=width, activation='relu'))

    use_dropout = hp.Boolean("dropout")
    if use_dropout:
        network.add(keras.layers.Dropout(rate=0.01))

    network.add(Dense(1, activation='sigmoid'))

    optimizer = keras.optimizers.Adam(
        learning_rate=hp.Float("lr", min_value=1e-5, max_value=1e-2, sampling="log")
    )
    network.compile(
        loss="binary_crossentropy",
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return network



In [27]:
%%time
# Random search over the DNN_Model search space, ranked by validation loss.
# A fixed seed makes the sampled hyperparameter combinations repeatable
# between runs (weight initialisation is not controlled by this seed).
TUNER_SEED = 42

# NOTE(review): the directory is spelled "Tunning", while another cell loads a
# model from "DNN_Model_Hyper_Tuning". The spelling is left unchanged on
# purpose: overwrite=True discards any existing results under
# directory/project_name, so renaming could erase that other folder.
tuner = kt.RandomSearch(hypermodel=DNN_Model,
                        objective="val_loss",
                        max_trials=3,
                        executions_per_trial=2, #The number of models that should be built and fit for each trial
                        seed=TUNER_SEED,
                        overwrite=True,
                        directory="DNN_Model_Hyper_Tunning",
                        project_name="Universality"
                        )
tuner.search_space_summary()

Search space summary
Default search space size: 4
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 6, 'step': 1, 'sampling': None}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 1024, 'step': 32, 'sampling': None}
dropout (Boolean)
{'default': False, 'conditions': []}
lr (Float)
{'default': 1e-05, 'conditions': [], 'min_value': 1e-05, 'max_value': 0.01, 'step': None, 'sampling': 'log'}
CPU times: user 147 ms, sys: 156 µs, total: 147 ms
Wall time: 147 ms


In [28]:
# Run the search: every candidate is fitted for 5 epochs on the training part
# and scored on the validation part.
search_inputs = training_data[features]
search_targets = np.asarray(training_data["target"])
search_validation = (validation_data[features], np.asarray(validation_data["target"]))

tuner.search(
    search_inputs,
    search_targets,
    epochs=5,
    validation_data=search_validation,
)

Trial 3 Complete [00h 01m 49s]
val_loss: 0.4983366131782532

Best val_loss So Far: 0.4983366131782532
Total elapsed time: 00h 06m 27s
INFO:tensorflow:Oracle triggered exit


In [29]:
# Print the hyperparameters and objective score of the completed trials.
tuner.results_summary()

Results summary
Results in DNN_Model_Hyper_Tunning/Universality
Showing 10 best trials
Objective(name='val_loss', direction='min')
Trial summary
Hyperparameters:
num_layers: 1
units_0: 288
dropout: False
lr: 0.001856992770979149
units_1: 160
units_2: 96
units_3: 672
units_4: 864
Score: 0.4983366131782532
Trial summary
Hyperparameters:
num_layers: 4
units_0: 768
dropout: True
lr: 0.00020945278125419103
units_1: 32
units_2: 32
units_3: 32
Score: 0.5037332475185394
Trial summary
Hyperparameters:
num_layers: 5
units_0: 896
dropout: True
lr: 0.003762672112568635
units_1: 768
units_2: 448
units_3: 992
units_4: 32
Score: 0.5941602736711502


In [65]:
# Only the top-ranked model is used below, so request a single model instead
# of rebuilding three and discarding two.
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

Model: "Model_DNN_Pythia_Default"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 288)               2016      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 289       
Total params: 2,305
Trainable params: 2,305
Non-trainable params: 0
_________________________________________________________________


In [66]:
# Score the held-out test rows with the best tuned model.
prediction_test = best_model.predict(np.asarray(test_data[features]))
# Rescale so the largest score equals 1. np.max reduces the whole array in one
# vectorised call; the builtin max() walked the 2-D prediction array row by
# row in Python.
discriminator_test = prediction_test / np.max(prediction_test)

In [63]:
def compute_performance(targets, scores):
    """Summarise classifier performance from true labels and scores.

    Args:
        targets: true binary labels, as accepted by ``metrics.roc_curve``.
        scores: classifier scores for the same rows.

    Returns:
        A tuple ``(summary, fpr, tpr)`` where ``summary`` is a dict of
        one-element lists (ready for ``pd.DataFrame``) with keys
          * "AUC"     -- area under the ROC curve,
          * "max_sig" -- maximum of TPR / sqrt(FPR) over ROC points with FPR != 0,
          * "r05"     -- 1 / FPR at the first ROC point with TPR >= 0.5
                         (``inf`` when that FPR is 0),
        and ``fpr`` / ``tpr`` are the arrays returned by ``metrics.roc_curve``.

    Raises:
        IndexError: if no ROC point reaches TPR >= 0.5.
    """
    fpr, tpr, _ = metrics.roc_curve(targets, scores)

    # Skip FPR == 0 points so the ratio below never divides by zero.
    nonzero_fpr = fpr != 0
    max_sig = np.max(tpr[nonzero_fpr] / np.sqrt(fpr[nonzero_fpr]))

    # First ROC point at which at least half of the positives are accepted.
    half_index = np.flatnonzero(tpr >= 0.5)[0]
    fpr_at_half = fpr[half_index]
    # Make the zero-FPR case explicit instead of relying on a numpy
    # divide-by-zero warning.
    r05 = 1. / fpr_at_half if fpr_at_half > 0 else np.inf

    summary = {
        "AUC": [metrics.roc_auc_score(targets, scores)],
        "max_sig": [max_sig],
        "r05": [r05],
    }
    return summary, fpr, tpr


Performance_Frame, FalsePositiveFull, TruePositiveFull = compute_performance(
    test_data["target"], discriminator_test
)

dataframe = pd.DataFrame(Performance_Frame)
dataframe

Unnamed: 0,AUC,max_sig,r05
0,0.831514,1.924599,12.696443


In [33]:
# Round-trip the best model through an HDF5 file and show the architecture of
# the reloaded copy. compile=False: the reloaded model is not compiled.
saved_model_path = "./Tuning.h5"
best_model.save(saved_model_path)
model = load_model(saved_model_path, compile =False)
model.summary()

Model: "Model_DNN_Pythia_Default"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 288)               2016      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 289       
Total params: 2,305
Trainable params: 2,305
Non-trainable params: 0
_________________________________________________________________


In [34]:
# Score the test rows with the reloaded model and rescale so the largest
# score equals 1 (np.max is one vectorised reduction; the builtin max()
# walked the 2-D prediction array row by row in Python).
prediction_test = model.predict(np.asarray(test_data[features]))
discriminator_test = prediction_test / np.max(prediction_test)

FalsePositiveFull, TruePositiveFull, _ = metrics.roc_curve(test_data["target"], discriminator_test)

# Skip FPR == 0 points so TPR / sqrt(FPR) never divides by zero.
nonzero_fpr = FalsePositiveFull != 0
# First ROC point at which TPR reaches 0.5.
half_index = np.flatnonzero(TruePositiveFull >= 0.5)[0]
fpr_at_half = FalsePositiveFull[half_index]

Performance_Frame = {
    "AUC": [metrics.roc_auc_score(test_data["target"], discriminator_test)],
    "max_sig": [np.max(TruePositiveFull[nonzero_fpr] / np.sqrt(FalsePositiveFull[nonzero_fpr]))],
    # Explicit inf when FPR is 0 at the working point, instead of relying on
    # a numpy divide-by-zero warning.
    "r05": [1. / fpr_at_half if fpr_at_half > 0 else np.inf],
}

dataframe = pd.DataFrame(Performance_Frame)
dataframe

Unnamed: 0,AUC,max_sig,r05
0,0.831514,1.924599,12.696443
