In [1]:
#!/usr/bin/env python
# encoding: utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
# Install TensorFlow
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Flatten , Convolution2D, MaxPooling2D , Lambda, Conv2D, Activation,Concatenate
from tensorflow.keras.layers import ActivityRegularization
from tensorflow.keras.optimizers import Adam , SGD , Adagrad
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, CSVLogger, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical

from tensorflow.keras import regularizers , initializers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import NumpyArrayIterator



# Restrict TensorFlow to the first physical GPU and cap the memory it may
# allocate there (memory_limit=1000) by exposing it as one virtual device.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1000)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        # (e.g. this cell was re-run on a live kernel); report and carry on.
        print(e)
else:
    # Without this guard `gpus[0]` raises IndexError, which the
    # RuntimeError handler above would not catch.
    print("No GPU detected; TensorFlow will run on CPU.")



from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
# from xgboost import XGBClassifier
import tensorflow.keras.backend as K
from sklearn import metrics

# !pip3 install keras-tuner --upgrade
# !pip3 install autokeras
import kerastuner as kt
import autokeras as ak

#Plot's Making  Packages
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, MultipleLocator, AutoMinorLocator
from matplotlib.colors import ListedColormap, LinearSegmentedColormap, BoundaryNorm
from matplotlib.collections import LineCollection
from matplotlib import cm
from matplotlib import font_manager


# Import local libraries
import numpy as np
import h5py
import time
import pandas as pd
import importlib
from scipy import interpolate
import os
from tqdm import tqdm 

import logging

# Re-import `logging` before configuring it: basicConfig() is a no-op when the
# root logger already has handlers, so the reload presumably exists to get a
# clean root logger inside the notebook kernel.
importlib.reload(logging)
logging.basicConfig(level = logging.INFO)

# NOTE(review): pandas is imported above and may already have imported
# numexpr; if numexpr reads these variables at import time they are set too
# late to take effect -- confirm, and move before the imports if needed.
os.environ['NUMEXPR_MAX_THREADS'] = '64'
os.environ['NUMEXPR_NUM_THREADS'] = '64'

# Report library versions and the devices TensorFlow can see.
print("Tensorflow Version is {}".format(tf.__version__))
print("Keras Version is {}".format(tf.keras.__version__))
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
# A bare `tf.device('/device:XLA_GPU:0')` call used to follow here. It only
# builds a context manager that was never entered (`with tf.device(...):`), so
# it had no effect on device placement and has been removed.
!nvidia-smi

%config InlineBackend. figure_format = 'retina'

1 Physical GPUs, 1 Logical GPU
Tensorflow Version is 2.4.1
Keras Version is 2.4.0
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 16582985004382457586
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 1048576000
locality {
  bus_id: 2
  numa_node: 1
  links {
  }
}
incarnation: 4340440384154876200
physical_device_desc: "device: 0, name: A100-SXM-80GB, pci bus id: 0000:4c:00.0, compute capability: 8.0"
]
Tue Feb 22 15:05:06 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM-80GB       On 

In [2]:
# Load the herwig_ang CNN (model index 0, 300-500 tag in the directory name)
# and print its architecture.
filepath = "./KFold_CNN/{0}_KFold/CNN_{0}_Models_{1}{2}/{0}_CNN_{3}.h5".format(
    "herwig_ang", int(300), int(500), 0)

# Fail fast when the file is missing. Previously the load was skipped silently
# and `model_CNN.summary()` then raised NameError on a fresh kernel, or
# summarised a stale model left over from an earlier run.
if not os.path.exists(filepath):
    raise FileNotFoundError("CNN model file not found: {}".format(filepath))

model_CNN = load_model(filepath)
model_CNN.summary()

Model: "Model_CNN_herwig_ang_0"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Conv2D_1 (Conv2D)            (None, 96, 36, 36)        7296      
_________________________________________________________________
jet_MaxPooling_1 (MaxPooling (None, 48, 18, 36)        0         
_________________________________________________________________
Conv2D_2 (Conv2D)            (None, 32, 14, 32)        38432     
_________________________________________________________________
jet_MaxPooling_2 (MaxPooling (None, 32, 7, 16)         0         
_________________________________________________________________
jet_flatten (Flatten)        (None, 3584)              0         
_________________________________________________________________
jet_dense_1 (Dense)          (None, 350)               1254750   
_________________________________________________________________
jet_dense_2 (Dense)          (None, 400)    

In [7]:
def Loading_Data(data_source, datadict, start=0, stop=20000):
    """Load and standardise the jet images listed in ``data_source``.

    Rows ``start`` (inclusive) to ``stop`` (exclusive) are read, clamped to
    the length of the table. Each row's ``"JetImage"`` entry is a file name
    relative to ``savepath + "Image_Directory/"``; the ``"jet_image"`` array
    stored in that file is loaded, NaNs are replaced via ``np.nan_to_num``,
    and the image is standardised with the module-level ``Norm_dict``::

        (image - Norm_dict[datadict][0]) / (sqrt(Norm_dict[datadict][1]) + 1e-5)

    Parameters
    ----------
    data_source : pandas.DataFrame
        Table with at least the columns ``"JetImage"`` and ``"Y"``.
    datadict : str
        Key into ``Norm_dict`` selecting the normalisation constants.
    start : int, optional
        Positional index of the first row to load.
    stop : int or None, optional
        Positional index one past the last row to load; ``None`` means
        "up to the end of ``data_source``".

    Returns
    -------
    tuple of numpy.ndarray
        ``(images, targets)``: the stacked standardised images and the
        matching ``"Y"`` values, in row order.

    Notes
    -----
    Relies on the globals ``savepath`` and ``Norm_dict`` being defined
    before the call.
    """
    n_rows = len(data_source)
    # Clamp the upper bound so exactly `stop - start` rows are read. The old
    # loop tested `k == stop` only after appending row `stop`, returning one
    # row too many whenever `stop < len(data_source)`.
    last = n_rows if stop is None else min(stop, n_rows)

    # Loop-invariant pieces, computed once.
    image_dir = savepath + "Image_Directory/"
    mean = Norm_dict[datadict][0]
    scale = np.sqrt(Norm_dict[datadict][1]) + 1e-5  # epsilon avoids division by zero

    x_jet, target = [], []

    # Short pause before the progress bar starts; presumably lets pending log
    # output flush so it does not interleave with the bar.
    time.sleep(0.5)
    for k in tqdm(range(start, last)):
        x_jet_tmp = np.load(image_dir + data_source["JetImage"].iloc[k])["jet_image"]
        x_jet_tmp = np.nan_to_num(x_jet_tmp)

        target.append(data_source["Y"].iloc[k])
        x_jet.append(np.divide(x_jet_tmp - mean, scale))

    return np.asarray(x_jet), np.asarray(target)

In [8]:
%%time
# Apply every generator's CNN (model index 0) to every generator's training
# sample and collect the scores in `pd_datafram_save`, one DataFrame per
# sample with one "<generator>_cnn_pre" column per model.
HOMEPATH = "/dicos_ui_home/alanchung/Universality_Boosetd_Higgs/"
Data_High_Level_Features_path =  HOMEPATH + "Data_High_Level_Features/"
savepath = HOMEPATH + "Data_ML/"

# Selection applied to the training tables (half-open intervals on the
# "PTJ_0" and "MJ_0" columns).
PT_MIN, PT_MAX = 300, 500
MJ_MIN, MJ_MAX = 110, 160

# Per generator: [H table, QCD table]; placeholders are filled in below.
data_dict ={
            "herwig_ang" : [0,0],
            "pythia_def" : [0,0],
            "pythia_vin" : [0,0],
            "pythia_dip" : [0,0],
#             "sherpa_def" : [0,0],
          }

# Per generator: [mean, variance] used by Loading_Data to standardise images.
Norm_dict ={
            "herwig_ang" : [0,0],
            "pythia_def" : [0,0],
            "pythia_vin" : [0,0],
            "pythia_dip" : [0,0],
#             "sherpa_def" : [0,0],
          }

# Per generator: the selected training table; placeholders are filled in below.
data_train = {
            "herwig_ang_train" : 0,
            "pythia_def_train" : 0,
            "pythia_vin_train" : 0,
            "pythia_dip_train" : 0,
#             "sherpa_def_train" : 0
            }

# The loop below pairs data_train and data_dict by position, so make sure the
# two key lists really line up instead of silently mispairing or truncating.
if list(data_train) != [name + "_train" for name in data_dict]:
    raise ValueError("data_train keys must be '<generator>_train' in the same order as data_dict")

# One trained CNN per generator, loaded in data_dict order.
CNN_Model_A1 = {
    name: load_model("./KFold_CNN/{0}_KFold/CNN_{0}_Models_300500/{0}_CNN_0.h5".format(name))
    for name in data_dict
}

# Output tables, one per training sample.
pd_datafram_save = {traindata: pd.DataFrame() for traindata in data_train}


# Per-generator index tables for the H and QCD samples.
for element in data_dict:
    data_dict[element][0] = pd.read_csv(savepath + str(element) + "_H_dict.csv")
    data_dict[element][1] = pd.read_csv(savepath + str(element) + "_QCD_dict.csv")

# Normalisation constants built from the stored per-class averages/variances.
for element in Norm_dict:
    average_H = np.load(savepath + "average" + "_" + str(element) + "_H.npy")
    variance_H = np.load(savepath + "variance" + "_" + str(element) + "_H.npy")
    average_QCD = np.load(savepath + "average" + "_" + str(element) + "_QCD.npy")
    variance_QCD = np.load(savepath + "variance" + "_" + str(element) + "_QCD.npy")
    length_H = len(data_dict[element][0])
    length_QCD = len(data_dict[element][1])

    # Mean of the combined sample: class means weighted by class size.
    Norm_dict[element][0] = (average_H*length_H + average_QCD*length_QCD)/(length_H+length_QCD)
    # NOTE(review): the combined variance is taken as the plain sum of the two
    # class variances. Left unchanged because the saved models were presumably
    # trained with this exact normalisation -- confirm before altering.
    Norm_dict[element][1] =  variance_H + variance_QCD

for traindata, datadict in zip(data_train, data_dict):

    train_data_path = HOMEPATH + "Notebook/KFold_CNN/" + str(traindata) + ".csv"
    if not os.path.exists(train_data_path):
        raise ValueError("Please check training data path !! Not found: {}".format(train_data_path))
    train_table = pd.read_csv(train_data_path)

    # Keep only rows inside the PTJ_0 / MJ_0 windows.
    in_window = (
        (train_table["PTJ_0"] >= PT_MIN) & (train_table["PTJ_0"] < PT_MAX)
        & (train_table["MJ_0"] >= MJ_MIN) & (train_table["MJ_0"] < MJ_MAX)
    )
    train_table = train_table[in_window]

    H_tmp = train_table[train_table["target"] == 1]
    QCD_tmp = train_table[train_table["target"] == 0]

    # The "index" column holds row positions in the per-generator tables.
    H_dict = data_dict[datadict][0].iloc[H_tmp["index"].values]
    QCD_dict = data_dict[datadict][1].iloc[QCD_tmp["index"].values]

    data_train[traindata] = pd.concat([H_dict, QCD_dict], ignore_index=True, axis=0,join='inner')


    logging.info("START===========================================START")
    logging.info("\r")
    logging.info("All Files are loaded!")
    logging.info("pt min: {} , pt max: {}".format(PT_MIN, PT_MAX))
    logging.info("\r")
    logging.info("H jet : QCD jet = 1 : 1")
    logging.info("{}, # of H jet: {}".format(traindata, len(H_dict)))
    logging.info("{}, # of QCD jet: {}".format(traindata, len(QCD_dict)))
    train = len(data_train[traindata])
    logging.info("{:^8}{:^15}".format("Train #",train))


    x_jet, target = Loading_Data(data_train[traindata], datadict, start=0, stop= len(data_train[traindata]))

    logging.info("Test Data: {}".format(traindata))
    time.sleep(0.5)

    for model, cnn in CNN_Model_A1.items():

        prediction = cnn.predict(x_jet)
        pd_datafram_save[traindata][model+"_cnn_pre"] = prediction.reshape(len(prediction),)
        # Logged per model: this line used to sit after the loop, so only the
        # last model's name was ever reported.
        logging.info("{} CNN models apply on {} is finished!!".format(model,traindata))

    logging.info("END===========================================END")
    logging.info("\n")


logging.info("\n")
logging.info("\n")




INFO:root:
INFO:root:All Files are loaded!
INFO:root:pt min: 300 , pt max: 500
INFO:root:
INFO:root:H jet : QCD jet = 1 : 1
INFO:root:herwig_ang_train, # of H jet: 153728
INFO:root:herwig_ang_train, # of QCD jet: 153728
INFO:root:Train #     307456     
100%|██████████| 307456/307456 [06:24<00:00, 799.71it/s]
INFO:root:Test Data: herwig_ang_train
INFO:root:pythia_dip CNN models apply on herwig_ang_train is finished!!
INFO:root:

INFO:root:
INFO:root:All Files are loaded!
INFO:root:pt min: 300 , pt max: 500
INFO:root:
INFO:root:H jet : QCD jet = 1 : 1
INFO:root:pythia_def_train, # of H jet: 153728
INFO:root:pythia_def_train, # of QCD jet: 153728
INFO:root:Train #     307456     
100%|██████████| 307456/307456 [06:07<00:00, 836.67it/s]
INFO:root:Test Data: pythia_def_train
INFO:root:pythia_dip CNN models apply on pythia_def_train is finished!!
INFO:root:

INFO:root:
INFO:root:All Files are loaded!
INFO:root:pt min: 300 , pt max: 500
INFO:root:
INFO:root:H jet : QCD jet = 1 : 1
INFO:root:

CPU times: user 18min 1s, sys: 6min 37s, total: 24min 39s
Wall time: 29min 20s


In [9]:
# Write each sample's table of CNN scores to "./<sample>_CNN.csv" in the
# current working directory, without the DataFrame index column.
for element, score_table in pd_datafram_save.items():
    csv_path = "./"+element+"_CNN.csv"
    score_table.to_csv(csv_path, index = 0)