In [1]:
%matplotlib inline
import dgl
import glob
import pprint
import numpy as np
import awkward as ak
import networkx as nx
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from os import path
from tqdm import tqdm
from pathlib import Path
from trainresults import TrainResults
from train_eval_func import train, evaluate
from copy import deepcopy
from dgl.data import DGLDataset
from dgl.dataloading import GraphDataLoader
from TauGraphDatasetInfo import TauGraphDatasetInfo
from TauGraphDataset import TauGraphDataset, GetNodeFeatureVectors, GetEdgeFeatureVectors
from TauGraphDataset import GetNeighborNodes, GetEdgeList, GetEdgeFeatureVectorsFromSourceNode, Graph2FlatZeropaddedList

plt.rcParams.update({'font.size': 20})
plt.rcParams['text.usetex'] = True
lw = 2
xyLabelFontSize = 20
xLabelPad = 10
yLabelPad = 15
pp = pprint.PrettyPrinter()

Using backend: pytorch


In [2]:
def getDatasetNames(datasetDir):
    files = glob.glob(datasetDir + '/*.json', recursive=True)
    files.sort()
    datasetDirectories = [path.dirname(file) for file in files]
    datasetnames = [path.normpath(dir).split(path.sep)[-1] for dir in datasetDirectories]
    return datasetDirectories, datasetnames

In [3]:
datasetDir = '/ceph/aissac/ntuple_for_graphs/prod_2018_v2_processed_v5_THESIS/trimmed_200000_and_cut_puppiWeightNoLep_greater_0_and_deltaR_smaller_0point5/Graphs_DYJetsToLL_M-50_genuineTaus_and_jets'
datasetDirs, datasetNames = getDatasetNames(datasetDir)
print(datasetDirs)
print(datasetNames)

['/ceph/aissac/ntuple_for_graphs/prod_2018_v2_processed_v5_THESIS/trimmed_200000_and_cut_puppiWeightNoLep_greater_0_and_deltaR_smaller_0point5/Graphs_DYJetsToLL_M-50_genuineTaus_and_jets']
['Graphs_DYJetsToLL_M-50_genuineTaus_and_jets']


In [4]:
dataset = TauGraphDataset(datasetNames[0], datasetDirs[0])
print(dataset)

Done loading data from cached files.
<TauGraphDataset.TauGraphDataset object at 0x7fb3291c5b38>


In [5]:
print(f'name: {datasetNames[0]},\n directory: {datasetDirs[0]}')
graph, label = dataset[0]
print(graph)
print(f'label: {label}')
print(f'graph classes: {dataset.graphClasses}')
print(f'dataset graph count: {dataset.num_graphs}')
print(f'nodeFeatKeys: {dataset.nodeFeatKeys}')
print(f'edgeFeatKeys: {dataset.edgeFeatKeys}')
print(f'graphFeatkeys: {dataset.graphFeatKeys}')
print(f'max node count: {dataset.maxNodeCount}')
print(f'min node count: {dataset.minNodeCount}')

name: Graphs_DYJetsToLL_M-50_genuineTaus_and_jets,
 directory: /ceph/aissac/ntuple_for_graphs/prod_2018_v2_processed_v5_THESIS/trimmed_200000_and_cut_puppiWeightNoLep_greater_0_and_deltaR_smaller_0point5/Graphs_DYJetsToLL_M-50_genuineTaus_and_jets
Graph(num_nodes=25, num_edges=600,
      ndata_schemes={'feat': Scheme(shape=(7,), dtype=torch.float64)}
      edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.float32)})
label: 1
graph classes: ['0', '1']
dataset graph count: 200000
nodeFeatKeys: ['pt', 'eta', 'phi', 'mass', 'charge', 'particleType', 'summand']
edgeFeatKeys: ['deltaEta', 'deltaPhi', 'deltaR']
graphFeatkeys: ['nodeCount', 'tau_byDeepTau2017v2p1VSjet']
max node count: 81
min node count: 2


In [7]:
graphs, labels = dataset[:]
g = graphs[0]
nFeatDim = dataset.dim_nfeats
eFeatDim = dataset.dim_efeats
maxNodeCount = dataset.maxNodeCount
print(g)
print(f'nFeatDim: {nFeatDim}')
print(f'eFeatDim: {eFeatDim}')
print(f'maxNodeCount: {maxNodeCount}')
print()

nodeAndEdgeFeaturePaddedDim = nFeatDim + eFeatDim * (maxNodeCount - 1)
print(f'node + edge features dim per Node (includes zero padding if nodecount<maxnodecount):\n',
      f'nFeatDim + eFeatDim * (maxNodeCount-1) = {nFeatDim} + {eFeatDim} * {maxNodeCount - 1} = ',
      f'{nodeAndEdgeFeaturePaddedDim}')
print(f'The (maxNodeCount-1) comes from fully connected graphs without self-loops')
print()
print(f'only node features dim per Node: nFeatDim={nFeatDim}')
print()

nodeAndEdgeFeaturePaddedDimInputSize = nodeAndEdgeFeaturePaddedDim * maxNodeCount
nodeFeaturePaddedDimInputSize = nFeatDim * maxNodeCount
print(f'node + edge features with zero padding to fill until maxNodeCount leads to inputsize: {nodeAndEdgeFeaturePaddedDimInputSize}')
useEdgeFeat = True
temp = np.array(Graph2FlatZeropaddedList(g, nFeatDim, eFeatDim, maxNodeCount, useEdgeFeat), dtype=np.float32)
print(f'check example: node + edge features list size: {len(temp)}')
print()
print(f'only node features with zero padding to fill until maxNodeCount leads to inputsize: {nodeFeaturePaddedDimInputSize}')
useEdgeFeat = False
temp = np.array(Graph2FlatZeropaddedList(g, nFeatDim, eFeatDim, maxNodeCount, useEdgeFeat), dtype=np.float32)
print(f'check example: only node features list size: {len(temp)}')

Graph(num_nodes=25, num_edges=600,
      ndata_schemes={'feat': Scheme(shape=(7,), dtype=torch.float64)}
      edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.float32)})
nFeatDim: 7
eFeatDim: 3
maxNodeCount: 81

node + edge features dim per Node (includes zero padding if nodecount<maxnodecount):
 nFeatDim + eFeatDim * (maxNodeCount-1) = 7 + 3 * 80 =  247
The (maxNodeCount-1) comes from fully connected graphs without self-loops

only node features dim per Node: nFeatDim=7

node + edge features with zero padding to fill until maxNodeCount leads to inputsize: 20007
check example: node + edge features list size: 20007

only node features with zero padding to fill until maxNodeCount leads to inputsize: 567
check example: only node features list size: 567


In [8]:
def getInputData(dgldataset, useEdgeFeatures):
    inputs = [] 
    graphs, labels = dgldataset[:]
    maxNodeCount = dgldataset.maxNodeCount
    nFeatDim = dgldataset.dim_nfeats
    eFeatDim = dgldataset.dim_efeats
    
    import time
    start = time.time()
    it = 0
    
    for i in tqdm(range(len(graphs))):
        inputs.append(Graph2FlatZeropaddedList(graphs[i], nFeatDim, eFeatDim, maxNodeCount, useEdgeFeatures))

    # Stack all inputs_ vertically
    print(type(inputs))
    inputs = np.array(inputs, dtype=np.float32)
    print("before vstack - Input shape: ", inputs.shape)
    print(inputs)
    print(type(inputs))
    inputs = np.vstack(inputs)
    print(inputs)
    

    # Stack all labels_ horizontally
    labels = np.hstack(labels)

    print("Input shape: ", inputs.shape)
    print("Labels shape: ", labels.shape)

    labels = tf.keras.utils.to_categorical(labels)
    print(labels.shape)
    print(labels[0])
    end = time.time() - start
    print(f'graphs to flattened zero padded list took {end:.2f} seconds ({end/60:.2f} minutes)')
    return inputs, labels

In [9]:
inputs, labels = getInputData(dataset, True)

100%|██████████| 200000/200000 [14:22<00:00, 231.97it/s]


<class 'list'>
before vstack - Input shape:  (200000, 20007)
[[ 0.6791992   1.238197   -0.6319463  ...  0.          0.
   0.        ]
 [ 2.7011719  -0.5762505   2.2737963  ...  0.          0.
   0.        ]
 [ 4.2695312  -0.781518    0.42511293 ...  0.          0.
   0.        ]
 ...
 [ 0.66308594 -0.8608051   1.7206644  ...  0.          0.
   0.        ]
 [ 1.4853516   1.7232581  -3.10292    ...  0.          0.
   0.        ]
 [ 0.8852539   1.0530717  -2.2728148  ...  0.          0.
   0.        ]]
<class 'numpy.ndarray'>
[[ 0.6791992   1.238197   -0.6319463  ...  0.          0.
   0.        ]
 [ 2.7011719  -0.5762505   2.2737963  ...  0.          0.
   0.        ]
 [ 4.2695312  -0.781518    0.42511293 ...  0.          0.
   0.        ]
 ...
 [ 0.66308594 -0.8608051   1.7206644  ...  0.          0.
   0.        ]
 [ 1.4853516   1.7232581  -3.10292    ...  0.          0.
   0.        ]
 [ 0.8852539   1.0530717  -2.2728148  ...  0.          0.
   0.        ]]
Input shape:  (200000, 2000

In [9]:
def datagenerator(inputs, labels, batchsize):
    while True:
        start = 0
        end = batchsize

        while start  < len(inputs): 
            # load your images from numpy arrays or read from directory
            x = inputs[start:end] 
            y = labels[start:end]
            yield x, y

            start += batchsize
            end += batchsize

In [10]:
from tensorflow import keras

outputFolder = path.join(datasetDir, 'Output_Keras_NodeAndEdgeFeat')
Path(outputFolder).mkdir(parents=True, exist_ok=True)

tf.keras.backend.clear_session()
model = keras.Sequential(name="KerasModel_NodeAndEdgeFeat")
inputDim = (nFeatDim + eFeatDim * (maxNodeCount - 1)) * maxNodeCount
model.add(keras.layers.InputLayer(input_shape=(inputDim,), name="input"))
model.add(keras.layers.Dense(32, activation='relu', name="dense1"))
model.add(keras.layers.Dense(16*16, activation='relu', name="dense2"))
model.add(keras.layers.Dense(16, activation='relu', name="dense3"))
model.add(keras.layers.Dense(2, activation='softmax', name="output"))
model.summary()

lossfunction = keras.losses.CategoricalCrossentropy()
optimizer = keras.optimizers.Adam(learning_rate=0.001)
#earlystopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1, min_delta=0.0005)
modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(filepath=path.join(outputFolder,'keras_nodeAndEdgeFeat_bestmodel.h5'), monitor='val_loss', save_best_only=True, verbose=1)
csvlogger = tf.keras.callbacks.CSVLogger(filename=path.join(outputFolder, 'results_keras_nodeAndEdgeFeat_bestmodel.csv'), separator=',', append=False)
#callbacks = [earlystopping, modelcheckpoint, csvlogger]
callbacks = [modelcheckpoint, csvlogger]

model.summary()
model.compile(optimizer=optimizer, loss=lossfunction, metrics=['accuracy'])

Model: "KerasModel_NodeAndEdgeFeat"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense1 (Dense)               (None, 32)                640256    
_________________________________________________________________
dense2 (Dense)               (None, 256)               8448      
_________________________________________________________________
dense3 (Dense)               (None, 16)                4112      
_________________________________________________________________
output (Dense)               (None, 2)                 34        
Total params: 652,850
Trainable params: 652,850
Non-trainable params: 0
_________________________________________________________________
Model: "KerasModel_NodeAndEdgeFeat"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense1 (Dense)               (None, 32)                640256   

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_testAndVal, y_train, y_testAndVal = train_test_split(inputs, labels, test_size=0.3, shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(X_testAndVal, y_testAndVal, test_size=0.33, shuffle=False)
print(f'train samples: {len(X_train)}')
print(f'validation samples: {len(X_val)}')
print(f'test samples: {len(X_test)}')

train samples: 140000
validation samples: 40200
test samples: 19800


In [12]:
batchsize=1024

import time
start = time.time()

history = model.fit(
    x = datagenerator(X_train, y_train, batchsize=batchsize),
    validation_data = datagenerator(X_val, y_val,batchsize=batchsize),
    steps_per_epoch = len(X_train)//batchsize,
    validation_steps = len(X_val)//batchsize,
    shuffle=False, # at creation from rootfiles -> graphs already shuffled dataset
    epochs = 30, # doesnt matter, since we use early stopping
    callbacks = callbacks
)

end = time.time() - start
print(f'training took {end:.2f} seconds ({end/60:.2f} minutes)')

Epoch 1/30

Epoch 00001: val_loss improved from inf to 0.24803, saving model to /ceph/aissac/ntuple_for_graphs/prod_2018_v2_processed_v5_THESIS/trimmed_200000_and_cut_puppiWeightNoLep_greater_0_and_deltaR_smaller_0point5/Graphs_DYJetsToLL_M-50_genuineTaus_and_jets/Output_Keras_NodeAndEdgeFeat/keras_nodeAndEdgeFeat_bestmodel.h5
Epoch 2/30

Epoch 00002: val_loss improved from 0.24803 to 0.24141, saving model to /ceph/aissac/ntuple_for_graphs/prod_2018_v2_processed_v5_THESIS/trimmed_200000_and_cut_puppiWeightNoLep_greater_0_and_deltaR_smaller_0point5/Graphs_DYJetsToLL_M-50_genuineTaus_and_jets/Output_Keras_NodeAndEdgeFeat/keras_nodeAndEdgeFeat_bestmodel.h5
Epoch 3/30

Epoch 00003: val_loss improved from 0.24141 to 0.23695, saving model to /ceph/aissac/ntuple_for_graphs/prod_2018_v2_processed_v5_THESIS/trimmed_200000_and_cut_puppiWeightNoLep_greater_0_and_deltaR_smaller_0point5/Graphs_DYJetsToLL_M-50_genuineTaus_and_jets/Output_Keras_NodeAndEdgeFeat/keras_nodeAndEdgeFeat_bestmodel.h5
Epoch

In [13]:
# NN output plot
predictions = model.predict(X_test)
print(type(predictions))
#print(predictions)

matplotlib.rcParams.update({'font.size': 16})
lw = 2
xyLabelFontSize = 20
xLabelPad = 10
yLabelPad = 15

def createFigure():
    fig, ax = plt.subplots(figsize=(10,7))
    ax.tick_params(pad=7)
    return fig, ax

# TODO: check which order is actually signal (genuineTau) and which are background (fakeTau)
genuineTau_decisions = predictions[:,0]
fakeTau_decisions = predictions[:,1]

plt.figure(figsize=(9,7))

plt.hist(genuineTau_decisions, label='Genuine Taus', 
        histtype='step', # lineplot that's unfilled
        density=True, # normalize to form a probability density
        linewidth=lw)
plt.hist(fakeTau_decisions, label='Jets', 
        histtype='step', # lineplot that's unfilled
        density=True, linewidth=lw) # normalize to form a probability density
plt.xlabel('Neural Network output') # add x-axis label
plt.ylabel('Arbitrary units') # add y-axis label
plt.legend(loc="upper center") # add legend
plt.savefig(path.join(outputFolder, "NN_output.png"))
plt.clf()

from sklearn.metrics import roc_curve, auc
# most tutorials slice the prediction for whatever reason with [:,1] but why?
# predictions_ = predictions[:, 1]

fpr, tpr, _ = roc_curve(y_test.argmax(axis=1), predictions[:, 1])
roc_auc = auc(fpr, tpr) # area under curve (AUC), ROC = Receiver operating characteristic

fig, ax = createFigure()
ax.plot(fpr, tpr, label=f'ROC (area = {roc_auc:.2f})', linewidth=lw)
ax.plot([0, 1], [0, 1], '--', color='red', label='Luck', linewidth=lw)
ax.set_xlabel('False Positive Rate') 
ax.set_ylabel('True Positive Rate')
ax.legend()
ax.grid()
outputFilePath = path.join(outputFolder, 'ROC.png')
plt.savefig(outputFilePath)
plt.clf()

print("\n")
print(history.history)

# Plot accuracy of NN
fig, ax = createFigure()
ax.plot(history.history['accuracy'], label='Training', linewidth=lw)
ax.plot(history.history['val_accuracy'], label='Validation', linewidth=lw)
ax.set_ylabel('Accuracy', labelpad=xLabelPad, fontsize=xyLabelFontSize)
ax.set_xlabel('Epoch', labelpad=yLabelPad, fontsize=xyLabelFontSize)
ax.legend()
outputFilePath = path.join(outputFolder, 'accuracy.png')
plt.savefig(outputFilePath)
plt.clf()
# Plot loss of NN
fig, ax = createFigure()
ax.plot(history.history['loss'], label="Training",linewidth=lw)
ax.plot(history.history['val_loss'], label="Validation", linewidth=lw)
ax.set_ylabel('Loss', labelpad=xLabelPad, fontsize=xyLabelFontSize)
ax.set_xlabel('Epoch', labelpad=yLabelPad, fontsize=xyLabelFontSize)
ax.legend()
outputFilePath = path.join(outputFolder, 'epochloss.png')
plt.savefig(outputFilePath)
plt.clf()



# evaluate the model
_, train_acc = model.evaluate(X_train, y_train, verbose=1)
_, test_acc = model.evaluate(X_test, y_test, verbose=1)
_, valid_acc = model.evaluate(X_val, y_val, verbose=1)
print(f'Train: {train_acc:.3f}, Valid: {valid_acc:.3f}, Test: {test_acc:.3f}, AUC: {roc_auc:.3f}')

<class 'numpy.ndarray'>


{'loss': [0.27760863304138184, 0.24260340631008148, 0.236345574259758, 0.23141776025295258, 0.22689875960350037, 0.2227974236011505, 0.2188325673341751, 0.21586543321609497, 0.2128785252571106, 0.21016502380371094, 0.20741058886051178, 0.20530423521995544, 0.20333971083164215, 0.20123834908008575, 0.1992451697587967, 0.19728560745716095, 0.19575738906860352, 0.19383575022220612, 0.19230349361896515, 0.1905279904603958, 0.18913604319095612, 0.18766145408153534, 0.18558406829833984, 0.1842328906059265, 0.18309332430362701, 0.18209059536457062, 0.1808345913887024, 0.18014033138751984, 0.1788051873445511, 0.17777903378009796], 'accuracy': [0.8932530879974365, 0.9056671857833862, 0.9076747298240662, 0.9093225002288818, 0.9111069440841675, 0.912704348564148, 0.9143809080123901, 0.9154170751571655, 0.916517972946167, 0.9176045060157776, 0.9190219640731812, 0.9202020764350891, 0.921159029006958, 0.9219721555709839, 0.9226557016372681, 0.9236055016517639, 0.92410922050

<Figure size 648x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>