In [12]:
import subprocess
from pathlib import Path

import h5py
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

BATCH_SIZE = 8192
LATENT_DIM = 32
VALID_FRAC = 0.25
SEED = 101588
NUM_TRAIN_SAMPLES = 5000000

GEN_UPDATES = 8
D_LR = 0.0003
G_LR = 0.0008
FPR_THRESH = 1e-5

BACKGROUND_FNAME = Path("background.h5")
SIGNAL_FNAMES = {
    "A-4_leptons": "https://zenodo.org/record/7152590/files/Ato4l_lepFilter_13TeV_filtered.h5?download=1",
    "leptoquarks-b_tau": "https://zenodo.org/record/7152599/files/leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5?download=1",
    "h_0-tau_tau": "https://zenodo.org/record/7152614/files/hToTauTau_13TeV_PU20_filtered.h5?download=1",
    "h_plus-tau_nu": "https://zenodo.org/record/7152617/files/hChToTauNu_13TeV_PU20_filtered.h5?download=1"
}

In [13]:
def download_dataset(fname, url):
    subprocess.run(f"wget -O {fname} {url}", shell=True)
    
if not BACKGROUND_FNAME.exists():
    download_dataset(
        str(BACKGROUND_FNAME),
        "https://zenodo.org/record/5046428/files/background_for_training.h5?download=1"
    )

for signal, url in SIGNAL_FNAMES.items():
    fname = signal + ".h5"
    if not Path(fname).exists():
        download_dataset(fname, url)

In [14]:
def load_dataset(dataset, N):
    X = dataset[:N]
    X, y = np.split(X, [3], axis=-1)
    return X, y[:, :, 0]


with h5py.File(BACKGROUND_FNAME, "r") as f:
    print("Features:", ", ".join([i.decode() for i in f["Particles_Names"][:]]))
    print("Event types:", ", ".join([i.decode() for i in f["Particles_Classes"][:]]))
    print("Total background events:", len(f["Particles"]))
    print(f"Loading {NUM_TRAIN_SAMPLES} events")
    X, masks = load_dataset(f["Particles"], NUM_TRAIN_SAMPLES)

_, num_events, num_features = X.shape
FEATURE_DIM = num_events * num_features
X = X.reshape(-1, FEATURE_DIM)

Features: Pt, Eta, Phi, Class
Event types: MET_class_1, Four_Ele_class_2, Four_Mu_class_3, Ten_Jet_class_4
Total background events: 13451915
Loading 5000000 events


In [15]:
(
    train_bg_events,
    valid_bg_events,
    train_bg_masks,
    valid_bg_masks
) = train_test_split(X, masks, test_size=VALID_FRAC, random_state=SEED)

In [16]:
scaler = StandardScaler()
scaler.fit(train_bg_events)

def preprocess(X, mask):
    X = scaler.transform(X)
    X = X.reshape(-1, num_events, num_features)
    X[mask == 0] *= 0
    X = X.reshape(-1, FEATURE_DIM)
    return X

train_bg_events = preprocess(train_bg_events, train_bg_masks)
valid_bg_events = preprocess(valid_bg_events, valid_bg_masks)

In [17]:
valid_signal_events = []
valid_signal_masks = []

for signal in SIGNAL_FNAMES:
    with h5py.File(signal + ".h5", "r") as f:
        dataset = f["Particles"]
        n = int(0.25 * len(dataset))
        print(f"Loading {n} events from signal {signal}")
        events, masks = load_dataset(dataset, n)
    events = events.reshape(-1, FEATURE_DIM)
    events = preprocess(events, masks)
    valid_signal_events.append(events)
    valid_signal_masks.append(masks)

valid_signal_events = np.concatenate(valid_signal_events)
valid_signal_masks = np.concatenate(valid_signal_masks)

valid_y = np.concatenate([
    np.zeros((len(valid_bg_events), 1)),
    np.ones((len(valid_signal_events), 1))
])
valid_events = np.concatenate([valid_bg_events, valid_signal_events])
valid_masks = np.concatenate([valid_bg_masks, valid_signal_masks])

idx = np.random.permutation(len(valid_y))
valid_y = valid_y[idx]
valid_events = valid_events[idx]
valid_masks = valid_masks[idx]
valid_X = (valid_events, valid_masks)

Loading 13992 events from signal A-4_leptons
Loading 85136 events from signal leptoquarks-b_tau
Loading 172820 events from signal h_0-tau_tau
Loading 190068 events from signal h_plus-tau_nu


In [18]:
generator_input = tf.keras.Input((LATENT_DIM,))
x = tf.keras.layers.Dense(128, activation="relu")(generator_input)
x = tf.keras.layers.Dense(256, activation="relu")(x)
generator_output = tf.keras.layers.Dense(FEATURE_DIM, activation="linear")(x)
generator_mask = tf.keras.layers.Dense(num_events, activation="sigmoid")(x)
generator = tf.keras.Model(
    inputs=generator_input,
    outputs=[generator_output, generator_mask]
)

discriminator_input = tf.keras.Input((FEATURE_DIM,))
disc_x = tf.keras.layers.Dense(256, activation="relu")(discriminator_input)
disc_x = tf.keras.layers.Dense(128, activation="relu")(disc_x)
disc_x = tf.keras.layers.Dense(64, activation="relu")(disc_x)

discriminator_mask = tf.keras.Input((num_events,))
disc_mask = tf.keras.layers.Dense(256, activation="relu")(discriminator_mask)
disc_mask = tf.keras.layers.Dense(128, activation="relu")(disc_mask)
disc_mask = tf.keras.layers.Dense(64, activation="relu")(disc_mask)

disc_x = tf.keras.layers.Concatenate()([disc_x, disc_mask])
disc_x = tf.keras.layers.Dense(256, activation="relu")(disc_x)
disc_x = tf.keras.layers.Dense(512, activation="relu")(disc_x)
disc_x = tf.keras.layers.Dense(1, activation="linear")(disc_x)
discriminator = tf.keras.Model(
    inputs=[discriminator_input, discriminator_mask],
    outputs=disc_x
)

In [19]:
class TPR(tf.keras.metrics.Metric):
    def __init__(self, k, **kwargs):
        self.k = k
        super().__init__(**kwargs)

    def reset_state(self):
        self.background_preds = tf.convert_to_tensor(())
        self.signal_preds = tf.convert_to_tensor(())

    def update_state(self, y_true, y_pred, sample_weight=None):
        background = y_pred[y_true == 0]
        self.background_preds = tf.concat([self.background_preds, background], axis=0)

        signal = y_pred[y_true == 1]
        self.signal_preds = tf.concat([self.signal_preds, signal], axis=0)

    def result(self):
        k = tf.shape(self.background_preds)[0] - self.k
        threshold = tf.sort(self.background_preds)[k]
        mask = self.signal_preds > threshold
        mask = tf.cast(mask, tf.int64)
        tpr = tf.math.reduce_mean(mask)
        return tpr

threshold_k = int(FPR_THRESH * len(valid_bg_events))

In [30]:
@tf.function
def fudge_mask(mask, noisy=True):
    mask = tf.where(mask == 0, -10., 10.)
    if noisy:
        mask = mask + tf.random.normal(shape=(len(mask), num_events))
    return tf.sigmoid(mask)


class GAN(tf.keras.Model):
    def __init__(self, discriminator, generator, latent_dim, gen_updates):
        super().__init__()
        self.discriminator = discriminator
        self.generator = generator
        self.latent_dim = latent_dim
        self.gen_updates = gen_updates

        self.d_loss_tracker = tf.keras.metrics.Mean(name="d_loss")
        self.g_loss_tracker = tf.keras.metrics.Mean(name="g_loss")
        self.tpr = TPR(threshold_k)

    def compile(self, d_optimizer, g_optimizer, loss_fn):
        super().compile()
        self.d_optimizer = d_optimizer
        self.g_optimizer = g_optimizer
        self.loss_fn = loss_fn

    def step_generator(self, batch_size):
        # Sample random points in the latent space
        random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim))

        # Assemble labels that say "all real images"
        misleading_labels = tf.zeros((batch_size, 1))

        # Train the generator (note that we should *not* update the weights
        # of the discriminator)!
        with tf.GradientTape() as tape:
            predictions = discriminator(generator(random_latent_vectors))
            g_loss = loss_fn(misleading_labels, predictions)
        grads = tape.gradient(g_loss, generator.trainable_weights)
        g_optimizer.apply_gradients(zip(grads, generator.trainable_weights))
        return g_loss

    def train_step(self, X):
        real_events, real_mask = X
        batch_size = tf.shape(real_events)[0]

        # train the generator for multiple steps
        # in between a single step of the discriminator
        g_loss = 0
        for i in range(self.gen_updates):
            g_loss += self.step_generator(batch_size)
        g_loss /= self.gen_updates

        # Sample random points in the latent space
        random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim))

        # Decode them to fake events
        generated_events, generated_mask = generator(random_latent_vectors)

        # Combine them with real events
        combined_events = tf.concat([generated_events, real_events], axis=0)
        combined_masks = tf.concat([generated_mask, real_mask], axis=0)

        # Assemble labels discriminating real from fake events
        labels = tf.concat(
            [tf.ones((batch_size, 1)), tf.zeros((batch_size, 1))], axis=0
        )

        # Add random noise to the labels - important trick!
        labels += 0.05 * tf.random.uniform((2 * batch_size, 1))

        # Train the discriminator
        with tf.GradientTape() as tape:
            predictions = discriminator([combined_events, combined_masks])
            d_loss = loss_fn(labels, predictions)
        grads = tape.gradient(d_loss, discriminator.trainable_weights)
        d_optimizer.apply_gradients(zip(grads, discriminator.trainable_weights))

        self.d_loss_tracker.update_state(d_loss)
        self.g_loss_tracker.update_state(g_loss)
        return {
            "d_loss": self.d_loss_tracker.result(),
            "g_loss": self.g_loss_tracker.result(),
        }

    def test_step(self, data):
        # Unpack the data
        x, y = data
        y = fudge_mask(y, noisy=False)

        # Compute predictions
        y_pred = self.discriminator(x, training=False)

        # Updates the metrics tracking the loss
        self.tpr.update_state(y, y_pred)
        return {"tpr": self.tpr.result()}

In [31]:
d_optimizer = tf.keras.optimizers.Adam(learning_rate=D_LR)
g_optimizer = tf.keras.optimizers.Adam(learning_rate=G_LR)
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)

gan = GAN(discriminator, generator, LATENT_DIM, GEN_UPDATES)
gan.compile(d_optimizer, g_optimizer, loss_fn)

In [22]:
events = tf.data.Dataset.from_tensor_slices(train_bg_events.astype("float32"))
masks = tf.data.Dataset.from_tensor_slices(train_bg_masks.astype("float32"))
dataset = tf.data.Dataset.zip((events, masks))
dataset = dataset.shuffle(buffer_size=1024).batch(BATCH_SIZE).map(fudge_mask)
history = gan.fit(
    dataset,
    epochs=100,
    validation_data=(valid_X, valid_y),
    validation_batch_size=4 * BATCH_SIZE
)

Epoch 1/100

2023-07-13 14:10:24.410789: W tensorflow/core/framework/op_kernel.cc:1828] OP_REQUIRES failed at strided_slice_op.cc:117 : INVALID_ARGUMENT: slice index -12 of dimension 0 out of bounds.


InvalidArgumentError: in user code:

    File "/local/alec.gunny/ipykernel_2153706/1928866576.py", line 19, in result  *
        threshold = tf.sort(self.background_preds)[k]

    InvalidArgumentError: {{function_node __wrapped__StridedSlice_device_/job:localhost/replica:0/task:0/device:CPU:0}} slice index -12 of dimension 0 out of bounds. [Op:StridedSlice] name: strided_slice/


In [32]:
gan.evaluate(valid_X, valid_y, batch_size=8192)

2023-07-13 14:12:26.198943: W tensorflow/core/framework/op_kernel.cc:1828] OP_REQUIRES failed at strided_slice_op.cc:117 : INVALID_ARGUMENT: slice index -12 of dimension 0 out of bounds.


InvalidArgumentError: Graph execution error:

Detected at node 'strided_slice_1' defined at (most recent call last):
    File "/home/alec.gunny/miniconda3/lib/python3.9/runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/home/alec.gunny/miniconda3/lib/python3.9/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 728, in start
      self.io_loop.start()
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 195, in start
      self.asyncio_loop.run_forever()
    File "/home/alec.gunny/miniconda3/lib/python3.9/asyncio/base_events.py", line 596, in run_forever
      self._run_once()
    File "/home/alec.gunny/miniconda3/lib/python3.9/asyncio/base_events.py", line 1890, in _run_once
      handle._run()
    File "/home/alec.gunny/miniconda3/lib/python3.9/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 516, in dispatch_queue
      await self.process_one()
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 505, in process_one
      await dispatch(*args)
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 412, in dispatch_shell
      await result
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 740, in execute_request
      reply_content = await reply_content
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3009, in run_cell
      result = self._run_cell(
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3064, in _run_cell
      result = runner(coro)
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3269, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3448, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/local/alec.gunny/ipykernel_2153706/1120452504.py", line 1, in <module>
      gan.evaluate(valid_X, valid_y, batch_size=8192)
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/keras/src/engine/training.py", line 2200, in evaluate
      logs = test_function_runner.run_step(
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/keras/src/engine/training.py", line 4000, in run_step
      tmp_logs = self._function(dataset_or_iterator)
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/keras/src/engine/training.py", line 1972, in test_function
      return step_function(self, iterator)
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/keras/src/engine/training.py", line 1956, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/keras/src/engine/training.py", line 1944, in run_step
      outputs = model.test_step(data)
    File "/local/alec.gunny/ipykernel_2153706/4190966585.py", line 96, in test_step
      return {"tpr": self.tpr.result()}
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/keras/src/utils/metrics_utils.py", line 137, in decorated
      raw_result = result_fn(*args)
    File "/home/alec.gunny/miniconda3/envs/hackathon-F1z9dWsj-py3.9/lib/python3.9/site-packages/keras/src/metrics/base_metric.py", line 159, in result_fn
      return ag_result(*args, **kwargs)
    File "/local/alec.gunny/ipykernel_2153706/1928866576.py", line 19, in result
      threshold = tf.sort(self.background_preds)[k]
Node: 'strided_slice_1'
slice index -12 of dimension 0 out of bounds.
	 [[{{node strided_slice_1}}]] [Op:__inference_test_function_27418]