# Predict Neutrino Direction with an LSTM

Using a Tensorflow LSTM layer using the event time steps to the input to predict the Neutrino Direction azimuth and zenith angle

## Imports

In [1]:
# Standard library imports
import os
import random
import math
import logging
from sys import getsizeof
import sys
from datetime import datetime
sys.path.append('..')

# Third-party library imports
import numpy as np
import pandas as pd
import dask.dataframe as dd
import tensorflow as tf


# Typing imports
from typing import List, Tuple

from scripts.utils import seed_it_all, convert_bytes_to_gmbkb


2023-02-23 19:08:29.290911: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-23 19:08:29.367184: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-23 19:08:29.367197: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-23 19:08:29.796706: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [2]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print('GPU device NOT found')
else:
  print('Found GPU at: {}'.format(device_name))

GPU device NOT found


2023-02-23 19:08:30.372774: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-23 19:08:30.394232: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-23 19:08:30.394359: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-23 19:08:30.394418: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: c

## Variables

In [3]:
# Parameters
MODEL_TYPE='lstm' # Which model to use
IS_TRAINING = True # Whether to train the model
SEED=10

# TIME_LIMIT_HOURS = 1

LSTM_UNITS = 64
EPOCHS=10
STEPS_PER_EPOCH=100
PULSE_AMOUNT = 100 # Amount of pulses to use for features
BATCH_SIZE = 32
LEARNING_RATE = 0.001
FEATURES = [ 'time', 'charge', 'auxiliary', 'x', 'y', 'z'] # Which features to use as the model input

# Directories
DATA_DIR = "../data"
SET = 'train' if IS_TRAINING else 'test'

# logging
LOG_LEVEL = logging.INFO

In [4]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


2023-02-23 19:08:30.649408: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-23 19:08:30.649525: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


## Logging

In [5]:
# Setup logging
logging.basicConfig(filename='artifacts/info.log', level=LOG_LEVEL, format='%(asctime)s %(levelname)s %(message)s')

## Functions

In [6]:
seed_it_all(SEED)
# set the seed for the random number generator
tf.random.set_seed(42)


### For optimization

## Load the dataframes

In [7]:
sensor_dtypes = { 'x': 'float16', 'y': 'float16', 'z': 'float16' }
sensor_geometry_df = pd.read_csv(f'{DATA_DIR}/sensor_geometry.csv', dtype=sensor_dtypes) # type: ignore
sensor_geometry_df.head(1)

Unnamed: 0,sensor_id,x,y,z
0,0,-256.25,-521.0,496.0


In [8]:
convert_bytes_to_gmbkb(getsizeof(sensor_geometry_df))

'70.69 KB'

In [9]:

meta_dtypes = {'batch_id': 'int16', 'event_id': 'Int64', 'first_pulse_index': 'int32', 'last_pulse_index': 'int32', 'azimuth': 'float16', 'zenith': 'float16'}
meta_df = pd.read_parquet(f'{DATA_DIR}/{SET}_meta.parquet').astype(meta_dtypes)
meta_df.head(1)

Unnamed: 0,batch_id,event_id,first_pulse_index,last_pulse_index,azimuth,zenith
0,1,24,0,60,5.03125,2.087891


In [10]:
convert_bytes_to_gmbkb(getsizeof(meta_df))

'2.83 GB'

In [11]:
batch_directory = f'{DATA_DIR}/{SET}'
batch_file_paths = [f'{batch_directory}/{file}' for file in os.listdir(batch_directory) if os.path.isfile(os.path.join(batch_directory, file))]
print('First batch file path 3 Samples:')


First batch file path 3 Samples:


In [12]:
training_batch_file_paths = batch_file_paths[:-1]
validation_batch_file_paths = batch_file_paths[-1:]


In [13]:
sample_batch_df= pd.read_parquet(batch_file_paths[1])
convert_bytes_to_gmbkb(getsizeof(meta_df))

'2.83 GB'

## Build the dataset

In [14]:
def get_event_input_observation(batch: pd.DataFrame, event_id: int, sequence_length: int, sensor_geometry: pd.DataFrame,) -> np.array: 
    """Gets a single event input observation for the model

    Args:
        batch (pd.DataFrame): The batch dataframe
        event_id (int): The event id to find within the batch df
        sequence_length (int): The length of the sequence to use
        sensor_geometry (pd.DataFrame): The sensor geometry dataframe

    Returns:
        np.array: A single event input observation for the model
    """
    # The event dataframe with a list of pulse readings
    event_data = batch[batch['event_id'] == event_id]
    
    merged_df = pd.merge(event_data, sensor_geometry, on='sensor_id', how='left')
    
    # get the first N pulses with N being the sequence length
    sequence = merged_df.head(sequence_length)[FEATURES]
    n_missing = 100 - len(sequence)
    if n_missing > 0:
        df_missing = pd.DataFrame(0, index=np.arange(n_missing), columns=sequence.columns)
        sequence = pd.concat([sequence, df_missing])
        
    return sequence.values

In [15]:
def get_event_data(batch: pd.DataFrame, event_id: int, sequence_length: int, sensor_geometry: pd.DataFrame, meta_data: pd.DataFrame):
    
    input_sequence = get_event_input_observation(batch, event_id, sequence_length, sensor_geometry)
    
    # get the target labels 
    target_labels = meta_data[meta_data['event_id'] == event_id][['azimuth', 'zenith']].values[0] 
    
    # reshape the sequence and target labels to be fed into the model
    return np.reshape(input_sequence, (1, sequence_length, len(FEATURES))), np.reshape(target_labels, (1, 2))

In [16]:
import tensorflow as tf

def data_generator(
    batch_paths: List[str],
    sensor_geometry: pd.DataFrame,
    meta_data: pd.DataFrame,
    sequence_length: int,
    batch_size: int = BATCH_SIZE
):
    """Emits a single event training example to be called by the model.fit_generator() method.

    Args:
        batch_paths (List[str]): A list of paths to the batch files
        sensor_geometry_df (pd.DataFrame): The sensor geometry dataframe
        meta_df (pd.DataFrame): The dataframe containing the meta data
        sequence_length (int): The length of the pulse sequence to use for training

    Yields:
        _type_: _description_
    """
    batch_dtypes = {'event_id': 'int32', 'sensor_id': 'int16',
                    'time': 'int32', 'charge': 'float16', 'auxiliary': 'int8'}

    for batch_path in batch_paths:

        batch = pd.read_parquet(batch_path).reset_index().astype(batch_dtypes)

        output_batch_x = None
        output_batch_y = None

        for event_id in batch['event_id'].unique():

            x_batch, y_batch = get_event_data(
                batch, event_id, sequence_length, sensor_geometry, meta_data)

            x_tensor = tf.constant(x_batch)
            y_tensor = tf.constant(y_batch)

            if output_batch_x is None and output_batch_y is None:
                output_batch_x = x_tensor
                output_batch_y = y_tensor
                logging.debug('Output_batch initializing')

            else:
                output_batch_x = tf.concat([output_batch_x, x_tensor], axis=0)
                output_batch_y = tf.concat([output_batch_y, y_tensor], axis=0)

                logging.debug('Output_batch extending: %s',
                              len(output_batch_x))

            if len(output_batch_x) == batch_size:
                output = output_batch_x[:], output_batch_y[:]
                output_batch_x = None
                output_batch_y = None
                yield output


In [17]:
# create a generator object
train_data_gen = data_generator(training_batch_file_paths, sensor_geometry_df, meta_df, sequence_length=PULSE_AMOUNT)
val_data_gen = data_generator(validation_batch_file_paths, sensor_geometry_df, meta_df, sequence_length=PULSE_AMOUNT)

## Build the model

In [18]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, CuDNNLSTM
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [19]:
RUN_ID = f'{MODEL_TYPE}_{datetime.now().strftime("%d%m%Y%H%M%S")}'.replace(' ', '_')

In [20]:


# define the callback to save checkpoints
callbacks = [

    ModelCheckpoint(
        filepath=f"checkpoints/{RUN_ID}/{'epoch:02d'}",
        save_weights_only=True,
        save_freq='epoch'),

    ModelCheckpoint(f'checkpoints/{RUN_ID}/best_model_weights.h5',
                    save_best_only=True,
                    save_weights_only=True,
                    monitor='val_loss',
                    mode='min',
                    verbose=1),

    EarlyStopping(
        monitor='val_loss', 
        patience=10
    ),

    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=2,
        min_lr=1e-8 # type: ignore
    )
]


In [21]:

# Define the LSTM model
model = Sequential()
model.add(LSTM(LSTM_UNITS,input_shape=(PULSE_AMOUNT, len(FEATURES))))
model.add(Dense(2, activation='linear')) # set the number of output neurons to 2 and the activation function to linear

# Compile the model
model.compile(
    loss='mean_squared_error', 
    optimizer=Adam(learning_rate=LEARNING_RATE), 
    metrics=['accuracy']
)


## Train the model

In [22]:
import mlflow
mlflow.tensorflow.autolog(every_n_iter=1)

In [23]:

print('--- Starting trial: %s' % RUN_ID)

mlflow.set_experiment('LSTM')
mlflow.start_run(run_name=RUN_ID, description=f'LSTM model using {PULSE_AMOUNT} sequence length with {LSTM_UNITS}')
mlflow.log_param('batch_size', BATCH_SIZE)
mlflow.log_param('sequence_length', PULSE_AMOUNT)
mlflow.log_param('lstm_units', LSTM_UNITS)

# mlflow.log_param('learning_rate', LEARNING_RATE)
# mlflow.log_param('epochs')

history = model.fit(
    train_data_gen,
    steps_per_epoch=STEPS_PER_EPOCH, 
    validation_data=val_data_gen, validation_steps=100, epochs=100,
    
    # steps_per_epoch=100
    
    
    # epochs=EPOCHS, 
    # batch_size=BATCH_SIZE,
    # callbacks=callbacks,
    # validation_data=val_data_gen,
    # validation_steps=math.ceil(STEPS_PER_EPOCH/4),
    # validation_batch_size=math.ceil(STEPS_PER_EPOCH/4),
    # validation_freq=2,
    # use_multiprocessing=True,
    # workers=4,
    # verbose='1'
)

mlflow.end_run()

--- Starting trial: lstm_23022023190838
Epoch 1/10
Epoch 2/10
Epoch 2: val_loss improved from inf to 2.13189, saving model to checkpoints/lstm_23022023190838/best_model_weights.h5
Epoch 3/10

Process Keras_worker_ForkPoolWorker-6:
Process Keras_worker_ForkPoolWorker-7:
Process Keras_worker_ForkPoolWorker-8:
Process Keras_worker_ForkPoolWorker-5:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/aj/anaconda3/envs/KAG_IC_NEU/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/aj/anaconda3/envs/KAG_IC_NEU/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/aj/anaconda3/envs/KAG_IC_NEU/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/aj/anaconda3/envs/KAG_IC_NEU/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/aj/anaconda3/envs/KAG_IC_NEU/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/aj/anaconda3/envs

In [None]:
# Evaluate the model
loss = model.evaluate(test_sequences, test_values)

In [None]:
mlflow.end_run()

## For scoring

In [None]:
def angular_dist_score(az_true:float, zen_true:float, az_pred:float, zen_pred:float):
    '''
    calculate the MAE of the angular distance between two directions.
    The two vectors are first converted to cartesian unit vectors,
    and then their scalar product is computed, which is equal to
    the cosine of the angle between the two vectors. The inverse 
    cosine (arccos) thereof is then the angle between the two input vectors
    
    The lower the angle, the more similar the two vectors are meaning the score is better.
    
    Parameters:
    -----------
    
    az_true : float (or array thereof)
        true azimuth value(s) in radian
    zen_true : float (or array thereof)
        true zenith value(s) in radian
    az_pred : float (or array thereof)
        predicted azimuth value(s) in radian
    zen_pred : float (or array thereof)
        predicted zenith value(s) in radian
    
    Returns:
    --------
    
    dist : float
        mean over the angular distance(s) in radian
    '''
    
    if not (np.all(np.isfinite(az_true)) and
            np.all(np.isfinite(zen_true)) and
            np.all(np.isfinite(az_pred)) and
            np.all(np.isfinite(zen_pred))):
        raise ValueError("All arguments must be finite")
    
    # pre-compute all sine and cosine values
    sa1 = np.sin(az_true)
    ca1 = np.cos(az_true)
    sz1 = np.sin(zen_true)
    cz1 = np.cos(zen_true)
    
    sa2 = np.sin(az_pred)
    ca2 = np.cos(az_pred)
    sz2 = np.sin(zen_pred)
    cz2 = np.cos(zen_pred)
    
    # scalar product of the two Cartesian vectors (x = sz*ca, y = sz*sa, z = cz)
    scalar_prod = sz1*sz2*(ca1*ca2 + sa1*sa2) + (cz1*cz2)
    
    # scalar product of two unit vectors is always between -1 and 1, this is against numerical instability
    # that might otherwise occur from the finite precision of the sine and cosine functions
    scalar_prod =  np.clip(scalar_prod, -1, 1)
    
    # convert back to an angle (in radian)
    return np.average(np.abs(np.arccos(scalar_prod)))