In [49]:
import time

start_time = time.time()

# Train a model using XGBoost

## Imports

In [50]:
# Standard library imports
import os
import random
import logging

# Third-party library imports
import numpy as np
import pandas as pd

# Typing imports
from typing import List, Tuple


## Variables

In [51]:
# Parameters
BATCH_SIZE = 32 # What batch file to use?
EXCLUDE_AUXILIARY = False # Whether to exclude auxiliary pulses
IS_TRAINING = True # Whether to train the model
# If either the event or time limit is reached the process will exit
EVENT_LIMIT = 30
TIME_LIMIT_HOURS = 1
PULSE_AMOUNT = 200 # Amount of pulses to use for features
TARGET_LABELS=['azimuth', 'zenith']
VALIDATION_SPLIT=0.2 # Percentage of data to use for validation


# Directories
DATA_DIR = "../data"
SET = 'train' if IS_TRAINING else 'test'

# logging
LOG_LEVEL = logging.INFO

# Setup logging

In [52]:
# set up logging
logging.basicConfig(filename='./artifacts/info.log', level=LOG_LEVEL, format='%(asctime)s %(levelname)s %(message)s')

## Functions

In [53]:
def seed_it_all(seed=7):
    """ Attempt to be Reproducible """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    
seed_it_all(10)

### For optimization

In [54]:

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        logging.info(f'Optimizing col {col}')
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    logging.info('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    logging.info('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

def import_data(file: str):
    """create a dataframe and optimize its memory usage"""
    function_name = f"read_{file.split('.')[-1]}"
    function = getattr(pd, function_name)
    df = function(file)
    df = reduce_mem_usage(df)
    return df

def get_event_df(batch_df: pd.DataFrame, sensor_geometry: pd.DataFrame, event_id: int) -> pd.DataFrame:
    """
    Get a DataFrame for a specific event.

    Parameters:
    train_batch_df (pandas.DataFrame): The batch DataFrame.
    sensor_geometry (pandas.DataFrame): The sensor geometry DataFrame.
    event_id (str): The event identifier.

    Returns:
    pandas.DataFrame: A DataFrame containing data for the specified event.
    """
    if EXCLUDE_AUXILIARY:
        batch_df = batch_df[~batch_df['auxiliary']].drop(columns=['auxiliary'])
    
    event_df = batch_df[batch_df['event_id'] == event_id]
        
    event_df = pd.merge(
        left=event_df,
        right=sensor_geometry,
        how='left',
        # blocksize=64000000 # = 64 Mb chunks,
    ).reset_index()
    
    return event_df.drop(columns=['event_id', 'sensor_id'])

### For scoring

## Test input preparation

In [55]:
# test_batch_df = pd.read_parquet(f'{DATA_DIR}/{SET}/batch_1.parquet' ).reset_index()


# test_batch_df.head(1)


In [56]:
def make_input_vector_shape(df: pd.DataFrame) -> pd.DataFrame:
    """Corrects the shape of the input vector.

    Args:
        df (pd.DataFrame): The input dataframe not sized.

    Returns:
        pd.DataFrame: The newly sized dataframe that has the correct shape and filled with zeros.
    """
    if len(df) < PULSE_AMOUNT:
        
        blank_df = pd.DataFrame(
                index=range(len(df), PULSE_AMOUNT), columns=df.columns
            ).fillna(0)
        return pd.concat([df, blank_df], ignore_index=True)
        
    elif len(df) > PULSE_AMOUNT:
        return df.head(PULSE_AMOUNT)
        
    else:
        return df

In [57]:
meta_df = pd.read_parquet(f'{DATA_DIR}/{SET}_meta.parquet')

In [58]:
def get_input_vector(df: pd.DataFrame, event_id: int, is_training=False) -> pd.DataFrame:
    """Changes the rows of a dataframe to columns

    Args:
        df (pd.DataFrame): The dataframe to be converted that currently has observations in rows

    Returns:
        pd.DataFrame: A single observation in columns
    """
    df = make_input_vector_shape(df)
    df = df.stack().reset_index()
    df['features'] = df['level_0'].astype(str) + '_' + df['level_1']
    df = df.drop(columns=['level_0','level_1']).set_index('features')
    df = df.T.set_index(pd.Index([event_id]))
    df.index.name = 'event_id'
    
    drop_cols = [ 'event_id']
    
    if is_training:
        drop_cols.extend(['azimuth','zenith'])
    
    return pd.merge(
        df, 
        meta_df[meta_df['event_id']== event_id][drop_cols], 
        on='event_id', 
        how='inner'
    ).set_index('event_id')

In [59]:
sensor_geometry = import_data(f'{DATA_DIR}/sensor_geometry.csv')

In [60]:
# event_df = get_event_df(test_batch_df, sensor_geometry, 24)
# event_df.head(3)

In [61]:
# input_vec = get_input_vector(event_df, 24)
# input_vec

## Split the data

In [62]:
import datetime
from typing import Optional, List


def data_generator(
    batch_paths: List[str],
    # sequence_length: int,
    # batch_size: int = BATCH_SIZE
    is_training = False
):

    """Emits a single event training example to be called by the model.fit_generator() method.

    Args:
        batch_paths (List[str]): A list of paths to the batch files
        sensor_geometry_df (pd.DataFrame): The sensor geometry dataframe
        meta_df (pd.DataFrame): The dataframe containing the meta data
        sequence_length (int): The length of the pulse sequence to use for training

    Yields:
        _type_: _description_
    """
    batch_dtypes = {'event_id': 'int32', 'sensor_id': 'int16',
                    'time': 'int32', 'charge': 'float16', 'auxiliary': 'int8'}
    
    av_batch_time_secs = None
    av_event_time_secs = None
    train_start_time = time.time()
    events_processed = 0


    for i, batch_path in enumerate(batch_paths):

        batch_id = int(batch_path.split('/')[-1].split('.')[0].split('_')[-1])
        
        logging.info(f'Processing batch {batch_id} of {len(batch_paths)}')
        
        output_df: Optional[pd.DataFrame] = None
        
        batch_df = pd.read_parquet(batch_path).reset_index()
            # type: ignore        
        
        # get the current date and time
        now = datetime.datetime.now()
        # create a date string with the format day-month-year-hour:minute
        date_string = now.strftime('%d-%m-%Y-%H:%M')
        # define the file path
        file_path = f'artifacts/{SET}/{date_string}/batch_{batch_id}.csv'
        parent_dir = os.path.dirname(file_path)
        os.makedirs(parent_dir, exist_ok=True)
            
        # Loop through unique event IDs
        events = batch_df['event_id'].unique()
        
        for j, event_id in enumerate(events):
            
            logging.info(f'Processing event {event_id} of {len(events)} in batch {batch_id}')
            
            event_df = get_event_df(batch_df, sensor_geometry, event_id)
            
            input_vec =  get_input_vector(event_df, event_id, is_training)
            
            # check if a DataFrame exists
            if output_df is not None:
                
                output_df = pd.concat([ output_df, input_vec])
                input_vec.to_csv(file_path, mode='a', header=False, index=True)
            else:
                # handle the case where the DataFrame does not exist
                output_df = input_vec
                output_df.to_csv(file_path, index=True, index_label='event_id')
            
            
            # Time tracking
            current_time = time.time() - train_start_time
            mins = current_time / 60
            logging.info(f"Total time taken so far: {round(mins, 2)} Minutes")

            av_event_time_secs = current_time if av_event_time_secs is None else (av_event_time_secs + current_time) / j + 1
            
            logging.info(f'Average event time: {round(av_event_time_secs, 2)} Seconds')

            remaining_events = len(events) - j - 1
            remaining_event_minutes = (av_event_time_secs * remaining_events)
            
            logging.info(
                f"""
                    Remaining Events to process for batch: {remaining_events}. 
                    Est time remaining to process: { round(remaining_event_minutes / 60 / 60, 2)} Hours
                    """
                )
            
            
            events_processed += 1
            
            logging.info(
                f"""
                    Total events processed so far: {events_processed}
                """
                )
            
            if len(output_df) == BATCH_SIZE:
                
                yield output_df
                
                output_df = None
                
                
        if batch_df is not None and False:
        
            file_path = f'artifacts/{SET}/{date_string}/{batch_id}.npy'
            # create the parent directories if they don't exist
            parent_dir = os.path.dirname(file_path)
            
            os.makedirs(parent_dir, exist_ok=True)

            batch_df.to_numpy(file_path)
            
            current_time = time.time() - train_start_time
            av_batch_time_secs = current_time if av_batch_time_secs is None else (av_batch_time_secs + current_time) / i + 1
            
            logging.info(
                f"""
                    Average batch time: {round(av_batch_time_secs / 60, 2)} Minutes
                """
                )
            
            remaining_batches = len(events) - i - 1
            remaining_batch_hours = (av_batch_time_secs * remaining_batches) / 60 / 60
        
            logging.info(
                f"""
                    Remaining Events to process for batch: {remaining_batches}, Est time remaining to process: {round(remaining_batch_hours, 2)} Hours
                """
                )

In [63]:
batch_directory = f'{DATA_DIR}/{SET}'
batch_file_paths = [f'{batch_directory}/{file}' for file in os.listdir(batch_directory) if os.path.isfile(os.path.join(batch_directory, file))]

In [64]:
file_split = int(len(batch_file_paths) * VALIDATION_SPLIT)

training_batch_file_paths = batch_file_paths[:-file_split]
validation_batch_file_paths = batch_file_paths[-file_split:]

len(training_batch_file_paths), len(validation_batch_file_paths)

(528, 132)

In [65]:
# create a generator object
train_data_gen = data_generator(training_batch_file_paths, True)

In [66]:
val_data_gen = data_generator(validation_batch_file_paths, True)

In [67]:
# Test Generator
# next(train_data_gen)

## Train the model

In [68]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [69]:
# len(X_test), len(y_test)
# TIME_LIMIT_HOURS
# X_test, y_test

## Create Pipeline

In [70]:
# define a pipeline to preprocess the input and train the model

params = {'max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 100}
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', xgb.XGBRegressor(params))
])



In [71]:

for step in range(EVENT_LIMIT):
    
    print('Training batch: ', step, ' of ', EVENT_LIMIT, ' batches of ', BATCH_SIZE, ' events.')

    # split the data into training and testing sets
    example = next(train_data_gen)
    X_train, y_train  = example.drop(columns=TARGET_LABELS), example[TARGET_LABELS]
    # len(X_train), len(y_train),

    # fit the pipeline on the training data
    pipeline.fit(X_train, y_train)
    
    

Training batch:  0  of  30  batches of  32  events.


XGBoostError: [14:08:36] ../src/objective/objective.cc:26: Unknown objective function: `{'max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 100}`
Objective candidate: survival:aft
Objective candidate: binary:hinge
Objective candidate: multi:softmax
Objective candidate: multi:softprob
Objective candidate: rank:pairwise
Objective candidate: rank:ndcg
Objective candidate: rank:map
Objective candidate: reg:squarederror
Objective candidate: reg:squaredlogerror
Objective candidate: reg:logistic
Objective candidate: binary:logistic
Objective candidate: binary:logitraw
Objective candidate: reg:linear
Objective candidate: reg:pseudohubererror
Objective candidate: count:poisson
Objective candidate: survival:cox
Objective candidate: reg:gamma
Objective candidate: reg:tweedie
Objective candidate: reg:absoluteerror

Stack trace:
  [bt] (0) /home/aj/anaconda3/envs/KAG_IC_NEU/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x36cf3d) [0x7f5cab591f3d]
  [bt] (1) /home/aj/anaconda3/envs/KAG_IC_NEU/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x36d629) [0x7f5cab592629]
  [bt] (2) /home/aj/anaconda3/envs/KAG_IC_NEU/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x2daa02) [0x7f5cab4ffa02]
  [bt] (3) /home/aj/anaconda3/envs/KAG_IC_NEU/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x2e4575) [0x7f5cab509575]
  [bt] (4) /home/aj/anaconda3/envs/KAG_IC_NEU/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x2ddc29) [0x7f5cab502c29]
  [bt] (5) /home/aj/anaconda3/envs/KAG_IC_NEU/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x70) [0x7f5cab352fb0]
  [bt] (6) /home/aj/anaconda3/envs/KAG_IC_NEU/lib/python3.8/lib-dynload/../../libffi.so.8(+0xa052) [0x7f61ef798052]
  [bt] (7) /home/aj/anaconda3/envs/KAG_IC_NEU/lib/python3.8/lib-dynload/../../libffi.so.8(+0x88cd) [0x7f61ef7968cd]
  [bt] (8) /home/aj/anaconda3/envs/KAG_IC_NEU/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(_ctypes_callproc+0x2fe) [0x7f61efa22d6e]



## Score

In [None]:
def angular_dist_score(az_true:float, zen_true:float, az_pred:float, zen_pred:float):
    '''
    calculate the MAE of the angular distance between two directions.
    The two vectors are first converted to cartesian unit vectors,
    and then their scalar product is computed, which is equal to
    the cosine of the angle between the two vectors. The inverse 
    cosine (arccos) thereof is then the angle between the two input vectors
    
    The lower the angle, the more similar the two vectors are meaning the score is better.
    
    Parameters:
    -----------
    
    az_true : float (or array thereof)
        true azimuth value(s) in radian
    zen_true : float (or array thereof)
        true zenith value(s) in radian
    az_pred : float (or array thereof)
        predicted azimuth value(s) in radian
    zen_pred : float (or array thereof)
        predicted zenith value(s) in radian
    
    Returns:
    --------
    
    dist : float
        mean over the angular distance(s) in radian
    '''
    
    if not (np.all(np.isfinite(az_true)) and
            np.all(np.isfinite(zen_true)) and
            np.all(np.isfinite(az_pred)) and
            np.all(np.isfinite(zen_pred))):
        raise ValueError("All arguments must be finite")
    
    # pre-compute all sine and cosine values
    sa1 = np.sin(az_true)
    ca1 = np.cos(az_true)
    sz1 = np.sin(zen_true)
    cz1 = np.cos(zen_true)
    
    sa2 = np.sin(az_pred)
    ca2 = np.cos(az_pred)
    sz2 = np.sin(zen_pred)
    cz2 = np.cos(zen_pred)
    
    # scalar product of the two Cartesian vectors (x = sz*ca, y = sz*sa, z = cz)
    scalar_prod = sz1*sz2*(ca1*ca2 + sa1*sa2) + (cz1*cz2)
    
    # scalar product of two unit vectors is always between -1 and 1, this is against numerical instability
    # that might otherwise occur from the finite precision of the sine and cosine functions
    scalar_prod =  np.clip(scalar_prod, -1, 1)
    
    # convert back to an angle (in radian)
    return np.average(np.abs(np.arccos(scalar_prod)))

In [None]:

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

validation_batches = int(EVENT_LIMIT * VALIDATION_SPLIT)
for step in range(validation_batches):
    
    print('Validation batch: ', step, ' of ', validation_batches, ' batches of ', BATCH_SIZE, ' events.')

    # split the data into training and testing sets
    example = next(val_data_gen)
    X, y = example.drop(columns=TARGET_LABELS), example[TARGET_LABELS]
    # len(X_train), len(y_train),

    # fit the pipeline on the training data
    score=pipeline.score(X, y)
    print('Accuracy: ',round(score,2), 'on validation set', step)
    # score the XGBoost model on the test set   
    y_pred = pipeline.predict(X)
    print('MSE:',  mean_squared_error(y, y_pred))
    print('Mean Absolute Error:',  mean_absolute_error(y, y_pred))
    print('R2:',  r2_score(y, y_pred))
    print('R2:',  r2_score(y, y_pred))
    
    angular_dist_score()


Validation batch:  0  of  6  batches of  32  events.
Accuracy:  -0.4 on validation set 0
MSE: 2.7766727959153927
Mean Absolute Error: 1.2594750854845769
R2: -0.4009029284112363
Validation batch:  1  of  6  batches of  32  events.
Accuracy:  -0.48 on validation set 1
MSE: 3.0231595757737066
Mean Absolute Error: 1.2974820096864081
R2: -0.4822985101708863
Validation batch:  2  of  6  batches of  32  events.
Accuracy:  -0.42 on validation set 2
MSE: 3.053349252016077
Mean Absolute Error: 1.3045031556225826
R2: -0.4160809329580555
Validation batch:  3  of  6  batches of  32  events.
Accuracy:  -0.28 on validation set 3
MSE: 2.5592359873781643
Mean Absolute Error: 1.2190662324317112
R2: -0.28171035135580524
Validation batch:  4  of  6  batches of  32  events.
Accuracy:  -0.25 on validation set 4
MSE: 1.7423941673188166
Mean Absolute Error: 1.0080166739294607
R2: -0.25425004146911945
Validation batch:  5  of  6  batches of  32  events.
Accuracy:  -0.31 on validation set 5
MSE: 2.8851564122856

## Make the prediction

In [None]:
batch_ids = meta_df['batch_id'].unique().compute().values # type: ignore

In [None]:
end_time = time.time()
total_time = end_time - start_time
total_hours = total_time / 60 / 60
print("Total time taken: ", round(total_hours,2), "Hours")

In [None]:

# evaluate the accuracy of the model
accuracy = [ angular_dist_score(y_test.iloc[i]['azimuth'], y_test.iloc[i]['zenith'], y_pred[i][0], y_pred[i][1]) for i in range(len(y_test))] 
print(f'Accuracy: {sum(accuracy) / len(accuracy)}')