In [None]:
import time

# Wall-clock start of the notebook run; the last cell subtracts this from the
# end time to report the total runtime in hours.
start_time = time.time()

# Train a model using XGBoost

## Imports

In [1]:
# Standard library imports
import os
import random
import math
import logging
import sys

# Third-party library imports
import numpy as np
import pandas as pd
import dask.dataframe as dd

# Typing imports
from typing import List, Tuple

# Append the parent directory to the module search path (appended, so it is
# searched last) - presumably so project modules one level up can be imported.
sys.path.append('..')


## Variables

In [216]:
# Parameters - everything a reader might want to tune lives in this cell.
# NOTE(review): the test-input cell reads batch_1.parquet directly rather than using BATCH.
BATCH = 1 # Which batch file to use
EXCLUDE_AUXILIARY = True # Whether get_event_df drops rows flagged `auxiliary` (and the flag column itself)
IS_TRAINING = True # Selects the 'train' or 'test' data set (see SET below)
# If either the event or time limit is reached the process will exit
EVENT_LIMIT = 3000 # Maximum number of events to process
TIME_LIMIT_HOURS = 1 # Maximum processing time, in hours
PULSE_AMOUNT = 200 # Number of pulses (rows) every event is padded/truncated to before flattening into features

# Directories
DATA_DIR = "data" # Relative path holding sensor_geometry.csv, <SET>_meta.parquet and <SET>/batch_*.parquet
SET = 'train' if IS_TRAINING else 'test' # Prefix used in input paths and in the artifacts/ output paths

# Logging verbosity passed to logging.basicConfig
LOG_LEVEL = logging.INFO

## Set up logging

In [217]:
# Set up logging.
# basicConfig opens the log file immediately, which fails with FileNotFoundError
# when the parent directory is missing (e.g. on a fresh checkout), so make sure
# it exists before configuring the handler.
os.makedirs('artifacts', exist_ok=True)
logging.basicConfig(filename='artifacts/info.log', level=LOG_LEVEL, format='%(asctime)s %(levelname)s %(message)s')

## Functions

In [218]:
def seed_it_all(seed=7):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    NOTE(review): PYTHONHASHSEED is presumably only honoured at interpreter
    start-up, in which case setting it here affects child processes only -
    confirm before relying on it for hash reproducibility.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_generator in (random.seed, np.random.seed):
        seed_generator(seed)


seed_it_all(10)

### For optimization

In [219]:
import importlib

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the narrowest dtype that can hold their range.

    Integer columns are narrowed to int8/int16/int32/int64 and floating point
    columns to float16/float32/float64, based on the column's min and max.
    Bounds are inclusive, so a column spanning exactly -128..127 becomes int8.
    Columns whose dtype is not in the numeric list are left untouched.

    NOTE(review): float16 keeps only about three significant decimal digits, so
    float columns whose range fits float16 are rounded - confirm that this loss
    of precision is acceptable for the features derived from them.

    Args:
        df: DataFrame to optimise. It is modified in place and also returned.
        verbose: When True (default) progress and the memory summary are
            written to the log; when False the function is silent.

    Returns:
        The same DataFrame with its numeric columns downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        if verbose:
            logging.info(f'Optimizing col {col}')
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Falls back to float64 when the range fits nothing narrower
            # (this also covers NaN/inf extremes, whose comparisons are False).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logging.info('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Guard the percentage against a zero-sized starting frame.
        if start_mem > 0:
            logging.info('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

def import_data(file: str):
    """Load a file with the pandas reader matching its extension and shrink it.

    The text after the last ``.`` in ``file`` selects the reader, e.g.
    ``'x.csv'`` is read with ``pd.read_csv``. The loaded frame is then passed
    through ``reduce_mem_usage``.

    Args:
        file: Path of the file to load.

    Returns:
        The loaded DataFrame with its numeric columns downcast.

    Raises:
        AttributeError: If pandas has no ``read_<extension>`` function.
    """
    extension = file.rsplit('.', 1)[-1]
    reader = getattr(pd, f"read_{extension}")
    return reduce_mem_usage(reader(file))

def get_event_df(batch_df: dd.DataFrame, sensor_geometry: pd.DataFrame, event_id: int) -> pd.DataFrame:
    """
    Get a DataFrame with the pulses of one event joined to the sensor geometry.

    Parameters:
    batch_df (dask.dataframe.DataFrame): The batch DataFrame; must expose the
        columns 'event_id', 'sensor_id' and 'auxiliary'.
    sensor_geometry (pandas.DataFrame): The sensor geometry DataFrame, keyed by 'sensor_id'.
    event_id (int): The event identifier.

    Returns:
    pandas.DataFrame: One row per pulse of the event, with the geometry columns
        attached and 'event_id' / 'sensor_id' removed. When EXCLUDE_AUXILIARY is
        True, auxiliary pulses and the 'auxiliary' column are removed as well.
    """
    if EXCLUDE_AUXILIARY:
        batch_df = batch_df[~batch_df['auxiliary']].drop(columns=['auxiliary'])

    # Materialise only the rows of the requested event.
    event_df = batch_df[batch_df['event_id'] == event_id].compute()

    # No trailing comma after the call: a comma here would wrap the merged frame
    # in a 1-tuple and make the .drop() below fail with AttributeError.
    event_df = pd.merge(
        left=event_df,
        right=sensor_geometry,
        how='inner',
        on='sensor_id'
    )

    ## Drop columns that are not needed for prediction
    return event_df.drop(columns=['event_id', 'sensor_id'])

### For scoring

In [220]:
def angular_dist_score(az_true:float, zen_true:float, az_pred:float, zen_pred:float):
    '''
    Mean angular distance between true and predicted directions.

    Each (azimuth, zenith) pair describes a unit vector
    (x = sin(zen)*cos(az), y = sin(zen)*sin(az), z = cos(zen)). The scalar
    product of the true and predicted unit vectors is the cosine of the angle
    between them, so its arccos is the angular error. A smaller value means
    the prediction is closer to the truth.

    Parameters:
    -----------

    az_true : float (or array thereof)
        true azimuth value(s) in radian
    zen_true : float (or array thereof)
        true zenith value(s) in radian
    az_pred : float (or array thereof)
        predicted azimuth value(s) in radian
    zen_pred : float (or array thereof)
        predicted zenith value(s) in radian

    Returns:
    --------

    dist : float
        mean over the angular distance(s) in radian

    Raises:
    -------

    ValueError
        if any input contains a non-finite value (NaN or +/-inf)
    '''
    # Reject NaN / inf up front, checking the arguments in call order.
    for angles in (az_true, zen_true, az_pred, zen_pred):
        if not np.all(np.isfinite(angles)):
            raise ValueError("All arguments must be finite")

    sin_az_true, cos_az_true = np.sin(az_true), np.cos(az_true)
    sin_zen_true, cos_zen_true = np.sin(zen_true), np.cos(zen_true)

    sin_az_pred, cos_az_pred = np.sin(az_pred), np.cos(az_pred)
    sin_zen_pred, cos_zen_pred = np.sin(zen_pred), np.cos(zen_pred)

    # Dot product of the two Cartesian unit vectors.
    cos_angle = sin_zen_true*sin_zen_pred*(cos_az_true*cos_az_pred + sin_az_true*sin_az_pred) + (cos_zen_true*cos_zen_pred)

    # Rounding can push the product marginally outside [-1, 1], where arccos
    # is undefined, so clamp it first.
    cos_angle = np.clip(cos_angle, -1, 1)

    angle = np.abs(np.arccos(cos_angle))
    return np.average(angle)

## Load the dataframes

In [221]:
# Read the sensor geometry CSV with pandas and downcast its numeric columns
# (import_data picks the reader from the extension and calls reduce_mem_usage).
sensor_geometry = import_data(f'{DATA_DIR}/sensor_geometry.csv')

In [222]:

# Open the per-event metadata as a dask dataframe; later cells call .compute()
# on filtered slices of it to obtain pandas objects.
meta_dfd = dd.read_parquet(f'{DATA_DIR}/{SET}_meta.parquet', 
    blocksize=64000000 # 64,000,000 bytes, i.e. 64 MB chunks
)

## Test input preparation

In [223]:
# Open batch 1 of the selected data set as a dask dataframe for the smoke test below.
# reset_index() turns the stored index into a regular column - presumably
# event_id, which get_event_df filters on; verify against the parquet schema.
test_batch_dfd = dd.read_parquet(f'{DATA_DIR}/{SET}/batch_1.parquet', 
        blocksize=64000000 # 64,000,000 bytes, i.e. 64 MB chunks
    ).reset_index()


# Preview the first row to check the columns
test_batch_dfd.head(1)

Unnamed: 0,event_id,sensor_id,time,charge,auxiliary
0,24,3918,5928,1.325,True


In [326]:
def make_input_vector_shape(df: pd.DataFrame, pulse_amount=None) -> pd.DataFrame:
    """Pad or truncate a per-pulse frame to a fixed number of rows.

    Frames with fewer rows are padded at the end with rows of zeros; frames
    with more rows keep only their first ``pulse_amount`` rows. The result
    always carries a fresh 0..pulse_amount-1 index, so row labels (and the
    feature names later derived from them) do not depend on which branch ran.

    Args:
        df (pd.DataFrame): The input dataframe, one row per pulse.
        pulse_amount (int, optional): Number of rows the result must have.
            Defaults to the notebook-level ``PULSE_AMOUNT``.

    Returns:
        pd.DataFrame: A dataframe with exactly ``pulse_amount`` rows and the
        same columns as ``df``.
    """
    if pulse_amount is None:
        pulse_amount = PULSE_AMOUNT

    row_count = len(df)
    if row_count >= pulse_amount:
        return df.head(pulse_amount).reset_index(drop=True)

    # Build the padding directly from numeric zeros. Creating an all-NaN object
    # frame and calling fillna(0) relies on pandas silently downcasting object
    # columns, which is deprecated and would leave object-typed features.
    padding = pd.DataFrame(
        0, index=range(row_count, pulse_amount), columns=df.columns
    )
    return pd.concat([df, padding], ignore_index=True)

In [327]:
def get_input_vector(df: pd.DataFrame, event_id: int) -> pd.DataFrame:
    """Flatten the pulses of one event into a single labelled feature row.

    The frame is first resized with ``make_input_vector_shape``; every cell then
    becomes a column named ``<row>_<column>`` (e.g. ``0_time``). The event's
    'azimuth' and 'zenith' are looked up in the notebook-level ``meta_dfd``
    and appended as the last two columns.

    Args:
        df (pd.DataFrame): Per-pulse data of one event, one pulse per row.
        event_id (int): Identifier of the event; used as the index of the
            result and to look up the targets in ``meta_dfd``.

    Returns:
        pd.DataFrame: One row indexed by 'event_id' holding the flattened
        features followed by 'azimuth' and 'zenith'.
    """
    shaped = make_input_vector_shape(df)

    # Long format: one record per (row label, column name, value).
    long_form = shaped.stack().reset_index()
    long_form['features'] = long_form['level_0'].astype(str) + '_' + long_form['level_1']
    long_form = long_form.drop(columns=['level_0','level_1']).set_index('features')

    # Wide format: a single row whose index is the event id.
    wide_row = long_form.T.set_index(pd.Index([event_id]))
    wide_row.index.name = 'event_id'

    event_meta = meta_dfd[meta_dfd['event_id']== event_id].compute()
    event_targets = event_meta[[ 'event_id','azimuth','zenith' ]]

    merged = pd.merge(wide_row, event_targets, on='event_id', how='inner')
    return merged.set_index('event_id')

In [328]:
# Smoke test: build the per-pulse frame for event 24 of the batch opened above
# and preview its first rows.
event_df = get_event_df(test_batch_dfd, sensor_geometry, 24)
event_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13 entries, 12 to 36
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   event_id   13 non-null     int64  
 1   sensor_id  13 non-null     int16  
 2   time       13 non-null     int64  
 3   charge     13 non-null     float64
dtypes: float64(1), int16(1), int64(2)
memory usage: 442.0 bytes


Unnamed: 0,time,charge,x,y,z
0,9868,1.375,-9.679688,-79.5,-219.5
1,12201,0.225,35.53125,-364.75,191.375
2,12339,1.225,35.53125,-364.75,191.375


In [329]:
# Smoke test: flatten the pulses of event 24 into one feature row with its
# azimuth/zenith targets attached, and display it.
input_vec = get_input_vector(event_df, 24)
input_vec

Unnamed: 0_level_0,0_time,0_charge,0_x,0_y,0_z,1_time,1_charge,1_x,1_y,1_z,...,198_x,198_y,198_z,199_time,199_charge,199_x,199_y,199_z,azimuth,zenith
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24,9868.0,1.375,-9.679688,-79.5,-219.5,12201.0,0.225,35.53125,-364.75,191.375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.029555,2.087498


## Create a data file

In [330]:
import datetime

# Build one CSV (written incrementally) and one .npy file per batch, holding
# one flattened feature row per event. Processing stops as soon as EVENT_LIMIT
# events have been written or TIME_LIMIT_HOURS has elapsed.

av_batch_time_secs = None
av_event_time_secs = None
train_start_time = time.time()
events_processed = 0
time_limit_secs = TIME_LIMIT_HOURS * 60 * 60
limit_reached = False  # set once the event or time limit is hit

batches = meta_dfd['batch_id'].unique().compute().values

# Stamp the run once, so every batch of this run lands in the same directory
# even when processing crosses a minute boundary.
# Format: day-month-year-hour:minute
# NOTE(review): ':' is not a valid path character on Windows; the format is kept
# because the training cell references a directory named this way.
date_string = datetime.datetime.now().strftime('%d-%m-%Y-%H:%M')

for i, batch_id in enumerate(batches):

    logging.info(f'Processing batch {batch_id} of {len(batches)}')

    # The batch dataframe; assembled once from `event_rows` after the event loop
    batch_df = None
    # Feature rows of this batch. Collected in a list and concatenated once,
    # because growing a DataFrame with concat inside the loop is quadratic.
    event_rows = []

    batch_dfd = dd.read_parquet(f'{DATA_DIR}/{SET}/batch_{batch_id}.parquet',
        blocksize=64000000 # 64,000,000 bytes, i.e. 64 MB chunks
    ).reset_index()

    # define the file path
    file_path = f'artifacts/{SET}/{date_string}/batch_{batch_id}.csv'
    parent_dir = os.path.dirname(file_path)
    os.makedirs(parent_dir, exist_ok=True)

    # Loop through unique event IDs
    events = batch_dfd['event_id'].unique().compute().values
    batch_start_time = time.time()

    for j, event_id in enumerate(events):

        logging.info(f'Processing event {event_id} of {len(events)} in batch {batch_id}')

        event_df = get_event_df(batch_dfd, sensor_geometry, event_id)

        input_vec = get_input_vector(event_df, event_id)

        # Write every row straight away so a crash loses at most one event.
        # The first row creates the file and writes the header; later rows are appended.
        is_first_row = not event_rows
        input_vec.to_csv(
            file_path,
            mode='w' if is_first_row else 'a',
            header=is_first_row,
            index=True,
            index_label='event_id',
        )
        event_rows.append(input_vec)

        # Time tracking
        now = time.time()
        current_time = now - train_start_time
        mins = current_time / 60
        logging.info(f"Total time taken so far: {round(mins, 2)} Minutes")

        # Mean seconds per event in this batch: elapsed batch time / events done
        av_event_time_secs = (now - batch_start_time) / (j + 1)

        logging.info(f'Average event time: {round(av_event_time_secs, 2)} Seconds')

        remaining_events = len(events) - j - 1
        remaining_event_secs = av_event_time_secs * remaining_events

        logging.info(
            f"""
                Remaining Events to process for batch: {remaining_events}. 
                Est time remaining to process: { round(remaining_event_secs / 60 / 60, 2)} Hours
                """
            )

        events_processed += 1

        logging.info(
            f"""
                Total events processed so far: {events_processed}
            """
            )

        # Enforce the limits declared in the configuration cell
        if events_processed >= EVENT_LIMIT:
            logging.info(f'Event limit of {EVENT_LIMIT} reached, stopping')
            limit_reached = True
            break
        if current_time >= time_limit_secs:
            logging.info(f'Time limit of {TIME_LIMIT_HOURS} hours reached, stopping')
            limit_reached = True
            break

    if event_rows:

        batch_df = pd.concat(event_rows)

        file_path = f'artifacts/{SET}/{date_string}/{batch_id}.npy'
        # create the parent directories if they don't exist
        parent_dir = os.path.dirname(file_path)

        os.makedirs(parent_dir, exist_ok=True)

        # DataFrame.to_numpy() only converts (its first argument is a dtype);
        # np.save does the writing. The event_id index is not part of the array.
        np.save(file_path, batch_df.to_numpy())

        current_time = time.time() - train_start_time
        # Mean seconds per batch: elapsed run time / batches done
        av_batch_time_secs = current_time / (i + 1)

        logging.info(
            f"""
                Average batch time: {round(av_batch_time_secs / 60, 2)} Minutes
            """
            )

        remaining_batches = len(batches) - i - 1
        remaining_batch_hours = (av_batch_time_secs * remaining_batches) / 60 / 60

        logging.info(
            f"""
                Remaining batches to process: {remaining_batches}, Est time remaining to process: {round(remaining_batch_hours, 2)} Hours
            """
            )

    if limit_reached:
        break

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13 entries, 12 to 36
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   event_id   13 non-null     int64  
 1   sensor_id  13 non-null     int16  
 2   time       13 non-null     int64  
 3   charge     13 non-null     float64
dtypes: float64(1), int16(1), int64(2)
memory usage: 442.0 bytes
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13 entries, 70 to 89
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   event_id   13 non-null     int64  
 1   sensor_id  13 non-null     int16  
 2   time       13 non-null     int64  
 3   charge     13 non-null     float64
dtypes: float64(1), int16(1), int64(2)
memory usage: 442.0 bytes
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13 entries, 119 to 138
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  

: 

: 

## Train the model

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [None]:

# load the data
training_file_path = 'artifacts/train/16-02-2023-20:49/batch_1.csv'
# event_id only identifies the row; load it as the index so it is not used as a feature
data = pd.read_csv(training_file_path, index_col='event_id')

targets=['azimuth', 'zenith']

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.drop(targets, axis=1), data[targets], test_size=0.2, random_state=42)

# define a pipeline to preprocess the input and train the model.
# azimuth and zenith are continuous angles in radian, so this is a (two-output)
# regression problem; a classifier would treat every distinct angle as a class.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', xgb.XGBRegressor())
])

# fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# make predictions on the testing data; columns follow the order of `targets`
y_pred = pipeline.predict(X_test)

# evaluate with the mean angular distance between true and predicted directions
# (accuracy_score is a classification metric and rejects continuous targets)
mean_angular_error = angular_dist_score(
    y_test['azimuth'].to_numpy(),
    y_test['zenith'].to_numpy(),
    y_pred[:, 0],
    y_pred[:, 1],
)
print(f'Mean angular error: {mean_angular_error:.2f} rad')


## Make the prediction

In [None]:
# Distinct batch ids listed in the metadata, materialised from dask into an array.
# NOTE(review): no prediction code follows in the visible notebook, so this value is not used yet.
batch_ids = meta_dfd['batch_id'].unique().compute().values # type: ignore

In [None]:
# Report the wall-clock duration of the whole run, measured from start_time
# (set in the first cell), converted from seconds to hours.
end_time = time.time()
total_time = end_time - start_time
total_hours = total_time / 60 / 60
print("Total time taken: ", round(total_hours,2), "Hours")