In [22]:
import time

start_time = time.time()

# Train a model using XGBoost

## Imports

In [23]:
# Standard library imports
import os
import random
import math

# Third-party library imports
import numpy as np
import pandas as pd
import dask.dataframe as dd

# Typing imports
from typing import List, Tuple


## Variables

In [122]:
# DATA_DIR = "/kaggle/input/icecube-neutrinos-in-deep-ice"
DATA_DIR = "data"
BATCH = 1
EVENT = 24
PULSE_AMOUNT=200
EXCLUDE_AUXILIARY= True
SHOW_DF=True
IS_TRAINING=True
SET='train' if IS_TRAINING == True else 'test'

## Functions

In [25]:
def seed_it_all(seed=7):
    """ Attempt to be Reproducible """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    
seed_it_all(10)

### For optimization

In [82]:
import importlib

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        print(f'Optimizing col {col}')
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def import_data(file: str):
    """create a dataframe and optimize its memory usage"""
    function_name = f"read_{file.split('.')[-1]}"
    function = getattr(pd, function_name)
    df = function(file)
    df = reduce_mem_usage(df)
    return df

def get_event_df(batch_df: dd.DataFrame, sensor_geometry: pd.DataFrame, event_id: int) -> pd.DataFrame:
    """
    Get a DataFrame for a specific event.

    Parameters:
    train_batch_df (pandas.DataFrame): The batch DataFrame.
    sensor_geometry (pandas.DataFrame): The sensor geometry DataFrame.
    event_id (str): The event identifier.

    Returns:
    pandas.DataFrame: A DataFrame containing data for the specified event.
    """
    # print('batch_df in', batch_df.head())
    if EXCLUDE_AUXILIARY:
        batch_df = batch_df[~batch_df['auxiliary']].drop(columns=['auxiliary'])
    
    event_df = batch_df[batch_df['event_id'] == event_id].compute()
    # print('event_df out', event_df.head())
    event_df = pd.merge(
        left=event_df,
        right=sensor_geometry,
        how='inner',
        on='sensor_id'
    )
    
    ## Drop columns that are not needed for prediction
    return event_df.drop(columns=['event_id', 'sensor_id'])

### For Geometry

In [83]:
def cartesian_to_sphere(x: float, y: float, z: float) -> Tuple[float, float]:
    """Maps vector cartesian coordinates (x, y, z) from the origin to spherical angles azimuth and zenith.
    
    See: https://en.wikipedia.org/wiki/Spherical_coordinate_system

    Args:
        x (float): The x-coordinate of the point.
        y (float): The y-coordinate of the point.
        z (float): The z-coordinate of the point.

    Returns:
        tuple[float, float]: The azimuth and zenith angles in radians.
    """
    x2y2 = x**2 + y**2
#     print('cartesian_to_sphere x**2 + y**2', x2y2)
    r = math.sqrt(x2y2 + z**2)
#     print('cartesian_to_sphere math.sqrt(x2y2 + z**2)', r)
    x_dv_py = 0 if x == 0 else x / math.sqrt(x2y2)
    azimuth = math.acos(x_dv_py) * np.sign(y)
#     print('math.acos(x / math.sqrt(x2y2)) * np.sign(y)', azimuth)
    zenith = math.acos(z / r)
#     print('zenith', zenith)
    
#     print('cartesian_to_sphere takes',x,y,z)
#     print('cartesian_to_sphere returns',azimuth, zenith)
    return azimuth, zenith


def sphere_to_cartesian(azimuth: float, zenith: float) -> Tuple[float, float, float]:
    """Map spherical coordinates to cartesian coordinates.
    see: https://stackoverflow.com/a/10868220/4521646
    
    Args:
        azimuth (float): The azimuth angle in radians.
        zenith (float): The zenith angle in radians.

    Returns:
        tuple: The x, y, z vector cartesian coordinates of the point from the origin.
    """
    x = math.sin(zenith) * math.cos(azimuth)
    y = math.sin(zenith) * math.sin(azimuth)
    z = math.cos(zenith)
    return x, y, z


def adjust_sphere(azimuth:float, zenith:float) -> Tuple[float, float]:
    """Adjust azimuth and zenith to be within [-pi, pi]

    Args:
        azimuth (float): The azimuth to adjust
        zenith (float): The zenith to adjust

    Returns:
        float: The adjusted azimuth and zenith
    """
    print('adjust_sphere takes',azimuth, zenith)
    
    if zenith < 0:
        zenith += math.pi
        azimuth += math.pi
    if azimuth < 0:
        azimuth += math.pi * 2
    azimuth = azimuth % (2 * math.pi)
#     print('adjust_sphere returns',azimuth, zenith)
    return azimuth, zenith

### For scoring

In [84]:
def angular_dist_score(az_true:float, zen_true:float, az_pred:float, zen_pred:float):
    '''
    calculate the MAE of the angular distance between two directions.
    The two vectors are first converted to cartesian unit vectors,
    and then their scalar product is computed, which is equal to
    the cosine of the angle between the two vectors. The inverse 
    cosine (arccos) thereof is then the angle between the two input vectors
    
    The lower the angle, the more similar the two vectors are meaning the score is better.
    
    Parameters:
    -----------
    
    az_true : float (or array thereof)
        true azimuth value(s) in radian
    zen_true : float (or array thereof)
        true zenith value(s) in radian
    az_pred : float (or array thereof)
        predicted azimuth value(s) in radian
    zen_pred : float (or array thereof)
        predicted zenith value(s) in radian
    
    Returns:
    --------
    
    dist : float
        mean over the angular distance(s) in radian
    '''
    
    if not (np.all(np.isfinite(az_true)) and
            np.all(np.isfinite(zen_true)) and
            np.all(np.isfinite(az_pred)) and
            np.all(np.isfinite(zen_pred))):
        raise ValueError("All arguments must be finite")
    
    # pre-compute all sine and cosine values
    sa1 = np.sin(az_true)
    ca1 = np.cos(az_true)
    sz1 = np.sin(zen_true)
    cz1 = np.cos(zen_true)
    
    sa2 = np.sin(az_pred)
    ca2 = np.cos(az_pred)
    sz2 = np.sin(zen_pred)
    cz2 = np.cos(zen_pred)
    
    # scalar product of the two Cartesian vectors (x = sz*ca, y = sz*sa, z = cz)
    scalar_prod = sz1*sz2*(ca1*ca2 + sa1*sa2) + (cz1*cz2)
    
    # scalar product of two unit vectors is always between -1 and 1, this is against numerical instability
    # that might otherwise occur from the finite precision of the sine and cosine functions
    scalar_prod =  np.clip(scalar_prod, -1, 1)
    
    # convert back to an angle (in radian)
    return np.average(np.abs(np.arccos(scalar_prod)))

## Load the dataframes

In [85]:
sensor_geometry = import_data(f'{DATA_DIR}/sensor_geometry.csv')

Optimizing col sensor_id
Optimizing col x
Optimizing col y
Optimizing col z
Memory usage after optimization is: 0.04 MB
Decreased by 74.9%


In [86]:

meta_dfd = dd.read_parquet(f'{DATA_DIR}/{SET}_meta.parquet', 
    blocksize=64000000 # = 64 Mb chunks
)

## Test input preparation

In [87]:
test_batch_dfd = dd.read_parquet(f'{DATA_DIR}/{SET}/batch_1.parquet', 
        blocksize=64000000 # = 64 Mb chunks,
    ).reset_index()


test_batch_dfd.head(1)


Unnamed: 0,event_id,sensor_id,time,charge,auxiliary
0,24,3918,5928,1.325,True


In [95]:
event_df = get_event_df(test_batch_dfd, sensor_geometry, 24)
event_df

Unnamed: 0,time,charge,x,y,z
0,9868,1.375,-9.679688,-79.5,-219.5
1,12201,0.225,35.53125,-364.75,191.375
2,12339,1.225,35.53125,-364.75,191.375
3,12206,1.225,35.53125,-364.75,208.375
4,12227,0.975,35.53125,-364.75,208.375
5,12377,0.725,35.53125,-364.75,208.375
6,12436,0.175,35.53125,-364.75,208.375
7,12607,0.375,35.53125,-364.75,276.5
8,12723,0.475,-43.28125,-267.5,296.0
9,12773,1.125,-43.28125,-267.5,278.75


In [None]:

if len(event_df) < PULSE_AMOUNT:
    blank_df = pd.DataFrame(index=range(len(event_df), PULSE_AMOUNT), columns=event_df.columns)
    event_df = pd.concat([df, blank_df], ignore_index=True)
elif len(event_df) > PULSE_AMOUNT:
    event_df = event_df.head(PULSE_AMOUNT)

In [108]:
# only get the first 200 pulses
event_df = event_df[:200]
event_df

Unnamed: 0,time,charge,x,y,z
0,9868,1.375,-9.679688,-79.5,-219.5
1,12201,0.225,35.53125,-364.75,191.375
2,12339,1.225,35.53125,-364.75,191.375
3,12206,1.225,35.53125,-364.75,208.375
4,12227,0.975,35.53125,-364.75,208.375
5,12377,0.725,35.53125,-364.75,208.375
6,12436,0.175,35.53125,-364.75,208.375
7,12607,0.375,35.53125,-364.75,276.5
8,12723,0.475,-43.28125,-267.5,296.0
9,12773,1.125,-43.28125,-267.5,278.75


In [104]:
stacked = event_df.stack().reset_index()
stacked

Unnamed: 0,level_0,level_1,0
0,0,time,9868.000000
1,0,charge,1.375000
2,0,x,-9.679688
3,0,y,-79.500000
4,0,z,-219.500000
...,...,...,...
60,12,time,13262.000000
61,12,charge,0.725000
62,12,x,35.531250
63,12,y,-364.750000


In [121]:
stacked['id'] = stacked['level_0'].astype(str) + '_' +stacked['level_1']
stacked

Unnamed: 0,level_0,level_1,0,id
0,0,time,9868.000000,0_time
1,0,charge,1.375000,0_charge
2,0,x,-9.679688,0_x
3,0,y,-79.500000,0_y
4,0,z,-219.500000,0_z
...,...,...,...,...
60,12,time,13262.000000,12_time
61,12,charge,0.725000,12_charge
62,12,x,35.531250,12_x
63,12,y,-364.750000,12_y


In [109]:
separator = '_'
result = [separator.join([str(elem) for elem in tup]) for tup in stacked.index.to_flat_index()]
result

TypeError: 'int' object is not iterable

## Train the model

In [89]:
submission = pd.DataFrame(columns=['event_id', 'azimuth', 'zenith'])

for batch_id in meta_dfd['batch_id'].unique().compute().values:
    
    print('Processing batch ', batch_id)
    
    batch_dfd = dd.read_parquet(f'{DATA_DIR}/{SET}/batch_{batch_id}.parquet', 
        blocksize=64000000 # = 64 Mb chunks,
    ).reset_index()
    
        
    # Loop through unique event IDs
    for event_id in batch_dfd['event_id'].unique().compute().values:
        
        print('Processing event', event_id, ' in batch ', batch_id)
        event_df = get_event_df(batch_id,sensor_geometry, event_id)
        
        submission = pd.concat([ new_row, submission.loc[:]]) 
        current_time = time.time() 
        print("Total time taken so far : ", current_time - start_time, "seconds")
        
        


Processing batch  1
Processing event 24  in batch  1


IndexError: invalid index to scalar variable.

## Make the prediction

In [80]:
batch_ids = meta_dfd['batch_id'].unique().compute().values # type: ignore