# Simple PCA line prediction

This notebook uses PCA to predict the line of best fit to the data. It uses a single PCA component to predict the line of best fit to the data and reducing the dimensionality of the data to 1.

In [188]:
import pandas as pd 
from utils import seed_it_all

In [189]:
DATA_DIR = "./data"
BATCH = 1
EVENT = 59
EXCLUDE_AUXILIARY= False

In [190]:
seed_it_all(10)

In [191]:
train_meta_df = pd.read_parquet(f'{DATA_DIR}/train_meta.parquet', engine='fastparquet')
train_meta_df['event_id'].to_numpy()

array([24, 41, 59, ..., 2147483617, 2147483626, 2147483627], dtype=object)

In [192]:
train_batch_df = pd.read_parquet(f'{DATA_DIR}/train/batch_{BATCH}.parquet', engine='fastparquet').reset_index()

In [193]:
if EXCLUDE_AUXILIARY:
    train_batch_df = train_batch_df[~train_batch_df['auxiliary']]

In [194]:
sensor_geometry = pd.read_csv(f'{DATA_DIR}/sensor_geometry.csv', index_col=0)

In [195]:
event_df = train_batch_df[train_batch_df['event_id'] == EVENT]
event_df = pd.merge(
    left = event_df,
    right = sensor_geometry,
    how='inner',
    on='sensor_id'
)

In [196]:
import numpy as np

x, y, z = event_df['x'], event_df['y'], event_df['z']
# x, y, z

In [197]:
coords = np.array((x,y,z)).T

In [198]:
from typing import List, Tuple
from sklearn.decomposition import PCA

def get_direction(coords: np.ndarray) -> np.ndarray:
    """
    Get the direction vector from a list of coordinates.
    """
    pca = PCA(n_components=1)
    pca.fit(coords) 
    direction_vector = pca.components_#type: ignore
    return direction_vector

In [199]:
direction_vector = get_direction(coords)

In [200]:
def get_line(origin, vector, extent):
    origin = np.array(origin)  # convert to NumPy array
    vector = np.array(vector)  # convert to NumPy array
    below_origin = origin - vector * extent
    above_origin = origin + vector * extent
    line = np.vstack((below_origin, above_origin))
    return line

In [201]:
origin = np.mean(coords, axis=0)
euclidean_distance = np.linalg.norm(coords - origin, axis=1)
extent = np.max(euclidean_distance)
line = get_line(origin, direction_vector, extent)
line

array([[  27.4203288 ,  487.78462344, -882.94015341],
       [-207.33421769, -291.14795677,  504.41793119]])

In [202]:
mean = np.mean(line, axis=0)
mean

array([ -89.95694444,   98.31833333, -189.26111111])

In [203]:
points = line - mean
np.sum(points, axis=0)

array([0., 0., 0.])

In [204]:
from utils import sphere_to_cartesian, adjust_sphere, cartesian_to_sphere

In [205]:

train_metadata=train_meta_df[(train_meta_df['event_id'] == EVENT) & (train_meta_df['batch_id'] == BATCH)]
azimuth = train_metadata['azimuth'].values[0]
zenith= train_metadata['zenith'].values[0]
x,y,z = sphere_to_cartesian(azimuth, zenith) #type: ignore
x,y,z

(0.26887871688526843, 0.6180777630599297, -0.738704348448664)

In [206]:
truth_trace = get_line([0,0,0], [[x,y,z]], extent)
truth_trace

array([[-216.21795194, -497.02523732,  594.02671645],
       [ 216.21795194,  497.02523732, -594.02671645]])

In [207]:
truth_trace[:,0], truth_trace[:,1], truth_trace[:,2]

(array([-216.21795194,  216.21795194]),
 array([-497.02523732,  497.02523732]),
 array([ 594.02671645, -594.02671645]))

## Plot the result

In [208]:
import plotly.graph_objects as go

fig3 = go.Figure(data = [
        go.Scatter3d(
            x=event_df['x'].to_numpy(), y=event_df['y'].to_numpy(), z=event_df['z'].to_numpy(),
            mode='markers',
            marker=dict(size=5, color=event_df['time'].to_numpy(), opacity=1),
            name="Detected"
        ),
        go.Scatter3d(
            x=line[:,0], y=line[:,1], z=line[:,2],
            marker=dict(
                size=4,
                color='red',
            ),
            line=dict(
                color='red',
                width=3
            ),
            name="Predicted"
        ),
        go.Scatter3d(
            x=truth_trace[:,0], y=truth_trace[:,1], z=truth_trace[:,2],
            marker=dict(
                size=4,
                color='green',
            ),
            line=dict(
                color='green',
                width=3
            ),
            name="Truth"
        )
    ])

# Set x and y axis ranges
fig3.update_xaxes(range=[-extent, extent])
fig3.update_yaxes(range=[-extent, extent])
# fig3.update_zaxes(range=[-extent, extent])

# Add a legend
fig3.update_layout(
    showlegend=True, 
    legend=dict(x=0, y=1),)

fig3.show()

In [209]:
from utils import angular_dist_score

In [210]:

az_pred, zen_pred = adjust_sphere(*cartesian_to_sphere(x,y,z))

angular_dist_score(az_true=azimuth,zen_true=zenith, az_pred=az_pred, zen_pred=zen_pred)

r 1.0 x2y2 0.45431588558303493


0.0

# Make prediction on test set

In [None]:
test_meta_df = pd.read_parquet(f'{DATA_DIR}/test_meta.parquet', engine='fastparquet')
test_meta_df

Unnamed: 0,batch_id,event_id,first_pulse_index,last_pulse_index
0,661,2092,0,298
1,661,7344,299,334
2,661,9482,335,377


In [None]:
test_batch_df = pd.read_parquet(f'{DATA_DIR}/test/batch_661.parquet', engine='fastparquet').reset_index()
test_batch_df

Unnamed: 0,event_id,sensor_id,time,charge,auxiliary
0,2092,4066,6170,1.275,True
1,2092,3512,6374,0.975,True
2,2092,897,6378,1.475,True
3,2092,2060,6590,0.925,True
4,2092,3072,6625,1.075,True
...,...,...,...,...,...
373,9482,1133,13334,0.675,True
374,9482,2190,14112,1.075,True
375,9482,2057,14713,0.975,True
376,9482,4486,14765,1.425,True


In [None]:
test_events = test_meta_df['event_id'].unique().tolist()
test_events

[2092, 7344, 9482]

In [None]:
def get_event_df(batch_df: pd.DataFrame, sensor_geometry: pd.DataFrame, event_id: str) -> pd.DataFrame:
    """
    Get a DataFrame for a specific event.

    Parameters:
    train_batch_df (pandas.DataFrame): The batch DataFrame.
    sensor_geometry (pandas.DataFrame): The sensor geometry DataFrame.
    event_id (str): The event identifier.

    Returns:
    pandas.DataFrame: A DataFrame containing data for the specified event.
    """
    event_df = batch_df[batch_df['event_id'] == event_id]
    event_df = pd.merge(
        left=event_df,
        right=sensor_geometry,
        how='inner',
        on='sensor_id'
    )
    return event_df

In [None]:
from functools import reduce

test_dfs = [ get_event_df(test_batch_df, sensor_geometry,x) for x in test_events ]
len(test_dfs)

3

In [91]:
results = []

for test_df in test_dfs:
    x, y, z = test_df['x'], test_df['y'], test_df['z']
    coords = np.array((x,y,z)).T
    direction_vector = get_direction(coords)
    print('direction', direction_vector)
    origin = np.mean(coords, axis=0)
    euclidean_distance = np.linalg.norm(coords - origin, axis=1)
    extent = np.max(euclidean_distance)
    line = get_line(origin, direction_vector, extent)
    x, y, z= line[:,0], line[:,1], line[:,2]
    x = x.max() - x.min()
    y = y.max() - y.min()
    z = z.max() - z.min()
    az_pred, zen_pred = adjust_sphere(*cartesian_to_sphere(x,y,z))
    results.append([az_pred, zen_pred])

results
    

direction [[0.39323753 0.7327785  0.55533767]]
Getting line with vector [[0.39323753 0.7327785  0.55533767]]
From origin [-260.73147157 -374.10337793 -400.83702341]
With extent 1178.3822972960133
Below origin [[ -724.11560973 -1237.59658629 -1055.23709843]]
Above origin [[202.65266659 489.38983044 253.56305161]]
Line [[ -724.11560973 -1237.59658629 -1055.23709843]
 [  202.65266659   489.38983044   253.56305161]]
r 2356.7645945920267 x2y2 3841381.5215826244
direction [[ 0.45656735 -0.63621559  0.62191316]]
Getting line with vector [[ 0.45656735 -0.63621559  0.62191316]]
From origin [ 62.28944444 -93.72694444  89.94861111]
With extent 703.6149893114816
Below origin [[-258.95818456  353.92388214 -347.63881045]]
Above origin [[ 383.53707345 -541.37777102  527.53603267]]
Line [[-258.95818456  353.92388214 -347.63881045]
 [ 383.53707345 -541.37777102  527.53603267]]
r 1407.2299786229635 x2y2 1214365.2067069958
direction [[0.37338752 0.6023134  0.70554966]]
Getting line with vector [[0.373387

[[1.0782689122931226, 0.9820273837031137],
 [0.9483359318159352, 0.8996128838424357],
 [1.0158567198575796, 0.7875978411980064]]

In [92]:
submission = pd.DataFrame(columns=['event_id', 'azimuth', 'zenith'])
for i, event in enumerate(test_events):
    new_row = pd.DataFrame({ 'event_id': [event], 'azimuth': [results[i][0]],'zenith': [results[i][1]]})
    submission = pd.concat([ new_row, submission.loc[:]])
    
submission

Unnamed: 0,event_id,azimuth,zenith
0,9482,1.015857,0.787598
0,7344,0.948336,0.899613
0,2092,1.078269,0.982027


In [93]:
submission.set_index('event_id').to_csv('./artifacts/submission.csv')