# Simple PCA line prediction

This notebook uses PCA to predict the line of best fit to the data. A single principal component is used, which gives the line of best fit and reduces the dimensionality of the data to 1.

In [1]:
import sys
sys.path.append('..')


import pandas as pd 
from scripts.utils import seed_it_all
from scripts.plot import compose_event_df, plot_pca, get_event_true_values

In [2]:
from typing import Optional


# --- User-tunable settings ---------------------------------------------------

# Root folder holding the meta parquet files, the sensor geometry CSV and the batches.
DATA_DIR = "../data"

# Event id to plot; leave as None to use a random event.
EVENT: Optional[int] = None

# Set to True to exclude auxiliary sensor readings.
EXCLUDE_AUXILIARY = False

# True -> use the training data, False -> use the test data.
IS_TRAINING = True


# Derived from IS_TRAINING -- DO NOT CHANGE THIS LINE
DATA_SET = 'test' if not IS_TRAINING else 'train'

In [10]:
# Fix the seed before the random event draw further down.
# NOTE(review): presumably seed_it_all seeds every RNG in use -- confirm in scripts.utils.
SEED = 10
seed_it_all(SEED)

In [11]:
# Event metadata for the selected split, and the sensor geometry table
# (the first CSV column becomes the index).
meta_path = f'{DATA_DIR}/{DATA_SET}_meta.parquet'
geometry_path = f'{DATA_DIR}/sensor_geometry.csv'

meta_df = pd.read_parquet(meta_path)
sensor_geometry = pd.read_csv(geometry_path, index_col=0)

## Plot the result

Run the following cell to plot the predicted line against the true values.

In [19]:
# Choose the event to plot.
# - EVENT is None  -> draw one random (event_id, batch_id) row from meta_df.
# - EVENT is given -> use it, and look up the batch that contains it.
# The check is `is None` rather than truthiness so an event id of 0 is honoured
# instead of silently falling back to a random event.
if EVENT is None:
    event_id, batch_id = meta_df[['event_id', 'batch_id']].sample(n=1).values[0]
else:
    event_batches = meta_df.loc[meta_df['event_id'] == EVENT, 'batch_id']
    if event_batches.empty:
        # Fail with a clear message instead of an IndexError from `.values[0]`.
        raise ValueError(f"EVENT {EVENT} was not found in {DATA_SET}_meta.parquet")
    event_id, batch_id = EVENT, event_batches.values[0]

# Read the batch from the same split (train/test) that meta_df was loaded from,
# so that IS_TRAINING = False does not look for test batches in the train folder.
batch_df = pd.read_parquet(
    f'{DATA_DIR}/{DATA_SET}/batch_{batch_id}.parquet',
    engine='fastparquet',
).reset_index()

event_df = compose_event_df(batch_df, event_id, sensor_geometry)

# NOTE(review): this presumably needs ground-truth columns in meta_df; confirm
# that get_event_true_values copes with the test split before using IS_TRAINING = False.
true_values = get_event_true_values(meta_df, event_id)

# The keyword is spelled `exclude_axillary` here because that is the name this call
# has always used -- presumably it matches plot_pca's signature; verify in scripts.plot.
plot_pca(event_df=event_df, labels=true_values, exclude_axillary=EXCLUDE_AUXILIARY)