## Notebook to create submission and calculate local score

This Jupyter notebook predicts the states and extracts the events based on the preprocessed data and the trained model. Afterwards it calculates the mean average precision score and exports the events as submission.csv.

> Before running this notebook, please ensure that the directory `../data/engineered/train` contains the training data and that `../data/engineered/val` contains the validation data from the feature engineering notebook. Also make sure that the model and the scaler have been trained with the model notebook.

### Install packages

In [None]:
%pip install matplotlib joblib pandas numpy pyarrow

### Imports

In [None]:
from joblib import load
import pandas as pd
import pyarrow.dataset as ds
from score import *
from event_extraction_function import *
from plot_function import *
from columns_drop import *
from warnings import filterwarnings

filterwarnings('ignore')

### Load model and data

In [None]:
def load_model(name: str):
    """Read a joblib-serialized object (model or scaler) from disk.

    Args:
        name: Path of the ``.jlb`` file to read.

    Returns:
        The object that joblib reconstructs from the file.

    Note:
        joblib files are pickle-based, so only load files you trust.
    """
    with open(name, mode='rb') as model_file:
        artifact = load(model_file)
    return artifact

In [None]:
# Paths of the serialized classifier and of the feature scaler.
# NOTE(review): the scaler file name mentions "wrist" / max_depth 20 while the
# model file name mentions "both" / max_depth 40 - confirm that both artifacts
# come from the same training run.
M1_PATH = 'models/model_randomforestclassifier-both-n_estimators__500-max_depth__40-min_samples_leaf__15-random_state__42-n_jobs__10-warm_start__true.jlb'
S_PATH  = 'models/scaler_randomforestclassifier-wrist-n_estimators__500-max_depth__20-min_samples_leaf__15-random_state__42-n_jobs__10-warm_start__true.jlb'

# Deserialize the model first, then the scaler.
m1, s = (load_model(path) for path in (M1_PATH, S_PATH))

In [None]:
# Locations of the engineered datasets. Nothing is read here; the paths are
# opened later by batched_dataloader.
NEW_TRAIN_DATA = '../data/engineered/train'
NEW_VALIDATION_DATA = '../data/engineered/val'

### Predicting states and extracting events

In [None]:
def batched_dataloader(validation=True, batch_size=100_000):
    """Stream an engineered dataset as pandas DataFrames.

    Record batches read from the dataset are accumulated until at least
    ``batch_size`` rows are available and are then yielded as one DataFrame
    with a fresh ``RangeIndex``. The last DataFrame may hold fewer rows.

    Args:
        validation: Read ``NEW_VALIDATION_DATA`` when true, otherwise
            ``NEW_TRAIN_DATA``.
        batch_size: Minimum number of rows per yielded DataFrame (except for
            the last one); also passed to ``Dataset.to_batches``.

    Yields:
        Non-empty DataFrames. Nothing is yielded for an empty dataset, and no
        trailing empty DataFrame is produced when the data ends exactly on a
        batch boundary.
    """
    dataset = ds.dataset(NEW_VALIDATION_DATA if validation else NEW_TRAIN_DATA)

    # Collect the pieces in a list and concatenate once per yielded batch;
    # growing a DataFrame with pd.concat on every record batch copies the
    # accumulated rows again and again.
    pending = []
    pending_rows = 0
    for file_batch in dataset.to_batches(batch_size=batch_size):
        frame = file_batch.to_pandas()
        if frame.empty:
            continue
        pending.append(frame)
        pending_rows += len(frame)
        if pending_rows >= batch_size:
            yield pd.concat(pending, ignore_index=True)
            pending = []
            pending_rows = 0

    # Flush the remainder only if there is one, so consumers never receive an
    # empty, column-less DataFrame.
    if pending:
        yield pd.concat(pending, ignore_index=True)

In [None]:
BATCH_SIZE = 10_000_000

# Probability at or above which a row is labelled as awake.
AWAKE_THRESHOLD = 0.65

# Columns taken over from the heuristic output into the submission frame.
SUBMISSION_COLUMNS = ['series_id',
                      'step',
                      'event',
                      'score',
                      'probability',
                      'timestamp',
                      'enmo',
                      'remove_events']

# Per-batch event frames; concatenated once after the loop instead of growing
# submission_df with pd.concat on every iteration.
event_frames = []

# True for validation, False for training
for batch in batched_dataloader(True, BATCH_SIZE):

    # A fresh RangeIndex is needed so the identifier columns set aside here
    # align with the frames rebuilt from the scaler output below.
    batch.reset_index(inplace=True, drop=True)
    series_id_minutes = batch[['series_id', 'minute']]
    batch = batch.drop(columns_to_drop, axis=1)
    batch = pd.DataFrame(s.transform(batch), columns=batch.columns)

    # predict class probabilities on the scaled features; column 1 is used as
    # the probability of being awake
    pred = m1.predict_proba(batch)

    # bring the features back to their original scale before handing the
    # frame to the heuristic
    batch = pd.DataFrame(s.inverse_transform(batch), columns=batch.columns)
    batch['probability'] = pred[:, 1]

    # define where to make the cut (vectorized instead of a per-row apply)
    batch['pred_awake'] = (batch['probability'] >= AWAKE_THRESHOLD).astype('int64')

    # prepare df for event extraction function
    batch['series_id'] = series_id_minutes['series_id']
    batch['minute'] = series_id_minutes['minute']

    # apply heuristic to turn the per-row predictions into events
    pre_sub = heuristic_function(batch, period_1=30, period_2=30)

    event_frames.append(pre_sub[SUBMISSION_COLUMNS])

if event_frames:
    submission_df = pd.concat(event_frames)
else:
    # no batches were produced: keep the expected columns so later cells work
    submission_df = pd.DataFrame(columns=SUBMISSION_COLUMNS)

# drop events flagged for removal (the original comment attributes these to
# nights with repetitive movement patterns); the filter is row-wise, so
# applying it once here equals applying it after every batch
submission_df = submission_df[submission_df['remove_events'] != True]

submission_df = submission_df.reset_index(drop=True).reset_index(names="row_id")

### Load solution

In [None]:
# Ground-truth events; rows without a step cannot be scored and are dropped.
solution = pd.read_csv('../data/train_events.csv').dropna(subset=['step'])

# Keep only the series for which events were predicted.
predicted_series = submission_df['series_id'].unique()
sample_val = solution[solution['series_id'].isin(predicted_series)]

### Calculate score

In [None]:
# Tolerances passed to the scoring function for each event type.
sleep_tolerances = {
    'onset': [12, 36, 60, 90, 120, 150, 180, 240, 300, 360],
    'wakeup': [12, 36, 60, 90, 120, 150, 180, 240, 300, 360]
}

# Keep the result under its own name: assigning it to `score` would replace
# the imported scoring function and make this cell fail when it is re-run.
local_score = score(
        solution = sample_val,
        submission = submission_df,
        tolerances = sleep_tolerances,
        series_id_column_name = 'series_id',
        time_column_name = 'step',
        event_column_name = 'event',
        score_column_name = 'score',
        use_scoring_intervals =False,
)
local_score

### Visualize the submission vs. the solution

In [None]:
# Only the series ids found in the first 100 solution rows are plotted;
# use the whole 'series_id' column instead to visualize every series.
train_series = pd.read_parquet("../data/train_series.parquet")
series_ids_to_plot = sample_val['series_id'].head(100).unique()
for series_id in series_ids_to_plot:
    plot_whole_series(
        series_id,
        train_series,
        sample_val,
        submission_df,
        'lightgrey',
        'blue',
        'green',
        font_size=20,
    )

### Export the submission

In [None]:
# Remove the helper columns that are not written to the submission file.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise a KeyError.
submission_df.drop(
    columns=['probability', 'timestamp', 'enmo', 'remove_events'],
    errors='ignore',
    inplace=True,
)
submission_df.to_csv('submission.csv', index=False, encoding='utf-8', lineterminator='\n')