In [None]:
from itertools import product
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from math import ceil, floor
import os
from pickle import dump, load
import pandas as pd
import pyarrow.dataset as ds
import gc

In [None]:
def fill_step_on_train_events_when_nan(train_events):
    """Replace missing ``step`` values with a running counter and cast to int.

    Every missing step becomes the previous row's (already filled) step plus
    one, so a run of consecutive gaps counts upward from the last known step.
    Gaps at the very start of the frame count upward from 0 (1, 2, ...).

    The computation is vectorized instead of walking the frame with
    ``iterrows()``; the resulting values are the same.

    Parameters
    ----------
    train_events : pandas.DataFrame
        Frame with a ``step`` column. Rows are processed in their current order.

    Returns
    -------
    pandas.DataFrame
        The same object (modified in place) with ``step`` fully populated and
        cast to an integer dtype.
    """
    step = train_events["step"]
    missing = step.isna()

    # Last known step for every row; 0 for rows before the first known step.
    base = step.ffill().fillna(0)

    # Position of each missing row inside its run of consecutive gaps
    # (1, 2, 3, ...). Rows that already have a step get an offset of 0.
    # Grouping by a plain array avoids any index alignment.
    run_id = (~missing).cumsum().to_numpy()
    offset = missing.astype("int64").groupby(run_id).cumsum()

    train_events["step"] = (base + offset).astype("int")

    return train_events

def engineer_awake_on_train_events(train_events):
    """Derive numeric ``awake``, ``onset`` and ``wakeup`` columns from ``event``.

    * ``awake``  -- 1 for an "onset" event, 0 for a "wakeup" event. Rows whose
      event is missing (or is any other label) are NaN; no fill is applied to
      this column here.
    * ``onset``  -- 1 where the event is "onset", otherwise 0.
    * ``wakeup`` -- 1 where the event is "wakeup", otherwise 0.

    The previous implementation called ``.fillna(0)`` on ``onset``/``wakeup``
    without storing the result, so the fill never took effect; the indicator
    columns are now built so that missing events really become 0.

    Parameters
    ----------
    train_events : pandas.DataFrame
        Frame with an ``event`` column.

    Returns
    -------
    pandas.DataFrame
        The same object (modified in place) with the three columns added.
    """
    event = train_events["event"]

    # ``map`` yields a numeric column directly instead of relying on the
    # dtype inference that ``replace`` performs on object columns.
    train_events["awake"] = event.map({"onset": 1, "wakeup": 0})

    # A comparison against NaN is False, so missing events end up as 0.
    train_events["onset"] = (event == "onset").astype("int64")
    train_events["wakeup"] = (event == "wakeup").astype("int64")

    return train_events

def engineer_wearable_on_on_train_events(train_events):
    """Add a ``wearable_on`` flag: 0 where ``step`` is missing, 1 otherwise.

    The column is written onto ``train_events`` itself, which is also returned.
    """
    has_step = train_events['step'].notna()
    train_events['wearable_on'] = has_step.astype('int64')

    return train_events

def engineer_awake_on_train(train):
    """Spread the sparse ``awake`` labels over every row of ``train``.

    Missing values are back-filled from the next labelled row; whatever is
    still missing afterwards (the rows after the last label) is set to 1, and
    the column is cast to an integer dtype.

    The result is assigned back to the column rather than using
    ``inplace=True`` on ``train["awake"]``: in-place methods on a column
    selection do not update the parent frame when pandas Copy-on-Write is
    active.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with an ``awake`` column that may contain NaN.

    Returns
    -------
    pandas.DataFrame
        The same object with ``awake`` fully populated.
    """
    # Trailing rows: per the original author's check
    #   train_events.groupby('series_id').tail(1)["event"].unique()
    # the last event of a series is always a "wakeup", so the rows after it
    # are treated as awake (1).
    train["awake"] = train["awake"].bfill().fillna(1).astype("int")

    return train

def engineer_wearable_on_on_train(train):
    """Fill the gaps in ``wearable_on`` from the neighbouring labelled rows.

    Missing values are first back-filled from the next labelled row; rows
    after the last label are then forward-filled from the previous one. A
    column without any label stays entirely NaN.

    The result is assigned back to the column rather than using
    ``inplace=True`` on ``train["wearable_on"]``: in-place methods on a column
    selection do not update the parent frame when pandas Copy-on-Write is
    active.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``wearable_on`` column that may contain NaN.

    Returns
    -------
    pandas.DataFrame
        The same object with ``wearable_on`` filled.
    """
    train["wearable_on"] = train["wearable_on"].bfill().ffill()

    return train

def split_timestamp_on_train(train):
    """Replace the string ``timestamp`` column with numeric date/time parts.

    The last five characters of each timestamp are cut off, the remaining
    string is sliced at fixed positions counted from its end, and every slice
    is converted with ``pd.to_numeric``. The new columns are appended in the
    order hour, minute, seconds, day, month, year; ``timestamp`` itself is
    removed. The frame is modified in place and returned.
    """
    # Presumably the removed suffix is a UTC offset such as "-0400" -- TODO confirm.
    trimmed = train['timestamp'].str[:-5]
    train['timestamp'] = trimmed

    # Column name -> (start, stop) of the slice taken from the trimmed string.
    # Dict order is the order in which the columns are added to the frame.
    # NOTE(review): if the trimmed string is 19 characters long
    # ("YYYY-MM-DDTHH:MM:SS"), the start of the year slice (-20) lies before
    # the beginning of the string and is clamped to it -- confirm the format.
    part_slices = {
        'hour': (-8, -6),
        'minute': (-5, -3),
        'seconds': (-2, None),
        'day': (-11, -9),
        'month': (-14, -12),
        'year': (-20, -15),
    }
    for column, (start, stop) in part_slices.items():
        train[column] = pd.to_numeric(trimmed.str[start:stop])

    train.drop(columns='timestamp', inplace=True)

    return train

def engineer_anglez_features(train, periods):
    """Build the derived feature columns for the ``anglez`` column.

    Delegates to ``engineer_sensor_features`` with the column name fixed to
    'anglez' and returns whatever that call returns.
    """
    sensor_column = 'anglez'
    return engineer_sensor_features(train, periods, sensor_column)

def engineer_enmo_features(train, periods):
    """Build the derived feature columns for the ``enmo`` column.

    Delegates to ``engineer_sensor_features`` with the column name fixed to
    'enmo' and returns whatever that call returns.
    """
    sensor_column = 'enmo'
    return engineer_sensor_features(train, periods, sensor_column)

def engineer_sensor_features(train, periods, feature_name):
    """Add the absolute value and the per-window features of one sensor column.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing the column ``feature_name``.
    periods : iterable of int
        Window sizes; ``engineer_sensor_periods_features`` is called once per
        entry, in order.
    feature_name : str
        Name of the sensor column to derive features from.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last per-window call (``train`` itself when
        ``periods`` is empty), including the float32 ``{feature_name}_abs``
        column.
    """
    absolute = train[feature_name].abs()
    train[f"{feature_name}_abs"] = absolute.astype("float32")

    for window_size in periods:
        train = engineer_sensor_periods_features(train, window_size, feature_name)

    return train

def _add_rolling_features(train, source, prefix, periods):
    """Add centred rolling statistics of ``source`` to ``train``.

    One float32 column named ``{prefix}_rolling_{stat}_{periods}`` is added per
    statistic, in this order: mean, sum, max, min, std, median, variance,
    25th_percentile, 75th_percentile. The NaN edges left by the centred window
    are back-filled, then forward-filled.
    """
    window = source.rolling(periods, center=True)
    aggregations = (
        ("mean", window.mean),
        ("sum", window.sum),
        ("max", window.max),
        ("min", window.min),
        ("std", window.std),
        ("median", window.median),
        ("variance", window.var),
        ("25th_percentile", lambda: window.quantile(0.25)),
        ("75th_percentile", lambda: window.quantile(0.75)),
    )
    for label, compute in aggregations:
        # ``.bfill().ffill()`` replaces the deprecated ``fillna(method=...)``.
        train[f"{prefix}_rolling_{label}_{periods}"] = (
            compute().bfill().ffill().astype("float32")
        )


def engineer_sensor_periods_features(train, periods, feature_name):
    """Add rolling and lagged-difference features for one window size.

    Three groups of float32 columns are appended to ``train``, in this order:

    1. centred rolling statistics of ``train[feature_name]``
       (``{feature_name}_rolling_{stat}_{periods}``);
    2. ``{feature_name}_diff_{periods}``: the difference to the value
       ``periods`` rows earlier, with the leading NaNs back-filled;
    3. the same rolling statistics computed on that difference column
       (``{feature_name}_diff_rolling_{stat}_{periods}``).

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing the column ``feature_name``; modified in place.
    periods : int
        Window size for the rolling statistics and lag for the difference.
    feature_name : str
        Name of the source column.

    Returns
    -------
    pandas.DataFrame
        The same object with the new columns added.
    """
    _add_rolling_features(train, train[feature_name], feature_name, periods)

    diff_column = f"{feature_name}_diff_{periods}"
    # Only a back-fill here: ``diff`` leaves NaN in the first ``periods`` rows.
    train[diff_column] = train[feature_name].diff(periods=periods).bfill().astype("float32")

    # The rolling statistics of the difference use the stored float32 column.
    _add_rolling_features(train, train[diff_column], f"{feature_name}_diff", periods)

    return train

def get_train_series(
    series,
    train_series_path="C:/Users/jonas.hodel/Downloads/train_series/train_series.parquet",
    train_events_path="C:/Users/jonas.hodel/Downloads/child-mind-institute-detect-sleep-states/train_events.csv",
):
    """Load one series and return it with labels and engineered features.

    Parameters
    ----------
    series : str
        Value of ``series_id`` to load.
    train_series_path : str, optional
        Parquet file with the sensor readings. Defaults to the location that
        used to be hard-coded, so existing callers are unaffected.
    train_events_path : str, optional
        CSV file with the labelled events. Same remark on the default.

    Returns
    -------
    pandas.DataFrame
        The readings of ``series`` left-joined with the events on ``step``,
        with ``awake`` / ``wearable_on`` filled for every row, the timestamp
        split into numeric parts and the ``anglez`` / ``enmo`` features for
        windows of 12 and 60 rows.
    """
    train_series = pd.read_parquet(train_series_path, filters=[('series_id', '=', series)])

    all_events = pd.read_csv(train_events_path)
    # Explicit copy: the helpers below add and overwrite columns on this frame.
    train_events = all_events[all_events['series_id'] == series].copy()

    # Order matters: ``wearable_on`` is derived from the rows whose step is
    # still missing, so it has to run before the missing steps are filled in.
    train_events = engineer_wearable_on_on_train_events(train_events)
    train_events = fill_step_on_train_events_when_nan(train_events)
    train_events = engineer_awake_on_train_events(train_events)

    train = pd.merge(
        train_series,
        train_events[['step', 'awake', 'wearable_on']],
        on='step',
        how='left',
    )

    train = engineer_awake_on_train(train)
    train = engineer_wearable_on_on_train(train)
    train = split_timestamp_on_train(train)

    train = engineer_anglez_features(train, [12, 60])
    train = engineer_enmo_features(train, [12, 60])

    return train


def methodFromJonas(batch_size, test_series_path):
    """Yield feature frames in batches of at least ``batch_size`` rows.

    The distinct ``series_id`` values are read from the CSV at
    ``test_series_path``. For each id ``get_train_series`` is called and its
    frames are accumulated; as soon as the accumulated row count reaches
    ``batch_size`` they are concatenated and yielded. Whatever is left after
    the last series is yielded as a final, smaller batch.

    Fixes compared with the previous version:

    * the accumulator was deleted after each ``yield``, so the next loop
      iteration failed on ``batch.empty``;
    * a trailing batch smaller than ``batch_size`` was silently dropped;
    * frames are concatenated once per batch instead of once per series.

    NOTE(review): ``get_train_series`` is called with the id only, so it reads
    from its own default file locations rather than from
    ``test_series_path`` -- confirm this is intended.

    Parameters
    ----------
    batch_size : int
        Minimum number of rows per yielded batch (except for the last one).
    test_series_path : str
        CSV file with a ``series_id`` column.

    Yields
    ------
    pandas.DataFrame
        Concatenation of whole series; the original row indexes are kept.
    """
    # Only the ids are needed here, so only that column is parsed.
    series_ids = pd.read_csv(test_series_path, usecols=['series_id'])['series_id'].unique()

    pending = []
    pending_rows = 0

    for series_id in series_ids:
        frame = get_train_series(series_id)
        if frame.empty:
            continue

        pending.append(frame)
        pending_rows += len(frame)

        if pending_rows >= batch_size:
            yield pd.concat(pending)
            pending = []
            pending_rows = 0

    if pending:
        yield pd.concat(pending)

In [None]:


# def batched_dataloader(batch_size=100_000):
#     dataset = ds.dataset(TEST_DATA)
#     for fragment in dataset.get_fragments():
#         batches = fragment.to_batches(batch_size=batch_size) # https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Fragment.html#pyarrow.dataset.Fragment.to_batches
#         for batch in batches:
#             yield batch.to_pandas()
#             del batch
#         del fragment

In [None]:
# Locations of the two pickled model files, relative to the working directory.
M1_PATH = 'models/model_1.pkl'
M2_PATH = 'models/model_2.pkl'

def load_model(name: str) -> RandomForestClassifier:
    """Unpickle and return the object stored in the file at ``name``.

    NOTE: unpickling can execute arbitrary code contained in the file, so
    only load model files from a trusted source.
    """
    with open(name, 'rb') as model_file:
        model = load(model_file)
    return model

# m1 produces 'pred_wearable' and m2 produces 'pred_awake' in the inference
# loop, so each has to come from its own file (m2 used to be loaded from
# M1_PATH, which made both names refer to the same model).
m1 = load_model(M1_PATH)
m2 = load_model(M2_PATH)

In [None]:
BATCH_SIZE = 1_000_000
TEST_DATA_PATH = ''
SUBMISSION_COLUMNS = ['row_id', 'series_id', 'step', 'event', 'score']
# Columns of a batch that are not passed to the models as features.
NON_FEATURE_COLUMNS = ['wearable_on', 'awake', 'series_id']


def transform_for_submission(predictions):
    """Turn per-step predictions into submission rows -- not implemented yet.

    ``predictions`` has the columns ``series_id``, ``step``, ``pred_wearable``
    and ``pred_awake``. The result is expected to be a DataFrame with the
    columns listed in ``SUBMISSION_COLUMNS``.

    The notebook previously passed a placeholder string to ``pd.concat`` at
    this point, which fails with a TypeError; the missing step is now
    reported explicitly.
    """
    raise NotImplementedError(
        "heuristics / transformation of predictions into submission rows is still missing"
    )


submission_df = pd.DataFrame(columns=SUBMISSION_COLUMNS)
submission_parts = []

for batch in methodFromJonas(BATCH_SIZE, TEST_DATA_PATH):
    # ``columns=`` is required: a bare list makes ``drop`` look for row labels.
    features = batch.drop(columns=NON_FEATURE_COLUMNS)

    # Keep the identifiers next to the predictions; the submission needs them.
    predictions = batch[['series_id', 'step']].copy()

    # NOTE(review): ``not_scaled`` is not a parameter of scikit-learn's
    # RandomForestClassifier.predict; it is kept on the assumption that the
    # pickled models are custom wrappers -- TODO confirm.
    predictions['pred_wearable'] = m1.predict(features, not_scaled=False)

    # Plain boolean array, so the filter does not depend on index alignment
    # (a batch keeps the row indexes of the series it was built from).
    worn = (predictions['pred_wearable'] == 1).to_numpy()
    if not worn.any():
        continue  # nothing in this batch to pass to the second model

    predictions = predictions[worn].copy()
    # NOTE(review): m2 now receives the same feature columns as m1; before,
    # the 'pred_wearable' column was part of its input -- confirm which set
    # of columns m2 was trained on.
    predictions['pred_awake'] = m2.predict(features[worn], not_scaled=False)

    # heuristics
    # transform for submission
    submission_parts.append(transform_for_submission(predictions))

if submission_parts:
    submission_df = pd.concat(submission_parts, ignore_index=True)


In [None]:
# Write the collected rows to disk without the index column, UTF-8 encoded
# and with "\n" line endings.
submission_df.to_csv(
    'submission.csv',
    index=False,
    encoding='utf-8',
    lineterminator='\n',
)