# 02 Feature engineering

In [None]:
import pandas as pd
import os
import shutil

In [None]:
# CSV with the event rows; loaded with pd.read_csv further down
path_events_csv = '../data/train_events.csv'
# parquet file with the sensor series; loaded with pd.read_parquet further down
path_sensor_parquet = '../data/train_series.parquet'
# folder the engineered parquet batches are written to and later split from
drop_path = '../data/engineered'

In [None]:
def fill_step_on_train_events_when_nan(train_events):
    """Fill missing ``step`` values with the preceding step plus one.

    The rows are walked from top to bottom. A row whose ``step`` is null gets
    the step of the row before it (which may itself have just been filled)
    plus one; the running value starts at 0, so a leading null becomes 1.
    Afterwards the column is cast to ``int``.

    The frame is modified in place and returned.
    """
    previous_step = 0
    completed_steps = []

    for current_step in train_events['step'].tolist():
        # a missing step continues counting from the last known one
        if pd.isnull(current_step):
            current_step = previous_step + 1

        completed_steps.append(current_step)
        previous_step = current_step

    # write the completed column back as integers
    train_events['step'] = pd.Series(completed_steps, index=train_events.index).astype('int')

    return train_events

In [None]:
def engineer_awake_on_train_events(train_events):
    """Derive the ``awake``, ``onset`` and ``wakeup`` columns from ``event``.

    * ``awake``  - 1 on an "onset" row, 0 on a "wakeup" row, NaN for any
      other / missing event value
    * ``onset``  - 1 on an "onset" row, 0 everywhere else
    * ``wakeup`` - 1 on a "wakeup" row, 0 everywhere else

    The frame is modified in place and returned.
    """
    event = train_events["event"]

    # set awake = 1 when onset event and awake = 0 when wakeup event
    train_events["awake"] = event.map({"onset": 1, "wakeup": 0})

    # indicator columns: comparing against the label turns missing events
    # into 0 directly (the former fillna(0) calls discarded their result,
    # so nulls were never actually replaced)
    train_events["onset"] = (event == "onset").astype("int")
    train_events["wakeup"] = (event == "wakeup").astype("int")

    return train_events

In [None]:
def engineer_wearable_on_on_train_events(train_events):
    """Add a ``wearable_on`` flag to the event frame.

    The flag is 1 for every event that has a ``step`` value and 0 for every
    event whose ``step`` is missing. The frame is modified in place and
    returned.
    """
    # an event without a step is flagged with 0, all others with 1
    step_is_present = train_events['step'].notna()
    train_events['wearable_on'] = step_is_present.astype('int64')

    return train_events

In [None]:
def engineer_awake_on_train(train):
    """Fill the gaps of the ``awake`` column and cast it to ``int``.

    Missing values are first back-filled, i.e. every row takes the value of
    the next row that has one. Rows after the last known value stay empty
    after that step and are set to 1. The frame is modified in place and
    returned.
    """
    # fill the null values in awake; the result is assigned back instead of
    # calling bfill/fillna with inplace=True on the column selection, which
    # is deprecated and has no effect once pandas Copy-on-Write is active
    awake = train["awake"].bfill().fillna(1)

    # set datatype for awake
    train["awake"] = awake.astype("int")

    return train

In [None]:
def engineer_wearable_on_on_train(train, window=720, std_threshold=0.05):
    """Recompute ``wearable_on`` for every row of the merged frame.

    A row is flagged ``True`` when either

    * the sparse ``wearable_on`` values equal 1 both when the gaps are
      filled backward-first and when they are filled forward-first, or
    * the mean of the two rolling standard deviations of ``enmo`` (one
      window ending at the row, one starting at it) exceeds
      ``std_threshold``.

    Parameters:
        train: frame with the columns ``wearable_on`` and ``enmo``.
        window: number of rows per rolling window (default 720).
            NOTE(review): the previous comments called this a 5 minute
            window - confirm against the sampling interval of the data.
        std_threshold: lower bound the mean rolling std has to exceed.

    Returns:
        The same frame; ``wearable_on`` is replaced by a boolean column.
        No helper columns are left on the frame.
    """
    # the two fill orders only differ inside the gaps between known values.
    # Results are kept in local series instead of calling bfill/ffill with
    # inplace=True on a column selection, which is deprecated and has no
    # effect once pandas Copy-on-Write is active.
    known_flags = train["wearable_on"]
    filled_backward_first = known_flags.bfill().ffill()
    filled_forward_first = known_flags.ffill().bfill()

    # rolling std over the rows up to and including the current one ...
    enmo = train["enmo"]
    std_window_ending_here = enmo.rolling(window).std()
    # ... and over the rows from the current one onwards: roll over the
    # reversed series and flip the result back into the original row order
    std_window_starting_here = enmo.iloc[::-1].rolling(window).std().iloc[::-1]

    # calculate the average std over both rolling windows (NaN near both
    # ends of the series, which compares as False below)
    mean_std = (std_window_ending_here + std_window_starting_here) / 2

    flagged_by_events = (filled_backward_first == 1) & (filled_forward_first == 1)
    train["wearable_on"] = flagged_by_events | (mean_std > std_threshold)

    return train

In [None]:
def split_timestamp_on_train(train):
    """Replace the ``timestamp`` column by its individual components.

    Adds ``hour``, ``minute``, ``seconds``, ``day``, ``month`` and ``year``
    taken from the ``timestamp`` column and removes that column. The index
    is then reset and the ``timestamp`` column created by the reset is
    removed as well, so the frame is expected to carry an index named
    ``timestamp``.

    Returns the frame produced by the index reset.
    """
    stamp = train['timestamp'].dt

    # split the timestamp into features (time of day first, then the date)
    components = (
        ('hour', stamp.hour),
        ('minute', stamp.minute),
        ('seconds', stamp.second),
        ('day', stamp.day),
        ('month', stamp.month),
        ('year', stamp.year),
    )
    for column_name, values in components:
        train[column_name] = values

    # drop timestamp
    train.drop(columns='timestamp', inplace=True)

    # the index comes back as a column called timestamp, remove it too
    train = train.reset_index()
    train.drop(columns='timestamp', inplace=True)

    return train

In [None]:
def engineer_anglez_features(train, periods):
    """Return ``train`` with the sensor features of the ``anglez`` column.

    Delegates to ``engineer_sensor_features`` with the given ``periods``.
    """
    return engineer_sensor_features(train, periods, feature_name='anglez')

In [None]:
def engineer_enmo_features(train, periods):
    """Return ``train`` with the sensor features of the ``enmo`` column.

    Delegates to ``engineer_sensor_features`` with the given ``periods``.
    """
    return engineer_sensor_features(train, periods, feature_name='enmo')

In [None]:
def engineer_sensor_features(train, periods, feature_name):
    """Add the absolute value and the per-period features of one column.

    Parameters:
        train: frame containing the column ``feature_name``.
        periods: iterable of window sizes; each one is handed to
            ``engineer_sensor_periods_features``.
        feature_name: name of the sensor column to derive features from.

    Returns:
        The frame returned by the last per-period call (or ``train`` itself
        when ``periods`` is empty), including ``<feature_name>_abs`` as
        float32.
    """
    # engineer absolute value feature
    magnitude = train[feature_name].abs()
    train[f"{feature_name}_abs"] = magnitude.astype("float32")

    # engineer rolling windows, one window size at a time
    for window_size in periods:
        train = engineer_sensor_periods_features(train, window_size, feature_name)

    return train

In [None]:
def _add_rolling_stats(train, source, prefix, window):
    """Store rolling mean, max and std of ``source`` on ``train`` as float32."""
    rolling = source.rolling(window, center=False)

    for stat_name in ("mean", "max", "std"):
        stat = getattr(rolling, stat_name)()
        # a trailing window leaves NaN in the first rows: take the first
        # valid value for those, then forward-fill whatever is still missing.
        # bfill()/ffill() replace fillna(method=...), which is deprecated.
        train[f"{prefix}_rolling_{stat_name}_{window}"] = stat.bfill().ffill().astype("float32")


def engineer_sensor_periods_features(train, periods, feature_name):
    """Add rolling and difference features of one column for one window size.

    Parameters:
        train: frame containing the column ``feature_name``.
        periods: a single window size in rows (despite the plural name); it
            is used both as rolling window and as distance of the difference.
        feature_name: name of the sensor column to derive features from.

    The following float32 columns are added, in this order:

    * ``<feature>_rolling_{mean,max,std}_<periods>``
    * ``<feature>_diff_<periods>`` - difference to the value ``periods``
      rows earlier, back-filled at the start
    * ``<feature>_diff_rolling_{mean,max,std}_<periods>`` - the same rolling
      statistics computed on the difference column

    Further statistics (sum, min, median, variance, quantiles) used to be
    present as commented-out code and are not computed.

    Returns:
        The same frame, modified in place.
    """
    # rolling statistics of the raw signal
    _add_rolling_stats(train, train[feature_name], feature_name, periods)

    # difference to the value `periods` rows earlier
    diff_column = f"{feature_name}_diff_{periods}"
    train[diff_column] = train[feature_name].diff(periods=periods).bfill().astype("float32")

    # rolling statistics of that difference
    _add_rolling_stats(train, train[diff_column], f"{feature_name}_diff", periods)

    return train

In [None]:
def bin_data(train):
    """Aggregate the rows of ``train`` into one-minute bins.

    The ``timestamp`` column of ``train`` is overwritten in place with parsed
    datetimes before the frame is resampled on it. Per bin, ``series_id``,
    ``step`` and ``timestamp`` keep their first value, ``anglez`` and ``enmo``
    are averaged and ``awake`` / ``wearable_on`` become the rounded mean of
    the bin. Bins that contain a missing value afterwards are dropped.

    Returns the aggregated frame.
    """
    # cut the last five characters off every timestamp string before parsing
    # (presumably a UTC offset such as "-0400" - TODO confirm against the data)
    train['timestamp'] = pd.to_datetime(train['timestamp'].str[:-5])

    # rounded mean of the values in a bin; a bin without values yields 1
    custom_agg_function = lambda x: round(sum(x) / len(x)) if len(x) > 0 else 1

    # bin the data into 1Min blocks
    binned_df = train.resample('1Min', on='timestamp').agg({
        'series_id': 'first',
        'step': 'first',
        'awake': custom_agg_function,
        'wearable_on': custom_agg_function,
        'anglez': 'mean',
        'enmo': 'mean',
        'timestamp': 'first'
    })

    # remove every bin that still has a missing value
    # (presumably the minutes without any rows - verify)
    binned_df.dropna(inplace=True)

    return binned_df

In [None]:
def shift_previous_and_next_nights(train, entries_per_night=1440):
    """Add lag/lead copies of every ``enmo`` / ``anglez`` column.

    For each column whose name contains ``enmo`` or ``anglez`` the value one
    and two nights in the past and in the future is added, together with its
    difference to the current value:

    * ``shift_<n>_d_past_<col>``   / ``shift_diff_<n>_d_past_<col>``
    * ``shift_<n>_d_future_<col>`` / ``shift_diff_<n>_d_future_<col>``

    with ``n`` in (1, 2). Rows that end up with a missing value (among them
    the first and last two nights of the frame) are dropped.

    Parameters:
        train: frame with the sensor columns, one row per time bin.
        entries_per_night: number of rows that make up one night/day.
            The default of 1440 equals the number of minutes per day.

    Returns:
        The same frame, modified in place.
    """
    # get all columns to compare with (collected up front, so the columns
    # added below are not shifted again)
    enmo_column_names = [col for col in train.columns if 'enmo' in col]
    anglez_column_names = [col for col in train.columns if 'anglez' in col]
    column_names = enmo_column_names + anglez_column_names

    for column_name in column_names:
        current = train[column_name]

        # a positive shift moves earlier rows down, i.e. looks into the past;
        # a negative shift looks into the future. The past features used a
        # negative shift before, which made them copies of the future ones.
        for direction, sign in (('past', 1), ('future', -1)):
            for nights in (1, 2):
                shifted = current.shift(sign * nights * entries_per_night)

                # create the new lag feature with its difference
                train[f'shift_{nights}_d_{direction}_{column_name}'] = shifted
                train[f'shift_diff_{nights}_d_{direction}_{column_name}'] = shifted - current

    train.dropna(inplace=True)

    return train

In [None]:
def get_train_series(series):
    """Load one series and return its engineered feature frame.

    The sensor rows of ``series`` are read from ``path_sensor_parquet`` and
    the event rows from ``path_events_csv``. The events are prepared
    (``wearable_on``, filled ``step``, ``awake``), left-joined onto the
    sensor rows on ``step`` and the result is run through the remaining
    engineering steps: awake / wearable_on filling, binning, timestamp
    splitting and the anglez / enmo features.
    """
    # sensor readings of the requested series only
    sensor_frame = pd.read_parquet(
        path_sensor_parquet,
        filters=[('series_id', '=', series)],
    )

    # events of the same series; wearable_on has to be derived before the
    # missing steps are filled, because it is based on step being null
    event_frame = (
        pd.read_csv(path_events_csv)
        .query('series_id == @series')
        .pipe(engineer_wearable_on_on_train_events)
        .pipe(fill_step_on_train_events_when_nan)
        .pipe(engineer_awake_on_train_events)
    )

    # attach the event information to the sensor rows
    event_columns = event_frame[['step', 'awake', 'wearable_on']]
    merged = sensor_frame.merge(event_columns, on='step', how='left')

    window_sizes = [5, 30, 120, 480]
    features = (
        merged
        .pipe(engineer_awake_on_train)
        .pipe(engineer_wearable_on_on_train)
        .pipe(bin_data)
        .pipe(split_timestamp_on_train)
        .pipe(engineer_anglez_features, window_sizes)
        .pipe(engineer_enmo_features, window_sizes)
    )

    # the night shift features are currently disabled:
    # features = shift_previous_and_next_nights(features)

    return features

## Execute feature engineering

In [None]:
# Engineer every series and write the result to numbered parquet batches
# (<drop_path>/0.parquet, 1.parquet, ...). A batch is written as soon as it
# holds at least `batch_size` rows.
batch_size = 1_000_000
train_events = pd.read_csv(path_events_csv)

series_ids = train_events['series_id'].unique()

# to_parquet does not create missing folders
os.makedirs(drop_path, exist_ok=True)

batch = pd.DataFrame([])

batch_count = 0
for series_count, series_id in enumerate(series_ids):
    print(f'{series_count} {series_id}')
    series_frame = get_train_series(series_id)

    if batch.empty:
        # first series of a new batch
        batch = series_frame
    else:
        batch = pd.concat([batch, series_frame])

        if len(batch) >= batch_size:
            batch.to_parquet(f'{drop_path}/{batch_count}.parquet')
            batch = pd.DataFrame([])
            batch_count += 1

# number of processed series, as the manual counter provided before
series_count = len(series_ids)

# write the remaining rows; skipped when the last series completed a batch,
# which would otherwise leave an empty parquet file behind
if not batch.empty:
    batch.to_parquet(f'{drop_path}/{batch_count}.parquet')

## Split into val and train

In [None]:
# Move the numbered parquet batches into a validation and a training folder:
# batches whose number is below the fence (20% of the batch count, rounded)
# go to <drop_path>/val, all others to <drop_path>/train.
val_dir = f'{drop_path}/val'
train_dir = f'{drop_path}/train'

# shutil.move fails when the target folder does not exist
os.makedirs(val_dir, exist_ok=True)
os.makedirs(train_dir, exist_ok=True)

# only the numbered batch files count towards the fence; the val/train
# folders themselves and any other entry are ignored
files = [
    name for name in os.listdir(drop_path)
    if name.endswith('.parquet') and name.removesuffix('.parquet').isdecimal()
]

val_fence = round(len(files) * 0.2)

for filename in files:
    batch_number = int(filename.removesuffix('.parquet'))
    target_dir = val_dir if batch_number < val_fence else train_dir
    shutil.move(f'{drop_path}/{filename}', f'{target_dir}/{filename}')