In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
import plotnine as p9


In [None]:
# Load the raw event log. Later cells read the columns id, event_id, down_time,
# up_time, action_time, activity, down_event, up_event, text_change,
# cursor_position and word_count from this frame.
X_train = pd.read_csv(filepath_or_buffer="./data/external/train_logs.csv")

In [None]:
# The log has no explicit record for a pause: pauses are inferred from the gap
# between one event's key-up and the next event's key-down within the same id.
X_train = (
    X_train
    .sort_values(["id", "event_id"], ascending=[True, True])
    # sort_values keeps the old row labels; reset them so that label and
    # position agree when later cells rebuild the frame from a positional
    # array and re-attach columns by label
    .reset_index(drop=True)
    )

# key-up time of the previous event of the same id (NaN for an id's first event)
X_train['up_time_lag1'] = (
    X_train
    .groupby(['id'])
    ['up_time']
    .shift(1)
    )

# expect some negative pause times -- interpret as, no real pause.
# Non-positive gaps, and the undefined gap before an id's first event, are NaN.
preceding_pause_time = X_train['down_time'] - X_train['up_time_lag1']
X_train['preceding_pause_time'] = preceding_pause_time.where(preceding_pause_time > 0)

# if pause exceeds threshold duration, a "burst" has ended
# (times are compared against a threshold expressed in milliseconds)
MS_PER_S = 1000
SECONDS_PER_BURST = 2

follows_long_pause = (
    X_train['preceding_pause_time'] > MS_PER_S * SECONDS_PER_BURST
    )
# The first event of every id opens that id's first burst. The previous
# chained assignment (`X_train[col][0] = 1`) only touched the row labelled 0,
# so a single id had its opening burst counted, and it does nothing at all
# when pandas copy-on-write is active.
is_first_event_of_id = X_train.groupby(['id']).cumcount() == 0

X_train['is_new_burst_start'] = (
    follows_long_pause | is_first_event_of_id
    ).astype(int)

# running count of burst starts within an id => burst number, starting at 1
X_train['burst_id'] = (
    X_train
    .groupby(['id'])
    ['is_new_burst_start']
    .cumsum()
    )

# burst extent: earliest key-down to latest key-up among the burst's events,
# broadcast back onto every event of the burst
X_train['burst_time_start'] = (
    X_train
    .groupby(['id', 'burst_id'])
    ['down_time']
    .transform('min')
    )
X_train['burst_time_end'] = (
    X_train
    .groupby(['id', 'burst_id'])
    ['up_time']
    .transform('max')
    )
X_train['burst_duration'] = X_train['burst_time_end'] - X_train['burst_time_start']

In [None]:
# Fixed-width time windows make time-sequence features possible.
TOTAL_MIN = 30
SECONDS_PER_MIN = 60
SECONDS_PER_WINDOW = 30

# np.arange excludes its stop value, so the stop is pushed a little past the
# 30-minute mark to keep the 30-minute edge itself among the bin edges.
window_stop_ms = TOTAL_MIN * SECONDS_PER_MIN * MS_PER_S + 5*MS_PER_S*2
window_width_ms = SECONDS_PER_WINDOW * MS_PER_S
window_edges_ms = np.arange(0, window_stop_ms, window_width_ms)

# Each event is labelled with the interval its key-down time falls into;
# key-down times outside the edges get a missing label.
X_train['window_30s'] = pd.cut(X_train['down_time'], bins=window_edges_ms)

In [None]:
# Summarise the pause distribution as counts per fixed-width bucket.
MS_IN_PAUSE_BUCKET_MAX = 200e3
PAUSE_BUCKET_STEP_MS = 500

pause_bucket_edges_ms = np.arange(0, MS_IN_PAUSE_BUCKET_MAX, PAUSE_BUCKET_STEP_MS)

X_train['preceding_pause_time_bucket'] = pd.cut(
    X_train['preceding_pause_time'],
    bins=pause_bucket_edges_ms
    )

# Cell output: number of events per pause bucket.
# WARNING: this bucketed view of the pause distribution is dense and large;
# a handful of fitted distribution parameters is far more succinct.
X_train['preceding_pause_time_bucket'].value_counts()


In [None]:
# One-hot encode `activity` into activity_<category> indicator columns while
# keeping every other column, then restore the original column dtypes.
ACTIVITY_CATEGORIES = ['Nonproduction', 'Input', 'Remove/Cut']

pipeline_activity_onehot = ColumnTransformer(
    transformers=[(
        'onehot_encode', 
        preprocessing.OneHotEncoder(
            categories=[ ACTIVITY_CATEGORIES ], 
            # NOTE(review): scikit-learn 1.2 deprecates `sparse` in favour of
            # `sparse_output` (removed in 1.4) -- switch once the environment
            # is pinned to >= 1.2
            sparse=False, 
            # activities outside ACTIVITY_CATEGORIES do not raise
            handle_unknown='infrequent_if_exist'
            ),
        ["activity"]
    )],
    remainder='passthrough',
    verbose_feature_names_out=False
    )
pipeline_activity_onehot.fit(X_train)
# the encoder consumes `activity`; keep the raw column to re-attach afterwards
original_categorical = X_train['activity']

X_train_dtypes = X_train.dtypes.to_dict()
# transform() hands back a bare array. Carry the row labels over explicitly:
# a fresh 0..n-1 index would make the label-aligned concat below pair rows
# with the wrong `activity` whenever the incoming index is not already 0..n-1.
X_train_index = X_train.index
X_train = pipeline_activity_onehot.transform(X_train)
X_train = pd.DataFrame(
    X_train,
    index=X_train_index,
    columns=pipeline_activity_onehot.get_feature_names_out()
    )
X_train = pd.concat([X_train, original_categorical], axis=1)
# Restore the pre-transform dtypes and make the new indicator columns numeric;
# they would otherwise stay as generic object columns.
X_train = X_train.astype({
    **X_train_dtypes,
    **{'activity_' + x: float for x in ACTIVITY_CATEGORIES}
    })

In [None]:
# For every activity category: total action_time spent on that activity within
# each (id, burst_id), broadcast back onto every event of the burst.
for activity in ACTIVITY_CATEGORIES:

    activity_action_time = X_train['activity_' + activity] * X_train['action_time']

    X_train['burst_action_time_' + activity] = (
        X_train
        .assign(activity_x_event_time=activity_action_time)
        .groupby(['id', 'burst_id'])
        ['activity_x_event_time']
        .transform('sum')
        .astype(float)
        )

# A burst's type is the activity with the largest action-time total in it:
# take the name of the winning column, then strip the column prefix.
burst_action_time_columns = [
    'burst_action_time_' + activity for activity in ACTIVITY_CATEGORIES
    ]
X_train['burst_type'] = (
    X_train[burst_action_time_columns]
    .idxmax(axis=1)
    .str
    .replace("burst_action_time_", "", regex=True)
    )

In [None]:
# One-hot encode `burst_type` into burst_type_<category> indicator columns
# while keeping every other column, then restore the original column dtypes.
ACTIVITY_CATEGORIES = ['Nonproduction', 'Input', 'Remove/Cut']

pipeline_burst_type_onehot = ColumnTransformer(
    transformers=[(
        'onehot_encode', 
        preprocessing.OneHotEncoder(
            categories=[ ACTIVITY_CATEGORIES ], 
            # NOTE(review): scikit-learn 1.2 deprecates `sparse` in favour of
            # `sparse_output` (removed in 1.4) -- switch once the environment
            # is pinned to >= 1.2
            sparse=False, 
            # burst types outside ACTIVITY_CATEGORIES do not raise
            handle_unknown='infrequent_if_exist'
            ),
        ["burst_type"]
    )],
    remainder='passthrough',
    verbose_feature_names_out=False
    )
pipeline_burst_type_onehot.fit(X_train)
# the encoder consumes `burst_type`; keep the raw column to re-attach afterwards
original_categorical = X_train['burst_type']

X_train_dtypes = X_train.dtypes.to_dict()
# transform() hands back a bare array. Carry the row labels over explicitly:
# a fresh 0..n-1 index would make the label-aligned concat below pair rows
# with the wrong `burst_type` whenever the incoming index is not already 0..n-1.
X_train_index = X_train.index
X_train = pipeline_burst_type_onehot.transform(X_train)
X_train = pd.DataFrame(
    X_train,
    index=X_train_index,
    columns=pipeline_burst_type_onehot.get_feature_names_out()
    )
X_train = pd.concat([X_train, original_categorical], axis=1)
# Restore the pre-transform dtypes and make the new indicator columns numeric;
# they would otherwise stay as generic object columns.
X_train = X_train.astype({
    **X_train_dtypes,
    **{'burst_type_' + x: float for x in ACTIVITY_CATEGORIES}
    })

In [None]:
# Split the burst-start flag by burst type: the product is non-zero only on
# events that start a burst AND whose burst is of the given type.
for activity in ACTIVITY_CATEGORIES:

    burst_start_flag = X_train['is_new_burst_start']
    burst_type_flag = X_train['burst_type_' + activity]

    X_train['is_new_burst_start_' + activity] = burst_start_flag * burst_type_flag

In [None]:
# Keep an explicit, ordered set of columns; groups are concatenated in the
# order they should appear in the frame.
key_columns = ["id", "event_id", "window_30s"]

burst_columns = [
    "burst_id",
    "burst_type",
    "burst_type_Nonproduction",
    "burst_type_Input",
    "burst_type_Remove/Cut",
    "is_new_burst_start",
    "is_new_burst_start_Nonproduction",
    "is_new_burst_start_Input",
    "is_new_burst_start_Remove/Cut",
    "burst_time_start",
    "burst_time_end",
    "burst_duration",
    ]

event_columns = [
    "down_time",
    "up_time",
    "action_time",
    "activity",
    "activity_Nonproduction",
    "activity_Input",
    "activity_Remove/Cut",
    "down_event",
    "up_event",
    "text_change",
    "cursor_position",
    "word_count",
    ]

pause_columns = [
    "up_time_lag1",
    "preceding_pause_time",
    "preceding_pause_time_bucket",
    ]

burst_action_time_columns = [
    "burst_action_time_Nonproduction",
    "burst_action_time_Input",
    "burst_action_time_Remove/Cut",
    ]

X_train = X_train[
    key_columns
    + burst_columns
    + event_columns
    + pause_columns
    + burst_action_time_columns
    ]


In [None]:
# Per-id totals over the whole session: number of events per activity, number
# of burst starts, and number of burst starts per burst type.
X_train_marginals_sum_wrt_time = (
    X_train
    .groupby('id')
    [
        ['activity_' + x for x in ACTIVITY_CATEGORIES] 
        + ['is_new_burst_start'] 
        + ['is_new_burst_start_' + x for x in ACTIVITY_CATEGORIES]
    ]
    # string alias instead of the builtin `sum`: same pandas reduction,
    # without relying on the deprecated callable-to-method substitution
    .agg('sum')
    # same cast as the per-window aggregation, so the ratio below is plain
    # float arithmetic (inf / NaN on a zero denominator)
    .astype(float)
    )
# removals per insertion
X_train_marginals_sum_wrt_time['delete_insert_ratio'] = (
    X_train_marginals_sum_wrt_time['activity_Remove/Cut'] / 
    X_train_marginals_sum_wrt_time['activity_Input'] 
    )

In [None]:
# Per-id central tendency of pause length and burst duration.
# burst_duration is repeated on every event of a burst, so these statistics
# are taken over events, not over distinct bursts.
X_train_marginals_central_tendency_wrt_time = (
    X_train
    .groupby('id')
    .agg(
        # string aliases instead of np.median: same pandas reduction, without
        # relying on the deprecated callable-to-method substitution
        pause_time_p50 = ('preceding_pause_time', 'median'),
        burst_duration_mean = ('burst_duration', 'mean'),
        burst_duration_p50 = ('burst_duration', 'median')
        )
    )

In [None]:
def _quantile_99(values):
    """Return the 0.99 quantile of a Series."""
    return values.quantile(0.99)


# Per-id extremes: longest pause, 99th-percentile pause, longest burst, and
# the latest key-up time (used as the total session time).
X_train_marginals_extremes_wrt_time = (
    X_train
    .groupby('id')
    .agg(
        pause_time_max=('preceding_pause_time', 'max'),
        # stands in for "the next-longest pause after the first long
        # planning pause"
        pause_time_p99=('preceding_pause_time', _quantile_99),
        burst_duration_max=('burst_duration', 'max'),
        total_time=('up_time', 'max')
        )
    )

In [None]:
from scipy.stats import lognorm

# Summarise each id's pause distribution by the three parameters of a fitted
# log-normal (shape, location, scale), one row per id.
PAUSE_LOGNORM_COLUMNS = [
    'pauses_lognorm_shape',
    'pauses_lognorm_location',
    'pauses_lognorm_scale'
    ]

pause_distr_subject_ids = []
pause_distr_params = []

# iterate the groups lazily instead of first copying every group into a list
for subject_id, X_train_subject in X_train.groupby('id'):

    pauses = X_train_subject['preceding_pause_time'].dropna()

    if pauses.empty:
        # nothing to fit for an id with no recorded pause; leave the
        # parameters missing rather than aborting the whole loop
        shape, location, scale = np.nan, np.nan, np.nan
    else:
        shape, location, scale = lognorm.fit(pauses)

    pause_distr_subject_ids.append(subject_id)
    pause_distr_params.append((shape, location, scale))

# build the frame once from plain records (no per-id one-row frames + concat)
X_train_marginals_distr_params_wrt_time = pd.DataFrame(
    pause_distr_params,
    index=pause_distr_subject_ids,
    columns=PAUSE_LOGNORM_COLUMNS
    )

In [None]:
# Assemble the per-id feature table: start from the sums and left-join each
# remaining per-id table on the index, in order.
X_train_marginals_wrt_time = X_train_marginals_sum_wrt_time

for marginals_to_add in (
    X_train_marginals_central_tendency_wrt_time,
    X_train_marginals_extremes_wrt_time,
    X_train_marginals_distr_params_wrt_time,
    ):
    X_train_marginals_wrt_time = pd.merge(
        X_train_marginals_wrt_time,
        marginals_to_add,
        how='left',
        left_index=True,
        right_index=True
        )

# writing speed: input + removal events per unit of total session time
text_changing_events = (
    X_train_marginals_wrt_time['activity_Input']
    + X_train_marginals_wrt_time['activity_Remove/Cut']
    )
X_train_marginals_wrt_time['writing_speed'] = (
    text_changing_events / X_train_marginals_wrt_time['total_time']
    )

In [None]:
# Cell output: first rows of the per-id feature table.
X_train_marginals_wrt_time.head(n=5)

In [None]:
# Cell output: number of missing values per feature column.
X_train_marginals_wrt_time.isna().sum()

In [None]:
# Same event/burst-start counts as the per-id sums, but per (id, 30s window).
X_train_by_window = (
    X_train
    # `window_30s` is categorical: pin observed=False so every window is
    # reported for every id (empty ones sum to 0) regardless of the pandas
    # default, which is scheduled to change
    .groupby(['id', 'window_30s'], observed=False)
    [
        ['activity_' + x for x in ACTIVITY_CATEGORIES] 
        + ['is_new_burst_start'] 
        + ['is_new_burst_start_' + x for x in ACTIVITY_CATEGORIES]
    ]
    # string alias instead of the builtin `sum`: same pandas reduction,
    # without relying on the deprecated callable-to-method substitution
    .agg('sum')
    .astype(float)
    .reset_index(drop=False)
    )

# removals per insertion within the window (NaN / inf when there are no inputs)
X_train_by_window['delete_insert_ratio'] = (
    X_train_by_window['activity_Remove/Cut'] / 
    X_train_by_window['activity_Input'] 
    )

# NOTE(review): this is the row number across ALL ids after reset_index, not
# the position of the window within its id -- confirm that is intended.
X_train_by_window['window_30s_idx'] = X_train_by_window.index

In [None]:
# Per-id variability across windows: standard deviation of every per-window
# count/ratio, with "_stddev" appended to each column name.
X_train_windows_variation = (
    X_train_by_window
    .drop(columns=['window_30s', 'window_30s_idx'])
    .groupby(['id'])
    # 'std' is what pandas already substitutes for np.std here (sample
    # standard deviation); naming it explicitly keeps the result stable once
    # callables are no longer substituted
    .agg('std')
    .add_suffix("_stddev")
    )

In [None]:
# Cell output: first rows of the per-id window-variability table.
X_train_windows_variation.head(n=5)

In [None]:
# Final feature table: whole-session features left-joined, on the id index,
# with the across-window variability features.
X_train_transform = pd.merge(
    left=X_train_marginals_wrt_time,
    right=X_train_windows_variation,
    left_index=True,
    right_index=True,
    how='left'
    )

In [None]:
# Cell output: the merged feature table, indexed by id.
X_train_transform

In [None]:
# Cell output: fraction of missing values per feature column.
X_train_transform.isna().mean()

In [None]:
# Persist the feature table, leaving out the across-window stddev of the
# delete/insert ratio.
X_train_transform.drop(
    columns=['delete_insert_ratio_stddev']
    ).to_pickle("./data/processed/X_train.pkl")