In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
import plotnine as p9


In [2]:
X_train = pd.read_csv("./data/external/train_logs.csv")

X_train = (
    X_train
    .sort_values(["id", "event_id"], ascending=[True, True])
    )

In [3]:
# no explicit record for a pause. pauses are omitted.
PAUSE_THRESHOLD_MS = 1000

X_train['up_time_lag1'] = (
    X_train
    .groupby(['id'])
    ['up_time']
    .shift(1)
    )
X_train['preceding_pause_time'] = (
    X_train['down_time'] - X_train['up_time_lag1']
    )
# expect some negative pause times -- interpret as, no real pause
has_no_real_pause = X_train['preceding_pause_time'] <= PAUSE_THRESHOLD_MS
X_train.loc[has_no_real_pause, 'preceding_pause_time'] = None

In [4]:
# if pause exceeds threshold duration, a "burst" has ended
MS_PER_S = 1000
SECONDS_PER_BURST = 2

X_train['is_new_burst_start'] = (
    X_train['preceding_pause_time'] > MS_PER_S * SECONDS_PER_BURST
    ).astype(int)
X_train['is_new_burst_start'][0] = 1
X_train['burst_id'] = (
    X_train
    .groupby(['id'])
    ['is_new_burst_start']
    .cumsum()
    )
X_train['burst_time_start'] = (
    X_train
    .groupby(['id', 'burst_id'])
    ['down_time']
    .transform('min')
    )
X_train['burst_time_end'] = (
    X_train
    .groupby(['id', 'burst_id'])
    ['up_time']
    .transform('max')
    )
X_train['burst_duration'] = X_train['burst_time_end'] - X_train['burst_time_start']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
# word count offers a productivity measure
X_train['word_count_lag1'] = (
    X_train
    .groupby(['id'])
    ['word_count']
    .shift(1)
    )

X_train['word_count_delta_event'] = (
    X_train['word_count'] - X_train['word_count_lag1']
    )

X_train['word_count_delta_burst'] = (
    X_train
    .groupby(['id', 'burst_id'])
    ['word_count_delta_event']
    .transform('sum')
    )

In [6]:
# one-way cursor movement might be most productive
# jumping around is choppy
X_train['cursor_position_lag1'] = (
    X_train
    .groupby(['id'])
    ['cursor_position']
    .shift(1)
    )

X_train['has_cursor_position_moved_right'] = (
    X_train['cursor_position'] > X_train['cursor_position_lag1']
    ).astype(int)

# farthest position cursor has _edited_, with recorded input
X_train['cursor_position_cummax'] = (
    X_train
    .groupby(['id'])
    ['cursor_position']
    .cummax()
    )
X_train.loc[X_train['activity'] != 'Input', 'cursor_position_cummax'] = None
X_train['cursor_position_cummax'] = (
    X_train
    .groupby(['id'])
    ['cursor_position_cummax']
    .ffill()
    )

X_train['cursor_position_vs_max'] = (
    X_train['cursor_position'] - X_train['cursor_position_cummax']
    )

In [7]:
# if thoughts aren't separated by punctuation, writing won't score well
X_train['is_thought_delimiting_punctuation'] = (
    (X_train['text_change'] == ".")
    | (X_train['text_change'] == ",")
    | (X_train['text_change'] == "-")
    | (X_train['text_change'] == "!")
    | (X_train['text_change'] == ";")
    | (X_train['text_change'] == "?")
    ).astype(int)

In [8]:
# windows allow for time-sequence features
TOTAL_MIN = 30
SECONDS_PER_MIN = 60
SECONDS_PER_WINDOW = 30

X_train['window_30s'] = pd.cut(
    X_train['down_time'],
    bins=np.arange(
        0, 
        TOTAL_MIN * SECONDS_PER_MIN * MS_PER_S + 5*MS_PER_S*2, 
        SECONDS_PER_WINDOW * MS_PER_S
        )
    )

In [9]:
# summarize pause distr
MS_IN_PAUSE_BUCKET_MAX = 200e3
PAUSE_BUCKET_STEP_MS = 500

X_train['preceding_pause_time_bucket'] = pd.cut(
    X_train['preceding_pause_time'],
    bins=np.arange(
        0, 
        MS_IN_PAUSE_BUCKET_MAX,
        PAUSE_BUCKET_STEP_MS
        )
    )

X_train['preceding_pause_time_bucket'].value_counts()

# WARNING: this representation of pause distribution is dense & large
# a few parameters from distribution model far more succinct


preceding_pause_time_bucket
(1000.0, 1500.0]        126117
(1500.0, 2000.0]         62487
(2000.0, 2500.0]         37948
(2500.0, 3000.0]         25823
(3000.0, 3500.0]         18885
                         ...  
(194000.0, 194500.0]         0
(184500.0, 185000.0]         0
(175000.0, 175500.0]         0
(500.0, 1000.0]              0
(0.0, 500.0]                 0
Name: count, Length: 399, dtype: int64

In [10]:
ACTIVITY_CATEGORIES = ['Nonproduction', 'Input', 'Remove/Cut', 'Replace', 'Paste']

pipeline_activity_onehot = ColumnTransformer(
    transformers=[(
        'onehot_encode', 
        preprocessing.OneHotEncoder(
            categories=[ ACTIVITY_CATEGORIES ], 
            sparse=False, 
            handle_unknown='infrequent_if_exist'
            ),
        ["activity"]
    )],
    remainder='passthrough',
    verbose_feature_names_out=False
    )
pipeline_activity_onehot.fit(X_train)
original_categorical = X_train['activity']

X_train_dtypes = X_train.dtypes.to_dict()
X_train = pipeline_activity_onehot.transform(X_train)
X_train = pd.DataFrame(X_train, columns=pipeline_activity_onehot.get_feature_names_out())
X_train = pd.concat([X_train, original_categorical], axis=1)
X_train = X_train.astype(X_train_dtypes)



In [11]:
for activity in ACTIVITY_CATEGORIES:

    X_train['burst_action_time_' + activity] = (
        X_train
        .assign(activity_x_event_time = lambda x: x['activity_' + activity] * x.action_time)
        .groupby(['id', 'burst_id'])
        ['activity_x_event_time']
        .transform('sum')
        ).astype(float)
    
X_train['burst_type'] = (
    X_train
    [['burst_action_time_' + activity for activity in ACTIVITY_CATEGORIES]]
    .idxmax(axis=1)
    )
X_train['burst_type'] = (
    X_train['burst_type']
    .str
    .replace("burst_action_time_", "", regex=True)
    )

In [12]:
pipeline_burst_type_onehot = ColumnTransformer(
    transformers=[(
        'onehot_encode', 
        preprocessing.OneHotEncoder(
            categories=[ ACTIVITY_CATEGORIES ], 
            sparse=False, 
            handle_unknown='infrequent_if_exist'
            ),
        ["burst_type"]
    )],
    remainder='passthrough',
    verbose_feature_names_out=False
    )
pipeline_burst_type_onehot.fit(X_train)
original_categorical = X_train['burst_type']

X_train_dtypes = X_train.dtypes.to_dict()
X_train = pipeline_burst_type_onehot.transform(X_train)
X_train = pd.DataFrame(X_train, columns=pipeline_burst_type_onehot.get_feature_names_out())
X_train = pd.concat([X_train, original_categorical], axis=1)
X_train = X_train.astype(X_train_dtypes)



In [13]:
for activity in ACTIVITY_CATEGORIES:

    X_train['is_new_burst_start_' + activity] = (
        X_train['is_new_burst_start'] * 
        X_train['burst_type_' + activity]
        )

In [14]:
X_train = X_train[[
    "id",
    "event_id",
    "window_30s",
    "burst_id",
    "burst_type",
    "burst_type_Nonproduction",
    "burst_type_Input",
    "burst_type_Remove/Cut",
    "burst_type_Replace",
    "burst_type_Paste",
    "is_new_burst_start",
    "is_new_burst_start_Nonproduction",
    "is_new_burst_start_Input",
    "is_new_burst_start_Remove/Cut",
    "is_new_burst_start_Replace",
    "is_new_burst_start_Paste",
    "burst_time_start",
    "burst_time_end",
    "burst_duration",
    "word_count_delta_burst",

    "down_time",
    "up_time",	
    "action_time",	
    "activity",	
    "activity_Nonproduction",
    "activity_Input",
    "activity_Remove/Cut",
    "activity_Replace",
    "activity_Paste",
    "down_event",	
    "up_event",	
    "text_change",
    "is_thought_delimiting_punctuation",
    "cursor_position",	
    "word_count",

    "cursor_position_vs_max",
    "cursor_position_cummax",
    "has_cursor_position_moved_right",

    "word_count_lag1",
    "word_count_delta_event",

    "up_time_lag1",
    "preceding_pause_time",
    "preceding_pause_time_bucket",

    "burst_action_time_Nonproduction",
    "burst_action_time_Input",
    "burst_action_time_Remove/Cut",
    "burst_action_time_Replace",
    "burst_action_time_Paste"
    ]]


In [15]:
vars_sum = (
    ['activity_' + x for x in ACTIVITY_CATEGORIES] 
    + ['is_new_burst_start'] 
    + ['is_new_burst_start_' + x for x in ACTIVITY_CATEGORIES]
    + ['word_count_delta_event']
    + ["is_thought_delimiting_punctuation"]
    + ["preceding_pause_time"]
    )

X_train_marginals_sum_wrt_time = (
    X_train
    .groupby('id')
    [vars_sum]
    .agg(sum)
    )
X_train_marginals_sum_wrt_time['delete_insert_ratio'] = (
    X_train_marginals_sum_wrt_time['activity_Remove/Cut'] / 
    X_train_marginals_sum_wrt_time['activity_Input'] 
    )

In [16]:
X_train_marginals_central_tendency_wrt_time = (
    X_train
    .groupby('id')
    .agg(
        pause_time_p50 = ('preceding_pause_time', np.median),
        has_cursor_position_moved_right_mean = ('has_cursor_position_moved_right', 'mean'),
        burst_duration_mean = ('burst_duration', 'mean'),
        burst_duration_p50 = ('burst_duration', np.median),
        word_count_delta_burst_p50 = ('word_count_delta_burst', np.median),
        cursor_position_vs_max_avg = ('cursor_position_vs_max', 'mean')
        )
    )

In [17]:
X_train_marginals_extremes_wrt_time = (
    X_train
    .groupby('id')
    .agg(
        pause_time_max=('preceding_pause_time', 'max'),
        # approximation to, next longest pause after first long planning pause
        pause_time_p99=('preceding_pause_time', lambda x: x.quantile(0.99)),
        burst_duration_max=('burst_duration', 'max'),
        total_time=('up_time', 'max')
        )
    )

In [18]:
from scipy.stats import lognorm

pause_distr_summary_subjects = []

for X_train_subject in [x for _, x in X_train.groupby('id')]:

    shape, location, scale = lognorm.fit(X_train_subject['preceding_pause_time'].dropna())

    pause_distr_summary = pd.DataFrame({
        'pauses_lognorm_shape': [shape], 
        'pauses_lognorm_location': [location],
        'pauses_lognorm_scale': [scale]
        })
    pause_distr_summary.index = [X_train_subject['id'].iloc[0]]
    
    pause_distr_summary_subjects.append(pause_distr_summary)

X_train_marginals_distr_params_wrt_time = pd.concat(pause_distr_summary_subjects, axis=0)

In [19]:
X_train_marginals_wrt_time = pd.merge(
    X_train_marginals_sum_wrt_time, 
    X_train_marginals_central_tendency_wrt_time,
    how='left',
    left_index=True,
    right_index=True
    )

X_train_marginals_wrt_time = pd.merge(
    X_train_marginals_wrt_time, 
    X_train_marginals_extremes_wrt_time,
    how='left',
    left_index=True,
    right_index=True
    )

X_train_marginals_wrt_time = pd.merge(
    X_train_marginals_wrt_time, 
    X_train_marginals_distr_params_wrt_time,
    how='left',
    left_index=True,
    right_index=True
    )

In [20]:
for var in vars_sum:

    if var == 'preceding_pause_time':
        var_out = 'pause_time_fraction'
    else:
        var_out = var + '_per_s'

    X_train_marginals_wrt_time[var_out] = (
        (X_train_marginals_wrt_time[var] / X_train_marginals_wrt_time['total_time'])
        )
    
    if 'per_s' in var_out:
        X_train_marginals_wrt_time[var_out] *= 1000

X_train_marginals_wrt_time = (
    X_train_marginals_wrt_time
    .assign(
        keystroke_speed = lambda x: (x.activity_Input + x['activity_Remove/Cut']) / x.total_time,
        words_per_thought_delimiting_punctuation = lambda x: x.word_count_delta_event / x.is_thought_delimiting_punctuation,
        )
    )

In [21]:
X_train_marginals_wrt_time.head()

Unnamed: 0_level_0,activity_Nonproduction,activity_Input,activity_Remove/Cut,activity_Replace,activity_Paste,is_new_burst_start,is_new_burst_start_Nonproduction,is_new_burst_start_Input,is_new_burst_start_Remove/Cut,is_new_burst_start_Replace,...,is_new_burst_start_Nonproduction_per_s,is_new_burst_start_Input_per_s,is_new_burst_start_Remove/Cut_per_s,is_new_burst_start_Replace_per_s,is_new_burst_start_Paste_per_s,word_count_delta_event_per_s,is_thought_delimiting_punctuation_per_s,pause_time_fraction,keystroke_speed,words_per_thought_delimiting_punctuation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001519c8,120.0,2010.0,417.0,7.0,0.0,125,27.0,86.0,12.0,0.0,...,0.014984,0.047726,0.006659,0.0,0.0,0.141512,0.023863,0.720072,0.001347,5.930233
0022f953,254.0,1938.0,260.0,1.0,1.0,80,27.0,44.0,9.0,0.0,...,0.015092,0.024595,0.005031,0.0,0.0,0.178874,0.028508,0.695693,0.001229,6.27451
0042269b,175.0,3515.0,439.0,7.0,0.0,77,29.0,45.0,3.0,0.0,...,0.016369,0.0254,0.001693,0.0,0.0,0.228034,0.028222,0.684281,0.002232,8.08
0059420b,99.0,1304.0,151.0,1.0,1.0,87,7.0,77.0,3.0,0.0,...,0.004984,0.054825,0.002136,0.0,0.0,0.146675,0.011392,0.658491,0.001036,12.875
0075873a,72.0,1942.0,517.0,0.0,0.0,88,6.0,72.0,10.0,0.0,...,0.003609,0.043309,0.006015,0.0,0.0,0.151582,0.035489,0.657566,0.001479,4.271186


In [22]:
X_train_marginals_wrt_time.isnull().sum()

activity_Nonproduction                      0
activity_Input                              0
activity_Remove/Cut                         0
activity_Replace                            0
activity_Paste                              0
is_new_burst_start                          0
is_new_burst_start_Nonproduction            0
is_new_burst_start_Input                    0
is_new_burst_start_Remove/Cut               0
is_new_burst_start_Replace                  0
is_new_burst_start_Paste                    0
word_count_delta_event                      0
is_thought_delimiting_punctuation           0
preceding_pause_time                        0
delete_insert_ratio                         0
pause_time_p50                              0
has_cursor_position_moved_right_mean        0
burst_duration_mean                         0
burst_duration_p50                          0
word_count_delta_burst_p50                  0
cursor_position_vs_max_avg                  0
pause_time_max                    

In [24]:
X_train_by_window = (
    X_train
    .groupby(['id', 'window_30s'])
    [vars_sum + ['cursor_position_vs_max']]
    .agg(sum)
    .astype(float)
    .reset_index(drop=False)
    )

X_train_by_window['cursor_position_vs_max'] = (
    X_train_by_window['cursor_position_vs_max'] / 
    X_train_by_window[['activity_' + x for x in ACTIVITY_CATEGORIES]].sum(axis=1)
    )

X_train_by_window['delete_insert_ratio'] = (
    X_train_by_window['activity_Remove/Cut'] / 
    X_train_by_window['activity_Input'] 
    )

X_train_by_window['window_30s_idx'] = X_train_by_window.index

# for variability measure more comparable between writers,
# de-mean by writer. 
# Ex: higher-throughput writer incurs higher stddev, because values have higher abs value
time_rate_normalizers = [
    x
    for x in X_train_marginals_wrt_time.columns
    if 'per_s' in x 
    ]
# join method allows for merge on single index column
X_train_by_window = X_train_by_window.join(
    X_train_marginals_wrt_time[time_rate_normalizers],
    on='id',
    how='left'
)
for denom in time_rate_normalizers:
    level = denom.replace("_per_s", "")
    X_train_by_window[level] = (
        X_train_by_window[level] / 
        (X_train_by_window[denom].replace(0, None) * 30)
        )
    X_train_by_window[level] = X_train_by_window[level].fillna(1)
    
X_train_by_window = X_train_by_window.drop(columns=time_rate_normalizers)

X_train_by_window['preceding_pause_time'] = (
    X_train_by_window['preceding_pause_time'] / (1000 * 30)
    )

In [25]:
X_train_windows_variation = (
    X_train_by_window
    .drop(columns=['window_30s', 'window_30s_idx'])
    .groupby(['id'])
    .agg(np.std)
    )

X_train_windows_variation.columns = [
    x + "_stddev"
    for x in X_train_windows_variation.columns
    ]

In [26]:
X_train_windows_variation.head()

Unnamed: 0_level_0,activity_Nonproduction_stddev,activity_Input_stddev,activity_Remove/Cut_stddev,activity_Replace_stddev,activity_Paste_stddev,is_new_burst_start_stddev,is_new_burst_start_Nonproduction_stddev,is_new_burst_start_Input_stddev,is_new_burst_start_Remove/Cut_stddev,is_new_burst_start_Replace_stddev,is_new_burst_start_Paste_stddev,word_count_delta_event_stddev,is_thought_delimiting_punctuation_stddev,preceding_pause_time_stddev,cursor_position_vs_max_stddev,delete_insert_ratio_stddev
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
001519c8,1.521284,0.867369,1.234386,3.564897,0.0,0.579248,1.802514,0.753851,2.019084,0.0,0.0,1.053084,1.72701,0.847402,355.252057,
0022f953,3.10932,1.238732,1.281989,7.698497,7.698497,0.906892,1.697435,1.244304,2.385853,0.0,0.0,1.385489,1.678964,0.926301,393.530886,0.699985
0042269b,1.040201,1.042226,1.347166,2.731169,0.0,0.999407,1.568079,1.070391,4.326498,0.0,0.0,1.495298,1.378203,0.966047,510.242834,
0059420b,2.38569,0.750662,1.299792,6.043872,6.043872,0.682283,2.490761,0.792268,3.429778,0.0,0.0,0.803069,1.411669,0.669502,85.719887,
0075873a,1.873528,0.995562,1.817107,0.0,0.0,0.815447,2.794169,0.893987,2.319142,0.0,0.0,1.506659,1.635917,0.728818,169.218938,


In [27]:
X_train_transform = pd.merge(
    X_train_marginals_wrt_time,
    X_train_windows_variation,
    how='left',
    left_index=True,
    right_index=True
    )

In [28]:
X_train_transform

Unnamed: 0_level_0,activity_Nonproduction,activity_Input,activity_Remove/Cut,activity_Replace,activity_Paste,is_new_burst_start,is_new_burst_start_Nonproduction,is_new_burst_start_Input,is_new_burst_start_Remove/Cut,is_new_burst_start_Replace,...,is_new_burst_start_Nonproduction_stddev,is_new_burst_start_Input_stddev,is_new_burst_start_Remove/Cut_stddev,is_new_burst_start_Replace_stddev,is_new_burst_start_Paste_stddev,word_count_delta_event_stddev,is_thought_delimiting_punctuation_stddev,preceding_pause_time_stddev,cursor_position_vs_max_stddev,delete_insert_ratio_stddev
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001519c8,120.0,2010.0,417.0,7.0,0.0,125,27.0,86.0,12.0,0.0,...,1.802514,0.753851,2.019084,0.0,0.0,1.053084,1.727010,0.847402,355.252057,
0022f953,254.0,1938.0,260.0,1.0,1.0,80,27.0,44.0,9.0,0.0,...,1.697435,1.244304,2.385853,0.0,0.0,1.385489,1.678964,0.926301,393.530886,0.699985
0042269b,175.0,3515.0,439.0,7.0,0.0,77,29.0,45.0,3.0,0.0,...,1.568079,1.070391,4.326498,0.0,0.0,1.495298,1.378203,0.966047,510.242834,
0059420b,99.0,1304.0,151.0,1.0,1.0,87,7.0,77.0,3.0,0.0,...,2.490761,0.792268,3.429778,0.0,0.0,0.803069,1.411669,0.669502,85.719887,
0075873a,72.0,1942.0,517.0,0.0,0.0,88,6.0,72.0,10.0,0.0,...,2.794169,0.893987,2.319142,0.0,0.0,1.506659,1.635917,0.728818,169.218938,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffb8c745,189.0,3588.0,960.0,2.0,0.0,41,4.0,29.0,8.0,0.0,...,4.654245,1.492354,2.559097,0.0,0.0,5.717846,1.116658,0.918802,467.197641,
ffbef7e5,148.0,2395.0,60.0,1.0,0.0,90,24.0,65.0,1.0,0.0,...,2.263275,0.871037,7.742412,0.0,0.0,0.844542,0.976779,1.167803,255.216814,0.267295
ffccd6fd,126.0,2849.0,88.0,0.0,0.0,87,13.0,71.0,3.0,0.0,...,2.938466,1.004361,4.784855,0.0,0.0,1.130093,3.224691,0.806844,15.197101,0.135533
ffec5b38,71.0,2895.0,276.0,0.0,0.0,63,5.0,57.0,1.0,0.0,...,3.359326,0.993620,6.491568,0.0,0.0,0.828825,1.027915,0.816249,196.370511,


In [29]:
X_train_transform.isnull().mean()

activity_Nonproduction                      0.000000
activity_Input                              0.000000
activity_Remove/Cut                         0.000000
activity_Replace                            0.000000
activity_Paste                              0.000000
is_new_burst_start                          0.000000
is_new_burst_start_Nonproduction            0.000000
is_new_burst_start_Input                    0.000000
is_new_burst_start_Remove/Cut               0.000000
is_new_burst_start_Replace                  0.000000
is_new_burst_start_Paste                    0.000000
word_count_delta_event                      0.000000
is_thought_delimiting_punctuation           0.000000
preceding_pause_time                        0.000000
delete_insert_ratio                         0.000000
pause_time_p50                              0.000000
has_cursor_position_moved_right_mean        0.000000
burst_duration_mean                         0.000000
burst_duration_p50                          0.

In [None]:
CURSOR_POSITION_VS_MAX_STDDEV_P50 = 246.6

X_train_transform['cursor_position_vs_max_stddev'] = (
    X_train_transform['cursor_position_vs_max_stddev'].fillna(CURSOR_POSITION_VS_MAX_STDDEV_P50)
)

In [30]:
(
    X_train_transform
    .drop(columns='delete_insert_ratio_stddev')
    .to_pickle("./data/processed/X_train.pkl")
)