In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
import pickle

In [2]:
MS_PER_S = 1000
PATH_TRAIN_LOGS = "./data/external/train_logs.csv"

In [3]:
def extract(path):

    X = pd.read_csv(path)
    X = X.sort_values(["id", "event_id"], ascending=[True, True])
    
    return X

In [4]:
def scrub_activity(X):

    # 'Move From' activity recorded with low-level cursor loc details
    # extract bigger-picture 'Move From'
    # QUESTION: what's the difference between Move From, and a cut+paste?
    X['activity_detailed'] = X['activity']
    X.loc[X['activity'].str.contains('Move From'), 'activity'] = 'Move'

    return X

In [5]:
PAUSE_THRESHOLD_MS = 1000
N_ACTIVITIES_UNTIL_START_WINDOW_CLOSES = 100

def enrich_pauses(X):
    """Must infer pauses, as no explicit record indicates."""

    X['up_time_lag1'] = (
        X
        .groupby(['id'])
        ['up_time']
        .shift(1)
        )
    # latency does not mean a meaningful pause
    X['latency_time'] = (
        X['down_time'] - X['up_time_lag1']
        )

    X['preceding_pause_time'] = X['latency_time']
    # first record lacks preceding_pause_time: that's time before first key press
    X.loc[X['event_id'] == 1, 'preceding_pause_time'] = X['down_time']
    # expect some negative pause times -- interpret as, no real pause
    has_no_real_pause = X['preceding_pause_time'] <= PAUSE_THRESHOLD_MS
    X.loc[has_no_real_pause, 'preceding_pause_time'] = None
    # not obvious how to tag "initial planning pause" 
    # tried "first 5 minutes", but when that pause is 10 minutes, that fails.
    # first XX minutes is fragile
    # first XX events may help -- what's your extent of pause before *action*?
    X['preceding_pause_time_start_window'] = X['preceding_pause_time']
    X.loc[X['event_id'] <= N_ACTIVITIES_UNTIL_START_WINDOW_CLOSES, 'preceding_pause_time_start_window'] = None

    X['total_pause_time'] = (
        X
        .groupby(['id'])
        ['preceding_pause_time']
        .transform('sum')
        )
    X['rolling_pause_time'] = (
        X
        .groupby(['id'])
        ['preceding_pause_time']
        .cumsum()
        )
    X['rolling_pause_time_fraction'] = (
        X['rolling_pause_time'] / X['total_pause_time']
        )

    # summarize pause distr
    # MS_IN_PAUSE_BUCKET_MAX = 200e3
    # PAUSE_BUCKET_STEP_MS = 500
    # X['preceding_pause_time_bucket'] = pd.cut(
    #     X['preceding_pause_time'],
    #     bins=np.arange(
    #         0, 
    #         MS_IN_PAUSE_BUCKET_MAX,
    #         PAUSE_BUCKET_STEP_MS
    #         )
    #     )
    # X['preceding_pause_time_bucket'].value_counts()
    # WARNING: this representation of pause distribution is dense & large
    # a few parameters from distribution model far more succinct

    return X

In [6]:
# if pause exceeds threshold duration, a "burst" has ended
SECONDS_PER_BURST = 2

def enrich_time_bursts(X):

    X['is_new_burst_start'] = (
        X['preceding_pause_time'] > MS_PER_S * SECONDS_PER_BURST
        ).astype(int)
    X.loc[X['event_id'] == 1, 'is_new_burst_start'] = 1
    X['burst_id'] = (
        X
        .groupby(['id'])
        ['is_new_burst_start']
        .cumsum()
        )
    X['burst_time_start'] = (
        X
        .groupby(['id', 'burst_id'])
        ['down_time']
        .transform('min')
        )
    X['burst_time_end'] = (
        X
        .groupby(['id', 'burst_id'])
        ['up_time']
        .transform('max')
        )
    X['burst_time_duration'] = (
        X['burst_time_end'] - X['burst_time_start']
        )
    
    return X

In [7]:
def enrich_activity_streaks(X):
        
    # consecutive activity (independent of time) suggests productive writing flow

    X['activity_lag1'] = (
        X
        .groupby(['id'])
        ['activity']
        .shift(1)
        )

    X['is_new_activity_streak_start'] = (
        X['activity'] != X['activity_lag1']
    ).astype(int)
    X.loc[X['event_id'] == 1, 'is_new_activity_streak_start'] = 1

    X['is_activity_streak_end'] = (
        X
        .groupby(['id'])
        ['is_new_activity_streak_start']
        .shift(-1)
        )
    X['is_activity_streak_end'] = X['is_activity_streak_end'].fillna(1) 

    X['activity_streak_id'] = (
        X
        .groupby(['id'])
        ['is_new_activity_streak_start']
        .cumsum()
    )

    X['activity_streak_length_thin'] = (
        X
        .groupby(['id', 'activity_streak_id'])
        .transform('size')
    )
    X.loc[X['is_activity_streak_end'] == 0, 'activity_streak_length_thin'] = None

    return X

In [8]:
def enrich_word_count(X):

    # word count offers a productivity measure
    X['word_count_lag1'] = (
        X
        .groupby(['id'])
        ['word_count']
        .shift(1)
        )

    X['word_count_delta_event'] = (
        X['word_count'] - X['word_count_lag1']
        )

    X['word_count_delta_burst'] = (
        X
        .groupby(['id', 'burst_id'])
        ['word_count_delta_event']
        .transform('sum')
        )
    # de-duplicate to one value per burst -- easier for downstream aggregation
    X['word_count_delta_burst_thin'] = X['word_count_delta_burst']
    X.loc[X['is_new_burst_start'] == 0, 'word_count_delta_burst_thin'] = None

    return X

In [9]:
def enrich_cursor_position(X):

    # one-way cursor movement might be most productive
    # jumping around is choppy
    X['cursor_position_lag1'] = (
        X
        .groupby(['id'])
        ['cursor_position']
        .shift(1)
        )

    X['has_cursor_position_moved_right'] = (
        X['cursor_position'] > X['cursor_position_lag1']
        ).astype(int)

    # if cursor position increases due to copy+paste (perhaps of essay prompt),
    # that doesn't reflect grade-driving output
    X['cursor_position_input'] = np.where(
        X['activity'] == "Input", 
        X["cursor_position"], 
        np.nan
        )
    X['cursor_position_cummax'] = (
        X
        .groupby(['id'])
        ['cursor_position_input']
        .cummax()
        )
    # for some reason, unable to chain below statements with above
    X['cursor_position_cummax'] = (
        X
        .groupby(['id'])
        ['cursor_position_cummax']
        .ffill()
        .fillna(0)
    )

    X['cursor_position_vs_max'] = (
        X['cursor_position'] - X['cursor_position_cummax']
        )

    X['cursor_position_last_space'] = np.where(
        (X['activity'] == "Input") & (X["text_change"] == ' '),
        X['cursor_position'],
        np.nan
    ) 
    X['cursor_position_last_space'] = (
        X
        .groupby(['id'])
        ['cursor_position_last_space']
        .ffill()
        # likely not beginning essay with a space
        .fillna(0)
    )

    X = X.drop(columns='cursor_position_input')

    return X

In [10]:
def enrich_word_length(X):
        
    # word length offers a content quality measure.
    # hard to track entire words sequence in rolling fashion.
        # every word's length, in a list of one element per word?  
    # more tractable to track very latest string

    is_edit_to_latest_string = (
        X['cursor_position'] > X['cursor_position_last_space']
    )

    X['is_latest_space'] = (
        (X['cursor_position_vs_max'] == 0)
        & (X['activity'] == "Input")
        & (X["text_change"] == ' ')
        )

    X['is_latest_string_end'] = (
        X
        .groupby(['id'])
        ['is_latest_space']
        .shift(-1)
        # last process records
        .fillna(True)
        )

    X['n_alphanum_char_added_to_latest_string'] = 0
    is_alphanumeric_addition = (
        (X['activity'] == "Input")
        & (X["text_change"] == 'q')
        )
    X.loc[
        (is_alphanumeric_addition & is_edit_to_latest_string), 
        'n_alphanum_char_added_to_latest_string'
        ] = 1
    is_alphanumeric_subtraction = (
        (X['activity'] == "Remove/Cut")
        & (X['up_event'] == 'Backspace')
        & (X["text_change"] == 'q')
        )
    X.loc[
        (is_alphanumeric_subtraction & is_edit_to_latest_string), 
        'n_alphanum_char_added_to_latest_string'
        ] = -1

    # example: 2nd string, 2 characters in.
    # considering cumsum for each character in 2nd string, 
    # subtract those characters from 1st
    X['rolling_length_strings'] = (
        X
        .groupby(['id'])
        ['n_alphanum_char_added_to_latest_string']
        .cumsum() 
        ) 

    X['rolling_length_completed_strings'] = None
    X.loc[
        X['is_latest_space'], 'rolling_length_completed_strings'
        ] = X['rolling_length_strings']
    X['rolling_length_completed_strings'] = (
        X
        .groupby(['id'])
        ['rolling_length_completed_strings']
        .ffill()
        .fillna(0)
    )

    X['rolling_length_latest_string'] = (
        X['rolling_length_strings'] 
        - X['rolling_length_completed_strings']
    )

    X['length_latest_string'] = None
    X.loc[
        X['is_latest_string_end'], 'length_latest_string'
        ] = X['rolling_length_latest_string']
    
    return X

In [11]:
def enrich_punctuation(X):
        
    # if thoughts aren't separated by punctuation, writing won't score well
    X['is_thought_delimiting_punctuation'] = (
        (X['text_change'] == ".")
        | (X['text_change'] == ". ")
        | (X['text_change'] == ",")
        | (X['text_change'] == "-")
        | (X['text_change'] == "!")
        | (X['text_change'] == ";")
        | (X['text_change'] == "?")
        | (X['text_change'] == ":")
        ).astype(int)

    X['is_special_punctuation'] = (
        (X['text_change'] == "=")
        | (X['text_change'] == "/")
        | (X['text_change'] == "\\")
        | (X['text_change'] == "(")
        | (X['text_change'] == ")")
        | (X['text_change'] == "\n")
        | (X['text_change'] == "[")
        | (X['text_change'] == "]")
        | (X['text_change'] == ">")
        | (X['text_change'] == "<")
        | (X['text_change'] == "$")
        | (X['text_change'] == "*")
        | (X['text_change'] == "&")
    )

    return X

In [12]:
TOTAL_MIN_MAX_EXPECTED = 30
TOTAL_MIN_PLUS_BUFFER = 150 # id 21bbc3f6 case extended to 140 min ... odd
SECONDS_PER_MIN = 60
SECONDS_PER_WINDOW = 30

def enrich_time_windows(X):

    # windows allow for time-sequence features
    # expect that some essays extend beyond 30 min described in 'Data Collection'
    # downstream, **do not tabulate over a writer's unused time windows**!!

    X['window_30s'] = pd.cut(
        X['down_time'],
        bins=np.arange(
            0, 
            TOTAL_MIN_PLUS_BUFFER * SECONDS_PER_MIN * MS_PER_S, 
            SECONDS_PER_WINDOW * MS_PER_S
            )
        )

    X['is_time_beyond_expected_max'] = (
        X['up_time'] > TOTAL_MIN_MAX_EXPECTED * SECONDS_PER_MIN * MS_PER_S
    ).astype(int)

    return X

In [13]:
ACTIVITY_CATEGORIES = ['Nonproduction', 'Input', 'Remove/Cut', 'Replace', 'Paste', 'Move']

def transform_activity_onehot(X, is_training_run):

    if is_training_run:

        pipeline = ColumnTransformer(
            transformers=[(
                'onehot_encode', 
                preprocessing.OneHotEncoder(
                    categories=[ACTIVITY_CATEGORIES], 
                    sparse=False, 
                    handle_unknown='infrequent_if_exist'
                    ),
                ["activity"]
            )],
            remainder='passthrough',
            verbose_feature_names_out=False
            )
        
        pipeline.fit(X)

        with open("pipeline_activity_onehot.pkl", "wb") as f:
            pickle.dump(pipeline, f)

    else:
        with open("pipeline_activity_onehot.pkl", "rb") as f:
            pipeline = pickle.load(f)

    original_categorical = X['activity']

    X_dtypes = X.dtypes.to_dict()
    X = pipeline.transform(X)
    X = pd.DataFrame(X, columns=pipeline.get_feature_names_out())
    X = pd.concat([X, original_categorical], axis=1)
    X = X.astype(X_dtypes)

    return X

In [14]:
def enrich_burst_type(X):

    for activity in ACTIVITY_CATEGORIES:

        X['burst_events_' + activity] = (
            X
            .groupby(['id', 'burst_id'])
            ['activity_' + activity]
            .transform('sum')
            ).astype(float)
        
    X['burst_type'] = (
        X
        [['burst_events_' + activity for activity in ACTIVITY_CATEGORIES]]
        .idxmax(axis=1)
        )
    X['burst_type'] = (
        X['burst_type']
        .str
        .replace("burst_events_", "", regex=True)
        )
    
    return X

In [15]:
def transform_burst_type_onehot(X, is_training_run):

    if is_training_run:
        
        pipeline = ColumnTransformer(
            transformers=[(
                'onehot_encode', 
                preprocessing.OneHotEncoder(
                    categories=[ACTIVITY_CATEGORIES], 
                    sparse=False, 
                    handle_unknown='infrequent_if_exist'
                    ),
                ["burst_type"]
            )],
            remainder='passthrough',
            verbose_feature_names_out=False
            )
        
        pipeline.fit(X)
        
        with open("pipeline_burst_type_onehot.pkl", "wb") as f:
            pickle.dump(pipeline, f)

    else:
        with open("pipeline_burst_type_onehot.pkl", "rb") as f:
            pipeline = pickle.load(f)

    original_categorical = X['burst_type']
    X_dtypes = X.dtypes.to_dict()
    X = pipeline.transform(X)
    X = pd.DataFrame(X, columns=pipeline.get_feature_names_out())
    X = pd.concat([X, original_categorical], axis=1)
    X = X.astype(X_dtypes)

    for activity in ACTIVITY_CATEGORIES:

        X['is_new_burst_start_' + activity] = (
            X['is_new_burst_start'] * 
            X['burst_type_' + activity]
            )
        
        X['is_new_activity_streak_start_' + activity] = (
            X["activity_" + activity] * X['is_new_activity_streak_start']
        )

    return X

In [16]:
def subset_features(X):

    return X[[
        "id",
        "event_id",
        "is_time_beyond_expected_max",
        "window_30s",
        "burst_id",
        "burst_type",
        "burst_type_Nonproduction",
        "burst_type_Input",
        "burst_type_Remove/Cut",
        "burst_type_Replace",
        "burst_type_Paste",
        "burst_type_Move",
        "is_new_burst_start",
        "is_new_burst_start_Nonproduction",
        "is_new_burst_start_Input",
        "is_new_burst_start_Remove/Cut",
        "is_new_burst_start_Replace",
        "is_new_burst_start_Paste",
        "is_new_burst_start_Move",
        "burst_time_start",
        "burst_time_end",
        "burst_time_duration",
        "burst_events_Nonproduction",
        "burst_events_Input",
        "burst_events_Remove/Cut",
        "burst_events_Replace",
        "burst_events_Paste",
        "burst_events_Move",
        "word_count_delta_burst",
        "word_count_delta_burst_thin",
        "activity_streak_id",
        "is_new_activity_streak_start",
        "is_new_activity_streak_start_Nonproduction",
        "is_new_activity_streak_start_Input",
        "is_new_activity_streak_start_Remove/Cut",
        "is_new_activity_streak_start_Replace",
        "is_new_activity_streak_start_Paste",
        "is_new_activity_streak_start_Move",
        "is_activity_streak_end",
        "activity_streak_length_thin",

        "down_time",
        "up_time",	
        "action_time",	
        "activity_detailed",
        "activity",	
        "activity_Nonproduction",
        "activity_Input",
        "activity_Remove/Cut",
        "activity_Replace",
        "activity_Paste",
        "activity_Move",
        "down_event",	
        "up_event",	
        "text_change",
        "is_thought_delimiting_punctuation",
        "cursor_position",	
        "word_count",

        "cursor_position_vs_max",
        "cursor_position_cummax",
        "has_cursor_position_moved_right",
        "cursor_position_last_space",

        "is_latest_space",
        "is_latest_string_end",
        "n_alphanum_char_added_to_latest_string",
        "rolling_length_latest_string",
        "length_latest_string",

        "word_count_lag1",
        "word_count_delta_event",

        "up_time_lag1",
        "latency_time",
        "preceding_pause_time",
        "preceding_pause_time_start_window",
        "rolling_pause_time",
        "rolling_pause_time_fraction",
        "total_pause_time"
        ]]  

In [17]:
def enrich_logs(X, is_training_run):

#     PUNCTUATION = X_train.loc[(
#         (X_train['activity'] == 'Input')
#         & (~ X_train['text_change'].isin(['q', ' ']))
#         ), 'text_change'].unique()

    X = scrub_activity(X)

    X = enrich_pauses(X)

    X = enrich_time_bursts(X)

    X = enrich_activity_streaks(X)

    X = enrich_word_count(X)

    X = enrich_cursor_position(X)

    X = enrich_word_length(X)

    X = enrich_punctuation(X)

    X = enrich_time_windows(X)

    print("Proceeding to activity onehot encode.")
    X = transform_activity_onehot(X, is_training_run)
    print("Completed activity onehot encode.")

    X = enrich_burst_type(X)

    print("Proceeding to burst type onehot encode")
    X = transform_burst_type_onehot(X, is_training_run)
    print("Completed burst type onehot encode")

    return subset_features(X)


In [18]:
event_vars_sum = (
    ['activity_' + x for x in ACTIVITY_CATEGORIES] 
    + ['is_new_burst_start'] 
    + ['is_new_burst_start_' + x for x in ACTIVITY_CATEGORIES]
    + ["is_thought_delimiting_punctuation"]
    + ["is_new_activity_streak_start_" + x for x in ACTIVITY_CATEGORIES]
    )

conti_vars_sum = (
    ['word_count_delta_event']
    + ["preceding_pause_time"]
    )


def aggregate_no_time_dependence_measures(X):

    events_sum_over_time = (
        X
        .groupby('id')
        [event_vars_sum]
        .agg(sum)
        )

    events_sum_over_time['delete_insert_ratio'] = (
        events_sum_over_time['activity_Remove/Cut'] / 
        events_sum_over_time['activity_Input'] 
        )

    conti_sum_over_time = (
        X
        .groupby('id')
        [conti_vars_sum]
        .agg(sum)
        )

    sums_over_time = pd.merge(
        events_sum_over_time,
        conti_sum_over_time,
        how='left',
        left_index=True,
        right_index=True
    )


    centrals_over_time = (
        X
        .groupby('id')
        .agg(
            latency_time_p50 = ('latency_time', np.median),
            pause_time_p50 = ('preceding_pause_time', np.median),
            has_cursor_position_moved_right_mean = ('has_cursor_position_moved_right', 'mean'),
            word_count_delta_burst_mean = ('word_count_delta_burst_thin', 'mean'),
            word_count_delta_burst_p50 = ('word_count_delta_burst_thin', np.median),
            activity_streak_length_mean = ('activity_streak_length_thin', 'mean'),
            cursor_position_vs_max_avg = ('cursor_position_vs_max', 'mean'),
            length_latest_string_mean = ('length_latest_string', 'mean'),
            length_latest_string_stddev = ('length_latest_string', np.std)
            )
        )
    

    extremes_over_time = (
        X
        .groupby('id')
        .agg(
            pause_time_max=('preceding_pause_time', 'max'),
            initial_pause_time_max=('preceding_pause_time_start_window', 'max'),
            # approximation to, next longest pause after first long planning pause
            pause_time_p99=('preceding_pause_time', lambda x: x.quantile(0.99)),
            word_count_delta_burst_max=('word_count_delta_burst_thin', 'max'),
            activity_streak_length_max=('activity_streak_length_thin', 'max'),
            total_time=('up_time', 'max'),
            length_latest_string_max=('length_latest_string', 'max'),
            latency_time_min=('latency_time', 'min'),
            is_time_beyond_expected_max=('is_time_beyond_expected_max', 'max')
            )
        )

    extremes_over_time['is_initial_pause_max_pause'] = (
        extremes_over_time['pause_time_max'] == 
        extremes_over_time['initial_pause_time_max']
        ).astype(int)
    

    from scipy.stats import lognorm

    pause_distr_summary_subjects = []

    for X_subject in [x for _, x in X.groupby('id')]:

        shape, location, scale = lognorm.fit(X_subject['preceding_pause_time'].dropna())

        pause_distr_summary = pd.DataFrame({
            'pauses_lognorm_shape': [shape], 
            'pauses_lognorm_location': [location],
            'pauses_lognorm_scale': [scale]
            })
        pause_distr_summary.index = [X_subject['id'].iloc[0]]
        
        pause_distr_summary_subjects.append(pause_distr_summary)

    distr_params_over_time = pd.concat(pause_distr_summary_subjects, axis=0)


    aggregates_over_time = pd.merge(
        sums_over_time, 
        centrals_over_time,
        how='left',
        left_index=True,
        right_index=True
        )

    aggregates_over_time = pd.merge(
        aggregates_over_time, 
        extremes_over_time,
        how='left',
        left_index=True,
        right_index=True
        )

    aggregates_over_time = pd.merge(
        aggregates_over_time, 
        distr_params_over_time,
        how='left',
        left_index=True,
        right_index=True
        )
    

    for var in event_vars_sum:

        aggregates_over_time[var + '_per_s'] = (
            1000 * (aggregates_over_time[var] / aggregates_over_time['total_time'])
            )

    aggregates_over_time = (
        aggregates_over_time
        .assign(
            keystroke_speed = lambda x: (x.activity_Input + x['activity_Remove/Cut']) / x.total_time,
            words_per_thought_delimiting_punctuation = lambda x: x.word_count_delta_event / x.is_thought_delimiting_punctuation,
            )
        )
    
    
    return aggregates_over_time

In [19]:
def aggregate_time_variability_measures(aggregates_over_time, X):

    # per writer, by default, tabulate _every_ time window ever observed in data.
    # override: tabulate strictly until writer's final utilized time window.
    events_by_window = (
        X
        .groupby(['id', 'window_30s'])
        [event_vars_sum]
        .agg(sum)
        .astype(float)
        .fillna(0)
        .reset_index(drop=False)
        )
    events_by_window['has_activity'] = (
        events_by_window[['activity_' + x for x in ACTIVITY_CATEGORIES]].sum(axis=1) 
        > 0
    )
    events_by_window['idx_window_by_id'] = (
        events_by_window
        .groupby('id')
        .cumcount()
    )
    events_by_window['idx_has_activity'] = np.where(
        events_by_window['has_activity'], 
        events_by_window['idx_window_by_id'],
        np.nan
        )
    events_by_window['idx_activity_max'] = (
        events_by_window
        .groupby(['id'])
        ['idx_has_activity']
        .transform('max')
    )
    events_by_window = events_by_window.loc[
        events_by_window['idx_window_by_id'] <= events_by_window['idx_activity_max']
        ]
    events_by_window = events_by_window.drop(
        columns=['has_activity', 'idx_has_activity', 'idx_activity_max']
        )

    events_by_window['delete_insert_ratio'] = (
        events_by_window['activity_Remove/Cut'] / 
        events_by_window['activity_Input'] 
        ).replace(np.inf, np.nan)


    # for variability measure more comparable between writers, de-mean by writer. 
    # Ex: higher-throughput writer incurs higher stddev, because values have higher magnitude
    # join method allows for merge on one index column, of multiple possible
    events_by_window = events_by_window.join(
        aggregates_over_time[[x + '_per_s' for x in event_vars_sum]],
        on='id',
        how='left'
        )
    for var in event_vars_sum:
        events_by_window[var + '_time_norm'] = (
            events_by_window[var] / 
            (events_by_window[var + '_per_s'].replace(0, None) * 30)
            ).fillna(1)
    events_by_window = events_by_window.drop(columns=[x + '_per_s' for x in event_vars_sum])

    events_over_time_ren = aggregates_over_time[event_vars_sum]
    events_over_time_ren.columns = [x + "_total" for x in events_over_time_ren.columns]
    events_by_window = events_by_window.join(events_over_time_ren, on='id', how='left')
    for var in event_vars_sum:
        events_by_window[var + '_frac_total'] = (
            events_by_window[var] / (events_by_window[var + '_total'].replace(0, None))
            ).fillna(1)
    events_by_window = events_by_window.drop(columns=[x + '_total' for x in event_vars_sum])


    conti_by_window = (
        X
        .assign()
        .groupby(['id', 'window_30s'])
        [conti_vars_sum]
        .agg(sum)
        .astype(float)
        .fillna(0)
        .reset_index(drop=False)
        )
    conti_by_window['idx_window_by_id'] = (
        conti_by_window
        .groupby('id')
        .cumcount()
    )

    conti_over_time_ren = aggregates_over_time[conti_vars_sum]
    conti_over_time_ren.columns = [x + "_total" for x in conti_over_time_ren.columns]
    conti_by_window = conti_by_window.join(conti_over_time_ren, on='id', how='left')
    for var in conti_vars_sum:
        conti_by_window[var + '_frac_total'] = (
            conti_by_window[var] / conti_by_window[var + '_total']
            )
    conti_by_window = conti_by_window.drop(columns=[x + '_total' for x in conti_vars_sum])


    centrals_by_window = (
        X
        .groupby(['id', 'window_30s'])
        ['cursor_position_vs_max']
        .agg('mean')
        .astype(float)
        .reset_index(drop=False)
        )
    centrals_by_window['idx_window_by_id'] = (
        centrals_by_window
        .groupby('id')
        .cumcount()
    )


    aggregates_by_window = pd.merge(
        events_by_window, 
        conti_by_window,
        # events table reflects, writer's final utilized time window.
        # not all possible
        how='inner'
        )

    aggregates_by_window = pd.merge(
        aggregates_by_window, 
        centrals_by_window,
        how='left'
        )
    

    from scipy.stats import entropy

    entropy_vars = [var for var in aggregates_by_window.columns if 'frac_total' in var]
    entropy_by_window = (
        aggregates_by_window
        .groupby(['id'])
        [entropy_vars]
        .agg(lambda x: entropy(x.value_counts()))
        )
    entropy_by_window.columns = [
        x + '_entropy' 
        for x in entropy_by_window.columns
        ]


    sd_by_window = (
        aggregates_by_window
        .drop(columns=['window_30s', 'idx_window_by_id'])
        .groupby(['id'])
        .agg(np.std)
        )
    sd_by_window.columns = [
        x + "_stddev"
        for x in sd_by_window.columns
        ]


    trend_by_window = (
        aggregates_by_window
        .sort_values(['id', 'idx_window_by_id'])
        .drop(columns=['window_30s'])
        .groupby(['id'])
        .corr()
        )
    # extract correlations strictly with time index
    trend_by_window = trend_by_window.xs('idx_window_by_id', level=1)

    vars_drop = (
        [x for x in trend_by_window.columns if 'time_norm' in x]
        + [x for x in trend_by_window.columns if 'frac_total' in x]
        + ['idx_window_by_id']
        )
    trend_by_window = trend_by_window.drop(columns=vars_drop)

    trend_by_window.columns = [
        x + "_ttrend"
        for x in trend_by_window.columns
        ]

    trend_by_window = trend_by_window.fillna(0)


    vari_by_window = pd.merge(
        entropy_by_window,
        sd_by_window,
        how='left',
        left_index=True,
        right_index=True
        )   

    vari_by_window = pd.merge(
        vari_by_window,
        trend_by_window,
        how='left',
        left_index=True,
        right_index=True
        )     
    
    
    return vari_by_window

In [20]:
def feature_transform_pipeline(X_logs, is_training_run):

    X_logs_enriched = enrich_logs(X_logs, is_training_run)

    aggregates_over_time = aggregate_no_time_dependence_measures(X_logs_enriched)
    vari_by_window = aggregate_time_variability_measures(
        aggregates_over_time, X_logs_enriched
        )

    X_transform = pd.merge(
        aggregates_over_time,
        vari_by_window,
        how='left',
        left_index=True,
        right_index=True
        )
    
    return X_transform

In [None]:
# expect train_logs are too large for single batch processing
X_train_logs = extract(PATH_TRAIN_LOGS)

X_train_logs_groups = [x for _, x in X_train_logs.groupby('id')]
del X_train_logs

X_train_logs_chunk1 = X_train_logs_groups[0:1200]
X_train_logs_chunk2 = X_train_logs_groups[1200:]
del X_train_logs_groups

X_train_logs_chunk1 = pd.concat(X_train_logs_chunk1, axis=0)
X_train_logs_chunk2 = pd.concat(X_train_logs_chunk2, axis=0).reset_index(drop=True)


In [None]:
X_train_chunk1 = feature_transform_pipeline(X_train_logs_chunk1, True)
del X_train_logs_chunk1

In [None]:
X_train_chunk2 = feature_transform_pipeline(X_train_logs_chunk2, True)
del X_train_logs_chunk2

In [None]:
X_train = pd.concat([X_train_chunk1, X_train_chunk2], axis=0)
del X_train_chunk1, X_train_chunk2

In [None]:
X_train.to_pickle("./data/processed/X_train.pkl")

In [None]:
# (
#     X_train
#     .loc[X_train['id'].isin(['b732c6e2', 'b73648cf'])]
#     .to_csv("./data/X_train_enriched_cases.csv", index=False)
# )

In [None]:
# X_train = pd.read_pickle("./data/processed/train_logs_enriched.pkl")

In [None]:
# (
#     X_train
#     .loc[X_train['id'].isin(['b73648cf'])]
#     .to_csv("./data/X_train_enriched_case.csv", index=False)
# )

In [21]:
X_train_logs = extract(PATH_TRAIN_LOGS)

In [23]:
X_train_logs_sub = X_train_logs.loc[X_train_logs['id'] == '001519c8'].reset_index(drop=True)

In [24]:
X_train_logs_sub

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1
...,...,...,...,...,...,...,...,...,...,...,...
2552,001519c8,2553,1781786,1781841,55,Remove/Cut,Backspace,Backspace,q,555,255
2553,001519c8,2554,1781917,1781991,74,Remove/Cut,Backspace,Backspace,q,554,255
2554,001519c8,2555,1782062,1782141,79,Remove/Cut,Backspace,Backspace,q,553,255
2555,001519c8,2556,1782922,1782985,63,Input,q,q,q,554,255


In [31]:
def getEssays(df):
    textInputDf = df[['id', 'activity', 'cursor_position', 'text_change']]
    textInputDf = textInputDf[textInputDf.activity != 'Nonproduction']
    valCountsArr = textInputDf['id'].value_counts(sort=False).values
    lastIndex = 0
    essaySeries = pd.Series()
    for index, valCount in enumerate(valCountsArr):
        currTextInput = textInputDf[['activity', 'cursor_position', 'text_change']].iloc[lastIndex : lastIndex + valCount]
        lastIndex += valCount
        essayText = ""
        for Input in currTextInput.values:
            if Input[0] == 'Replace':
                replaceTxt = Input[2].split(' => ')
                essayText = essayText[:Input[1] - len(replaceTxt[1])] + replaceTxt[1] +\
                essayText[Input[1] - len(replaceTxt[1]) + len(replaceTxt[0]):]
                continue
            if Input[0] == 'Paste':
                essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
                continue
            if Input[0] == 'Remove/Cut':
                essayText = essayText[:Input[1]] + essayText[Input[1] + len(Input[2]):]
                continue
            if "M" in Input[0]:
                croppedTxt = Input[0][10:]
                splitTxt = croppedTxt.split(' To ')
                valueArr = [item.split(', ') for item in splitTxt]
                moveData = (int(valueArr[0][0][1:]), 
                            int(valueArr[0][1][:-1]), 
                            int(valueArr[1][0][1:]), 
                            int(valueArr[1][1][:-1]))
                if moveData[0] != moveData[2]:
                    if moveData[0] < moveData[2]:
                        essayText = essayText[:moveData[0]] + essayText[moveData[1]:moveData[3]] +\
                        essayText[moveData[0]:moveData[1]] + essayText[moveData[3]:]
                    else:
                        essayText = essayText[:moveData[2]] + essayText[moveData[0]:moveData[1]] +\
                        essayText[moveData[2]:moveData[0]] + essayText[moveData[1]:]
                continue
            essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
        essaySeries[index] = essayText
    essaySeries.index =  textInputDf['id'].unique()
    return pd.DataFrame(essaySeries, columns=['essay'])

In [44]:
def concatenate_essays(df):
    textInputDf = df[['id', 'activity', 'cursor_position', 'text_change']]
    textInputDf = textInputDf[textInputDf.activity != 'Nonproduction']
    valCountsArr = textInputDf['id'].value_counts(sort=False).values
    lastIndex = 0
    essaySeries = pd.Series()
    for index, valCount in enumerate(valCountsArr):
        currTextInput = textInputDf[['activity', 'cursor_position', 'text_change']].iloc[lastIndex : lastIndex + valCount]
        lastIndex += valCount
        essayText = ""
        for Input in currTextInput.values:

            cursor_position_after_event = Input[1]


            if Input[0] == 'Replace':

                replaceTxt = Input[2].split(' => ')
                text_add = replaceTxt[1]
                text_remove = replaceTxt[0]
                cursor_position_start_text_change = (
                    cursor_position_after_event - len(text_add)
                    )
                cursor_position_after_skip_replace = (
                    cursor_position_start_text_change + len(text_remove)
                )

                # essayText start: "the blue cat"
                # replace "blue" with "red"
                # "the redblue cat", skip blue
                essayText = (
                    essayText[:cursor_position_start_text_change] # "the "
                    + text_add # "red"
                    # essayText value: "the blue cat" 
                    # want remaining " cat", NOT "blue cat"
                    + essayText[cursor_position_after_skip_replace:] 
                    )

                continue

            if Input[0] == 'Paste':

                text_change_event = Input[2]
                cursor_position_start_text_change = (
                    cursor_position_after_event - len(text_change_event)
                    )

                # essayText start: "the cat"
                # paste "blue " between
                essayText = (
                    essayText[:cursor_position_start_text_change] # "the " 
                    + text_change_event # "blue "
                    # essayText value: "the cat"
                    + essayText[cursor_position_start_text_change:]
                )

                continue

            if Input[0] == 'Remove/Cut':
                # similar process to "Replace" action

                text_remove = Input[2]
                cursor_position_after_skip_remove = (
                    cursor_position_after_event + len(text_remove)
                )

                essayText = (
                    essayText[:cursor_position_after_event] 
                    + essayText[cursor_position_after_skip_remove:]
                    )

                continue
            
            if "Move" in Input[0]:

                location_vectors_raw_str = (
                    Input[0][10:]
                    .replace("[", "")
                    .replace("]", "")
                    )
                location_vectors = location_vectors_raw_str.split(' To ')
                location_vector_from = 
                valueArr = [item.split(', ') for item in splitTxt]
                moveData = (
                    int(valueArr[0][0][1:]), 
                    int(valueArr[0][1][:-1]), 
                    int(valueArr[1][0][1:]), 
                    int(valueArr[1][1][:-1])
                    )
                if moveData[0] != moveData[2]:
                    if moveData[0] < moveData[2]:
                        essayText = essayText[:moveData[0]] + essayText[moveData[1]:moveData[3]] +\
                        essayText[moveData[0]:moveData[1]] + essayText[moveData[3]:]
                    else:
                        essayText = essayText[:moveData[2]] + essayText[moveData[0]:moveData[1]] +\
                        essayText[moveData[2]:moveData[0]] + essayText[moveData[1]:]
                        
                continue
            

            text_change_event = Input[2]
            cursor_position_start_text_change = (
                cursor_position_after_event - len(text_change_event)
                )
            essayText = (
                essayText[:cursor_position_start_text_change] 
                + text_change_event
                + essayText[cursor_position_start_text_change:]
                )
        
        essaySeries[index] = essayText
    essaySeries.index =  textInputDf['id'].unique()
    return pd.DataFrame(essaySeries, columns=['essay'])

SyntaxError: invalid syntax (2363574567.py, line 36)

In [61]:
activity = "Move From [284, 292] To [282, 290]"
location_vectors_raw_str = (
    activity[10:]
    .replace("[", "")
    .replace("]", "")
    )
location_vectors = location_vectors_raw_str.split(' To ')
valueArr = [item.split(', ') for item in splitTxt]
# moveData = (
#     int(valueArr[0][0][1:]), 
#     int(valueArr[0][1][:-1]), 
#     int(valueArr[1][0][1:]), 
#     int(valueArr[1][1][:-1])
#     )
# moveData = (
#     int(valueArr[0][0][1:]), 
#     int(valueArr[0][1][:-1]), 
#     int(valueArr[1][0][1:]), 
#     int(valueArr[1][1][:-1])
#     )
# moveData = valueArr[0] + valueArr[1]
# moveData = [int(x) for x in moveData]

In [62]:
location_vectors_raw_str, location_vectors

('284, 292 To 282, 290', ['284, 292', '282, 290'])

qqq q
|T|h|e| |c

The cat

In [42]:
import textwrap
text = getEssays(X_train_logs_sub).iat[0, 0]
text_wrapped = textwrap.wrap(text) 
print( text )

qqqqqqqqq qq qqqqq qq qqqq qqqq.  qqqqqq qqq qqqq qqqqqq qq qq qqqqq qq qqqq qqqqq qq qqqqqqqqq qqqqq qqqq qqqqq qqq qqqqqqqqq qqqqqqqqq qqqq.  qqqqqq qqq qqqqq qqq qqqqqqqqqqq qq qqq qqqqqqqqqq qqqqq, qqq qqqqq qqqqqq qq qq qqqq qqq qqqqqq qqqqqqq qq qqq qqqqqqqqqqq.  qqqqqqqq qq qqqqqqqqqq qqqq qqqq qqqqqqqqq qqq qqqqqqq qq qqqqqq qqqq qqq qqq qq qqqqqqqqq qq qq qqq qqqqq qqqqq qq qqq.

qq qq qqqq qqqq qqq qqqqqqqqq qqq qqqqqqq qq qqq qqqqq qqqqq, qq qq qqqqqq qqq qqq qqqqqqqq qqqqq qq qqq qqqqqqqqqqq qq qqqqqqqqq.  qqqqqqqqq qq qqq qqqqqqqq qqqq qq qqqq qq qqqqqqq qqqqq qqqqq, qqq qqqqqq qqqqq qqqqq qqq qqq qq qqq qqqqqqq qqqqqqq qqqq.  qqqq qqqqq qqqqq qqqq qqqq'qq qqqqq qqqqqqqqq qqqqq qqqqqqq qqqqqqq qqqqqqqqqq, qqqq qq qqqqqqqqqq qqqqqqq qqq qqqqqqq; qqqqqqq, qqqqq qqqqqqqq qqqqqq qqqqqqq qqqqqqq qqq qqqqq qqq qqq qqq qqqqqqq.  qqqq qqqqqqqqq qqqq qqq qqqq qqqq qqqqq qqqqqqqqqq qqqq qqqqq qqqqq.  qqq qqqqqqqqqq qq qqqqqqqq q qqqqqq, qqqqqqqq qqqq qqqq qqqqqqqqqq, qqq. qq qqqqq q

In [28]:
for x in X_train_logs_sub.iloc[:2, ].values:
    print(x)

['001519c8' 1 4526 4557 31 'Nonproduction' 'Leftclick' 'Leftclick'
 'NoChange' 0 0]
['001519c8' 2 4558 4962 404 'Nonproduction' 'Leftclick' 'Leftclick'
 'NoChange' 0 0]


In [30]:
X_train_logs_sub[['activity', 'cursor_position', 'text_change']].query("activity == 'Replace'")

Unnamed: 0,activity,cursor_position,text_change
462,Replace,161,qqq qqqqq =>
468,Replace,178,qqqqq qq => q
512,Replace,235,qqqq => q
2364,Replace,283,qqqqqqq qqqqqq => q
2454,Replace,1298,qq qqqqqqqq qqqqq => q
2474,Replace,1301,qqqqq => q
2481,Replace,1448,qqqqqqqqq => q


In [43]:
X_train_logs_sub.to_csv("./data/essay_reconstruct_case.csv", index=False)

In [45]:
X_train_logs.query("activity=='Paste'")

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
4797,0022f953,2241,1017354,1017425,71,Paste,Space,Space,\n,696,316
9365,0059420b,219,285459,285619,160,Paste,v,v,qqqqqqqqqqq,95,23
41276,00fc9a6a,1601,737215,737552,337,Paste,v,v,qqqqqq qqqqqqq qqqqqq qqq qq qqqqqqqqqq qqq qq...,78,153
41987,00fc9a6a,2312,1129516,1129835,319,Paste,z,z,"qq qqqqqq qqqq,",980,230
46948,014e7ae9,2455,828133,828256,123,Paste,v,v,qqq qqqqqq qqq qqqq qqqqqqqqqqq qq qqq qqqqq q...,1601,280
...,...,...,...,...,...,...,...,...,...,...,...
8372799,ff635a91,309,135266,135398,132,Paste,v,v,qqqqqqqqq qqqqqqq qqq qqqqqqqqq qqqq,278,44
8374400,ff635a91,1910,1014489,1014642,153,Paste,v,v,qqqqqqqqqqq,1443,248
8374465,ff635a91,1975,1031024,1031142,118,Paste,v,v,qqqqqqqqqqq,1490,255
8374710,ff635a91,2220,1335273,1335382,109,Paste,Space,Space,qqqqqqqqqqq,660,287


In [47]:
X_train_logs.loc[X_train_logs['activity'].str.contains('Move')].head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
2359,001519c8,2360,1380334,1380334,0,"Move From [284, 292] To [282, 290]",Leftclick,Leftclick,qqqqqqq,290,244
2362,001519c8,2363,1382896,1382896,0,"Move From [287, 289] To [285, 287]",Leftclick,Leftclick,qq,287,244
2515,001519c8,2516,1735021,1735021,0,"Move From [460, 461] To [465, 466]",Leftclick,Leftclick,q,466,256
263545,07bb2245,2374,1282273,1282273,0,"Move From [905, 1314] To [907, 1316]",Leftclick,Leftclick,qqqqq qqqq qqqqqqq qqqq qqqq qqqqqq qqqqqqqq ...,1316,275
263572,07bb2245,2401,1650084,1650084,0,"Move From [565, 743] To [669, 847]",Leftclick,Leftclick,qqqq qqqqqq q qqqq qq qqqqq qqqq qqqq qqqqqqq....,847,275
