In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
import pickle
import re
from scipy.stats import lognorm, skew, kurtosis, entropy
import json

In [None]:
# Milliseconds per second: lets second-based thresholds be expressed on the
# millisecond scale that pause times are compared against.
MS_PER_S = 1000
# Relative locations of the training logs and training scores CSV files.
PATH_TRAIN_LOGS = "./data/external/train_logs.csv"
PATH_TRAIN_OUTCOMES = "./data/external/train_scores.csv"

In [None]:
# Fixed category list handed to the one-hot encoders and looped over when
# building per-activity columns. 'Move' is the collapsed label that
# enrich_activity assigns to raw 'Move From ...' records.
ACTIVITY_CATEGORIES = ['Nonproduction', 'Input', 'Remove/Cut', 'Replace', 'Paste', 'Move']

In [None]:
# might obtain this file via:
# with open('text_vectorizer_vocabulary.txt', 'w') as vocabulary_file:
#     json.dump(pipeline.vocabulary_, vocabulary_file)

# PRETRAINED_TEXT_VOCABULARY = None # if no pre-trained file
# Use a context manager so the file handle is closed as soon as the
# vocabulary is parsed, rather than whenever the garbage collector runs.
with open('text_vectorizer_vocabulary.txt') as vocabulary_file:
    PRETRAINED_TEXT_VOCABULARY = json.load(vocabulary_file)

In [None]:
# might obtain this file via:
# with open('events_vectorizer_vocabulary.txt', 'w') as vocabulary_file:
#     json.dump(pipeline.vocabulary_, vocabulary_file)

# PRETRAINED_EVENTS_VOCABULARY = None # if no pre-trained file
# Use a context manager so the file handle is closed as soon as the
# vocabulary is parsed, rather than whenever the garbage collector runs.
with open('events_vectorizer_vocabulary.txt') as vocabulary_file:
    PRETRAINED_EVENTS_VOCABULARY = json.load(vocabulary_file)

In [None]:
# With large vectorizer vocabularies, it is expedient to train offline and
# deploy with this fixed, preselected feature list.
# FEATURES_PRESELECTED = None
FEATURES_PRESELECTED = [
    'n_characters',
    'activity_Input_per_s',
    'word_count_delta',
    'latency_time_1.0',
    'word_count_delta_per_s',
    'n_thought_delimiting_punctuation',
    'vocab12047',
    'n_commas',
    'cursor_position_vs_max_4.0',
    'cursor_position_delta_2.0',
    'keystroke_speed',
    'vocab0',
    'latency_time_mean',
    'vocab10215',
    'activity_Input',
    'latency_time_3.0',
    'word_count_delta_burst_thin_1.0',
    'activity_streak_length_thin_0.0',
    'preceding_pause_time_0.0',
    'vocab15304',
    'word_count_delta_activity_streak_thin_1.0',
    'vocab13729',
    'preceding_pause_time_p50',
    'event46',
    'latency_time_4.0',
    'vocab16789',
    'latency_time_p50',
    'word_count_delta_frac_total_entropy',
    'n_sentences',
    'vocab8374',
    'vocab944',
    'vocab4322',
    'vocab2169',
    'vocab6413',
    'vocab1449',
    'is_new_activity_streak_start_Nonproduction',
    'word_count_delta_activity_streak_thin_0.0',
    'vocab8375',
    'is_new_activity_streak_start_Nonproduction_frac_total_entropy',
    'words_length_geq8_lt25_frac',
    'latency_time_0.0',
    'preceding_pause_time_lognorm_scale',
    'vocab575',
    'n_sentences_words_geq25_lt30',
    'vocab1122',
    'latency_time_5.0',
    'vocab10216',
    'vocab6414',
    'is_new_activity_streak_start_Nonproduction_per_s',
    'word_count_delta_burst_thin_mean',
    'vocab16956',
    'cursor_position_vs_max_1.0',
    'vocab11230',
    'n_dashes',
    'n_sentences_words_geq30_lt50',
    'vocab1602',
    'latency_time_lognorm_scale',
    'vocab761',
    'is_new_activity_streak_start_Input_per_s',
    'vocab14064',
    'is_new_activity_streak_start_Input',
    'latency_time_6.0',
    'latency_time_2.0',
    'cursor_position_vs_max_3.0',
    'event9',
    'vocab12048',
    'vocab4525',
    'vocab6608',
    'words_length_mean',
    'vocab10918',
    'vocab6974',
    'activity_Remove/Cut',
    'vocab2540',
    'vocab4323',
    'latency_time',
    'vocab18206',
    'vocab5264',
    'vocab13901',
    'vocab3272',
    'words_length_stddev',
    'n_sentences_words_geq15_lt20',
    'word_count_delta_burst_thin_stddev',
    'vocab2916',
    'n_sentences_words_geq20_lt25',
    'vocab17132',
    'vocab5436',
    'vocab10400',
    'is_new_activity_streak_start_Remove/Cut',
    'n_sentences_words_geq5_lt10_frac',
    'vocab1291',
    'vocab5093',
    'activity_Input_frac_total_entropy',
    'vocab186',
    'is_new_activity_streak_start_Input_frac_total_entropy',
    'cursor_position_delta_1.0',
    'cursor_position_vs_max_2.0',
    'vocab10755',
    'vocab8565',
    'event18',
    'vocab10578',
    'vocab2170',
    'vocab15467',
    'vocab4716',
    'vocab8937',
    'vocab12867',
    'vocab11074',
    'n_apostrophe',
    'cursor_position_delta_stddev',
    'n_sentences_words_geq25_lt30_frac',
    'i_words_by_sentence_stddev',
    'pause_time_fraction',
    'is_new_activity_streak_start_Remove/Cut_per_s',
    'i_words_by_sentence_p50',
    'activity_Remove/Cut_per_s',
    'activity_Replace_frac_total_entropy',
    'n_sentences_words_geq30_lt50_frac',
    'vocab7144',
    'vocab19262',
    'i_words_by_sentence_mean',
    'vocab2732',
    'vocab6797',
    'activity_Nonproduction_frac_total_entropy',
    'vocab7299',
    'cursor_position_vs_max_stddev',
    'vocab383',
    'words_per_thought_delimiting_punctuation_avg',
    'words_length_geq4_lt5_frac',
    'vocab3605',
    'vocab16790',
    'activity_Remove/Cut_frac_total_entropy',
    'vocab14229',
    'vocab12407',
    'vocab15305',
    'vocab13730',
    'vocab2352',
    'vocab3440',
    'vocab12231',
    'vocab7608',
    'n_paragraphs_with_n_sentences_geq0_lt2_frac',
    'event45',
    'vocab5897',
    'vocab8751',
    'n_questions',
    'cursor_position_vs_max_mean',
    'latency_time_stddev',
    'vocab1',
    'n_sentences_words_geq20_lt25_frac',
    'words_length_geq3_lt4_frac',
    'vocab10579',
    'preceding_pause_time',
    'n_paragraphs',
    'vocab12727',
    'preceding_pause_time_lognorm_shape',
    'vocab9259',
    'word_count_delta_burst_thin_p50',
    'vocab3092',
    'word_count_delta_activity_streak_thin_mean',
    'latency_time_9.0',
    'vocab576',
    'words_length_geq0_lt2_frac',
    'vocab11370',
    'vocab15797',
    'vocab384',
    'cursor_position_vs_max_0.0',
    'n_sentences_words_geq15_lt20_frac',
    'activity_Nonproduction_per_s',
    'is_new_activity_streak_start_Replace_frac_total_entropy',
    'latency_time_lognorm_shape',
    'vocab3765',
    'vocab14372',
    'activity_streak_length_thin_mean',
    'vocab3093',
    'activity_Replace_per_s',
    'is_new_activity_streak_start_Remove/Cut_frac_total_entropy',
    'preceding_pause_time_mean',
    'vocab5598',
    'vocab2917',
    'vocab762',
    'vocab17410',
    'vocab11630',
    'n_sentences_words_geq10_lt15_frac',
    'vocab14503',
    'activity_streak_length_thin_stddev',
    'vocab16184',
    'words_length_geq7_lt8_frac',
    'vocab16057',
    'is_new_burst_start_Input_ttrend',
    'vocab15647',
    'event30',
    'n_paragraphs_with_n_sentences_geq0_lt2',
    'n_quotes',
    'is_new_activity_streak_start_Replace_per_s',
    'vocab5752',
    'latency_time_8.0',
    'vocab974',
    'vocab2733',
    'vocab6490',
    'activity_Remove/Cut_ttrend',
    'vocab945',
    'vocab12263',
    'latency_time_ttrend',
    'vocab7893',
    'preceding_pause_time_lognorm_location',
    'vocab6798',
    'vocab4911',
    'is_new_activity_streak_start_Replace',
    'delete_insert_ratio',
    'vocab2541',
    'vocab7457',
    'vocab218',
    'preceding_pause_time_ttrend',
    'activity_Replace',
    'is_new_activity_streak_start_Remove/Cut_ttrend',
    'activity_Input_ttrend',
    'vocab13390',
    'vocab2383',
    'is_new_burst_start_ttrend',
    'words_length_geq2_lt3_frac',
    'vocab8580',
    'vocab235',
    'latency_time_frac_total_entropy',
    'is_new_burst_start_Nonproduction_ttrend',
    'vocab1123',
    'is_new_activity_streak_start_Nonproduction_ttrend',
    'word_count_delta_activity_streak_thin_stddev',
    'activity_Nonproduction',
    'vocab202',
    'vocab4368',
    'vocab6444',
    'vocab15934',
    'words_length_geq6_lt7_frac',
    'vocab9100',
    'vocab17534',
    'vocab2617',
    'preceding_pause_time_frac_total_entropy',
    'vocab13146',
    'is_new_burst_start_Nonproduction_per_s',
    'vocab12063',
    'is_new_activity_streak_start_Input_ttrend',
    'is_new_burst_start_Nonproduction_frac_total_entropy',
    'is_new_burst_start_Remove/Cut_per_s',
    'cursor_position_delta_mean',
    'words_length_geq5_lt6_frac',
    'vocab5094',
    'vocab7752',
    'vocab4415',
    'vocab4031',
    'vocab6429',
    'vocab8450',
    'vocab6415',
    'total_time',
    'vocab960',
    'vocab12568',
    'vocab279',
    'preceding_pause_time_stddev',
    'vocab13275',
    'is_new_burst_start',
    'vocab9559',
    'vocab10261',
    'vocab8465',
    'latency_time_7.0',
    'is_new_burst_start_Nonproduction',
    'vocab591',
    'is_new_burst_start_frac_total_entropy',
    'activity_streak_length_thin_p50',
    'word_count_delta_ttrend',
    'is_new_burst_start_per_s',
    'is_new_burst_start_Remove/Cut_frac_total_entropy',
    'cursor_position_delta_0.0',
    'vocab2762',
    'vocab3273',
    'n_sentences_words_geq50_lt5000_frac',
    'latency_time_lognorm_location',
    'vocab4927',
    'vocab5123',
    'is_new_burst_start_Input_per_s',
    'preceding_pause_time_max',
    'vocab635',
    'vocab2572',
    'initial_pause_time_max',
    'vocab4603',
    'is_new_burst_start_Input',
    'vocab11505',
    'vocab4782',
    'vocab47',
    'vocab13016',
    'vocab16820',
    'vocab4796',
    'vocab9419',
    'vocab8406',
    'vocab16972',
    'vocab1464',
    'vocab2587',
    'vocab8421',
    'vocab10275',
    'vocab250',
    'vocab416',
    'vocab4912',
    'vocab119',
    'word_count_delta_burst_thin_0.0',
    'vocab4526',
    'n_sentences_words_geq10_lt15',
    'vocab6842',
    'vocab76',
    'n_paragraphs_with_n_sentences_geq7_lt10_frac',
    'vocab794',
    'n_sentences_words_geq5_lt10',
    'vocab989',
    'vocab2201',
    'activity_Nonproduction_ttrend',
    'vocab187',
    'vocab1450',
    'vocab6639',
    'vocab32',
    'vocab6683',
    'vocab10919',
    'vocab5295',
    'is_new_burst_start_Remove/Cut_ttrend',
    'vocab16303',
    'vocab1321',
    'vocab11231',
    'vocab4385',
    'vocab9833',
    'vocab4750',
    'vocab6975',
    'is_new_burst_start_Input_frac_total_entropy',
    'vocab4401',
    'vocab10401',
    'vocab13760',
    'vocab4955',
    'vocab6813',
    'vocab2747',
    'vocab778',
    'vocab3289',
    'n_paragraphs_with_n_sentences_geq3_lt4_frac',
    'vocab8752',
    'vocab10231',
    'vocab2398',
    'vocab8435',
    'vocab6625',
    'vocab837',
    'vocab8391',
    'vocab5108',
    'vocab8610',
    'vocab10608',
    'vocab7145',
    'n_paragraphs_with_n_sentences_geq5_lt6_frac',
    'vocab17',
    'vocab8479',
    'vocab15333',
    'vocab1350',
    'vocab651',
    'is_new_activity_streak_start_Replace_ttrend',
    'vocab4717',
    'vocab4324',
    'vocab3123',
    'vocab1292',
    'vocab91',
    'vocab6609',
    'vocab606',
    'vocab4430',
    'vocab14768',
    'vocab2288',
    'vocab3913',
    'vocab4353',
    'n_paragraphs_with_n_sentences_geq7_lt10',
    'vocab12107',
    'vocab1603',
    'vocab4339',
    'vocab1017',
    'vocab17278',
    'vocab5265',
    'vocab4574',
    'vocab14646',
    'vocab6536',
    'vocab18207',
    'vocab8566',
    'vocab1617',
    'vocab4617',
    'vocab808',
    'event11',
    'event4',
    'vocab6989',
    'vocab8768',
    'n_paragraphs_with_n_sentences_geq6_lt7_frac',
    'vocab4542',
    'vocab15648',
    'activity_Replace_ttrend',
    'vocab11384',
    'vocab6475',
    'vocab666',
    'vocab2647',
    'vocab4588',
    'vocab7159',
    'vocab4734',
    'vocab8376',
    'vocab9942',
    'is_new_burst_start_Remove/Cut',
    'vocab2777',
    'vocab3441',
    'vocab2216',
    'vocab1873',
    'vocab2171',
    'vocab620',
    'vocab433',
    'vocab15499',
    'vocab8595',
    'vocab1184',
    'vocab10756',
    'vocab10446',
    'vocab10289',
    'vocab2560',
    'vocab20263',
    'vocab264',
    'vocab2557',
    'vocab15662',
    'vocab2602',
    'vocab5912',
    'vocab4557',
    'vocab17001',
    'vocab9274',
    'vocab3766',
    'vocab5166',
    'vocab2412',
    'vocab14997',
    'vocab477',
    'n_paragraphs_with_n_sentences_geq4_lt5_frac',
    'vocab5898',
    'vocab7609',
    'vocab448',
    'vocab7174',
    'vocab401',
    'vocab10317',
    'vocab1168',
    'vocab1138',
    'vocab13902',
    'vocab10772',
    'vocab504',
    'vocab10217',
    'vocab10933',
    'vocab12093',
    'vocab18345',
    'vocab12569',
    'vocab4573',
    'vocab6041',
    'vocab13746',
    'n_paragraphs_with_n_sentences_geq5_lt6',
    'vocab2368',
    'vocab8798',
    'vocab3796',
    'vocab17016',
    'event47',
    'vocab10246',
    'n_paragraphs_with_n_sentences_geq3_lt4',
    'vocab2933',
    'vocab16847',
    'vocab6827',
    'vocab105',
    'vocab2',
    'vocab14890',
    'vocab9260',
    'vocab2807',
    'vocab3500',
    'vocab9697',
    'vocab308',
    'vocab7473',
    'vocab2440',
    'vocab2947',
    'vocab4825',
    'vocab6521',
    'vocab2231',
    'vocab19529',
    'vocab7342',
    'vocab14230',
    'vocab17146',
    'vocab4853',
    'vocab12408',
    'vocab5627',
    'vocab10344',
    'vocab14079',
    'vocab8396',
    'n_sentences_words_geq50_lt5000',
    'vocab4767',
    'vocab12079',
    'vocab8624',
    'vocab1336',
    'vocab19263',
    'vocab2426',
    'vocab2186',
    'vocab13802',
    'vocab10649',
    'vocab10474',
    'event3',
    'vocab2261',
    'vocab823',
    'vocab16805',
    'vocab13147',
    'vocab5280',
    'vocab5599',
    'vocab6460',
    'vocab6828',
    'vocab12122',
    'vocab1032',
    'vocab6461',
    'vocab7004',
    'vocab3108',
    'vocab2661',
    'vocab13972',
    'vocab3606',
    'vocab5766',
    'vocab5437',
    'vocab694',
    'vocab8638',
    'vocab4545',
    'vocab6698',
    'vocab4984',
    'vocab1306',
    'n_paragraphs_with_n_sentences_geq10_lt20_frac',
    'n_paragraphs_with_n_sentences_geq4_lt5',
    'vocab6669',
    'vocab1153',
    'vocab293',
    'vocab2962',
    'vocab33',
    'vocab8783',
    'vocab12728',
    'vocab8845',
    'vocab16987',
    'vocab4811',
    'n_paragraphs_with_n_sentences_geq2_lt3_frac',
    'vocab7458',
    'vocab6927',
    'vocab6506',
    'vocab4971',
    'vocab4341',
    'vocab1754',
    'vocab10431',
    'vocab14373',
    'vocab10989',
    'vocab2544',
    'vocab13731',
    'vocab2353',
    'vocab16513',
    'vocab9101',
    'vocab2189',
    'vocab10331',
    'vocab14123',
    'vocab4720',
    'vocab3152',
    'vocab3781',
    'vocab4340',
    'vocab14519',
    'vocab219',
    'vocab10303',
    'vocab17881',
    'vocab8452',
    'vocab4343',
    'vocab4941',
    'vocab10594',
    'vocab17160',
    'vocab11129',
    'latency_time_3.0_time_window',
    'vocab13929',
    'vocab10636',
    'vocab4618',
    'vocab6154',
    'vocab8830',
    'vocab6712',
    'vocab8451',
    'vocab8507',
    'vocab7189',
    'vocab14271',
    'vocab8953',
    'vocab14065',
    'vocab3635',
    'vocab15798',
    'vocab8652',
    'vocab12138',
    'vocab3675',
    'vocab4560',
    'vocab16875',
    'vocab404',
    'vocab17784',
    'vocab12178',
    'vocab6654',
    'vocab18825',
    'vocab18495',
    'vocab10249',
    'vocab8407',
    'vocab388',
    'vocab4342',
    'vocab2543',
    'vocab3334',
    'vocab14258',
    'vocab4718',
    'vocab7651',
    'vocab206',
    'vocab2246',
    'vocab2749',
    'vocab2976',
    'vocab7313',
    'vocab1363',
    'vocab1632',
    'vocab4386',
    'vocab7046',
    'vocab17044',
    'vocab2545',
    'n_paragraphs_with_n_sentences_geq10_lt20',
    'vocab2632',
    'vocab5338',
    'vocab17675',
    'vocab15319',
    'vocab48',
    'vocab582',
    'vocab5926',
    'vocab519',
    'vocab2736',
    'vocab4604',
    'vocab11644',
    'vocab3513',
    'vocab12868',
    'vocab20',
    'vocab10786',
    'vocab387',
    'vocab4344',
    'vocab12757',
    'vocab419',
    'vocab4784',
    'vocab12277',
    'vocab11075',
    'vocab3836',
    'vocab17173',
    'vocab4916',
    'vocab4445',
    'vocab17279',
    'vocab1003',
    'vocab8493',
    'vocab5465',
    'vocab5138',
    'vocab9115',
    'vocab2603',
    'vocab580',
    'vocab221',
    'vocab4737',
    'vocab220',
    'vocab3275',
    'vocab12317',
    'vocab5324',
    'vocab265',
    'vocab9130',
    'vocab11245',
    'vocab62',
    'vocab12232',
    'vocab10460',
    'vocab5267',
    'n_sentences_words_geq0_lt5_frac',
    'vocab4356',
    'vocab2575',
    'vocab11142',
    'vocab2542',
    'vocab2371',
    'vocab236',
    'vocab252',
    'vocab2792',
    'vocab10262',
    'vocab2675',
    'vocab405',
    'vocab7907',
    'vocab462',
    'vocab2574',
    'event2',
    'vocab2385',
    'vocab4752',
    'vocab866',
    'vocab4459',
    'vocab2738',
    'vocab8938',
    'vocab5780',
    'event5',
    'vocab266',
    'activity_Input_time_norm_1.0_time_window',
    'vocab6640',
    'vocab1519',
    'vocab4605',
    'vocab8378',
    'vocab61',
    'vocab3098',
    'vocab417',
    'vocab7356',
    'vocab11371',
    'vocab4562',
    'vocab10948',
    'vocab10416',
    'vocab35',
    'vocab4631',
    'vocab13998',
    'vocab10234',
    'vocab4590',
    'vocab1047',
    'vocab6801',
    'vocab6830',
    'vocab2604',
    'vocab4930',
    'vocab1197',
    'vocab2188',
    'vocab852',
    'vocab2400',
    'vocab4735',
    'vocab7487',
    'vocab6642',
    'vocab2190',
    'latency_time_4.0_time_window',
    'vocab6611',
    'vocab4839',
    'vocab5268',
    'vocab2918',
    'vocab8693',
    'vocab418',
    'vocab434',
    'vocab10622',
    'n_parenthetical_punctuation',
    'is_new_burst_start_1.0_time_window',
    'vocab10582',
    'vocab2427',
    'vocab8784',
    'vocab4558',
    'vocab12451',
    'vocab6432',
    'vocab12423',
    'words_length_p50',
    'vocab4719',
    'vocab4355',
    'vocab203',
    'vocab9315',
    'vocab3305',
    'vocab1294',
    'vocab4357',
    'vocab13830',
    'vocab11397',
    'is_new_burst_start_Input_1.0_time_window',
    'vocab479',
    'vocab10828',
    'vocab5012',
    'vocab977',
    'vocab4348',
    'vocab6550',
    'vocab4768',
    'vocab2588',
    'vocab4751',
    'vocab3471',
    'vocab2275',
    'vocab15376',
    'vocab5451',
    'vocab16957',
    'vocab17133',
    'vocab6857',
    'vocab6464',
    'vocab491',
    'vocab595',
    'vocab280',
    'vocab12570',
    'vocab2573',
    'vocab8422',
    'vocab9420',
    'vocab2752',
    'vocab4753',
    'vocab13017',
    'vocab4327',
    'vocab5353',
    'vocab5037',
    'vocab3095',
    'vocab5111',
    'vocab4918',
    'is_new_burst_start_Input_0.0_time_window',
    'vocab2589',
    'vocab10977',
    'vocab764',
    'vocab2992',
    'n_paragraphs_with_n_sentences_geq2_lt3',
    'vocab7624',
    'vocab3621',
    'vocab3166',
    'activity_Nonproduction_3.0_time_window',
    'vocab608',
    'vocab16834',
    'vocab2763',
    'vocab15950',
    'is_new_activity_streak_start_Input_4.0_time_window',
    'vocab4740',
    'cursor_position_vs_max_p50',
    'activity_Input_7.0_time_window',
    'vocab13391',
    'vocab10787',
    'vocab2386',
    'latency_time_frac_total_3.0_time_window',
    'vocab222',
    'vocab4354',
    'vocab11271',
    'vocab2384',
    'vocab20839',
    'vocab2735',
    'vocab797',
    'vocab6493',
    'vocab5152',
    'activity_Nonproduction_frac_total_0.0_time_window',
    'vocab3139',
    'vocab8968',
    'vocab15935',
    'is_new_activity_streak_start_Input_5.0_time_window',
    'vocab4769',
    'vocab10581',
    'preceding_pause_time_0.0_time_window',
    'vocab385',
    'vocab389',
    'vocab7300',
    'vocab12248',
    'vocab6871',
    'vocab435',
    'vocab5640',
    'n_sentences_words_geq0_lt5',
    'vocab3292',
    'vocab240',
    'vocab4369',
    'vocab12742',
    'vocab12151',
    'vocab13985',
    'vocab3347',
    'vocab8871',
    'vocab17029',
    'activity_Input_time_norm_0.0_time_window',
    'vocab6727',
    'latency_time_frac_total_2.0_time_window',
    'vocab2590',
    'vocab1493',
    'vocab5181',
    'vocab12049',
    'vocab50',
    'vocab6977',
    'vocab3033',
    'vocab14400',
    'preceding_pause_time_frac_total_1.0_time_window',
    'activity_Input_frac_total_3.0_time_window',
    'vocab2935',
    'vocab5110',
    'vocab8436',
    'vocab10488',
    'vocab12599',
    'vocab10501',
    'vocab14094',
    'vocab12331',
    'vocab8814',
    'vocab2300',
    'is_new_activity_streak_start_Remove/Cut_time_norm_2.0_time_window',
    'activity_Remove/Cut_0.0_time_window',
    'vocab609',
    'vocab12438',
    'vocab975',
    'vocab191',
    'vocab11191',
    'vocab4370',
    'vocab8133',
    'vocab15483',
    'vocab6656',
    'vocab11752',
    'vocab6437',
    'is_new_activity_streak_start_Input_frac_total_6.0_time_window',
    'activity_Remove/Cut_frac_total_1.0_time_window',
    'vocab9158',
    'vocab294',
    'vocab3111',
    'vocab4956',
    'vocab2822',
    'vocab11089',
    'vocab2388',
    'vocab7202',
    'vocab7753',
    'vocab9572',
    'vocab10584',
    'is_new_burst_start_frac_total_2.0_time_window',
    'vocab8380',
    'vocab13789',
    'vocab7328',
    'vocab5097',
    'vocab4933',
    'vocab238',
    'event29',
    'vocab2399',
    'vocab2548',
    'vocab204',
    'vocab5613',
    'vocab4913',
    'vocab2546',
    'is_new_burst_start_Input_time_norm_0.0_time_window',
    'word_count_delta_frac_total_7.0_time_window',
    'vocab963',
    'is_new_activity_streak_start_Nonproduction_1.0_time_window',
    'vocab948',
    'vocab5096',
    'vocab2750',
    'vocab3096',
    'vocab17057',
    'activity_Nonproduction_frac_total_1.0_time_window',
    'vocab2558',
    'vocab8667',
    'vocab4345',
    'word_count_delta_1.0_time_window',
    'is_new_activity_streak_start_Move_0.0_time_window',
    'vocab8596',
    'activity_Input_time_norm_7.0_time_window',
    'vocab406',
    'vocab10236',
    'vocab8437',
    'vocab5095',
    'vocab6658',
    'vocab5296',
    'activity_Nonproduction_frac_total_4.0_time_window',
    'vocab14244',
    'is_new_burst_start_Move_frac_total_0.0_time_window',
    'vocab5753',
    'vocab10597',
    'vocab6448',
    'vocab781',
    'vocab11077',
    'vocab1545',
    'vocab3526',
    'is_new_activity_streak_start_Remove/Cut_4.0_time_window',
    'vocab11631',
    'vocab4736',
    'vocab3318',
    'vocab4754',
    'vocab782',
    'vocab1645',
    'vocab2217',
    'vocab6491',
    'vocab422',
    'vocab3335',
    'vocab4388',
    'vocab450',
    'vocab4359',
    'vocab2795',
    'is_new_burst_start_Input_2.0_time_window',
    'vocab964',
    'vocab2934',
    'vocab796',
    'vocab3823',
    'vocab8940',
    'vocab10801',
    'vocab4328',
    'vocab8755',
    'vocab2204',
    'vocab4957',
    'vocab607',
    'activity_Replace_frac_total_0.0_time_window',
    'vocab578',
    'preceding_pause_time_frac_total_0.0_time_window',
    'vocab15514',
    'vocab4770',
    'vocab15665',
    'vocab6742',
    'is_new_activity_streak_start_Paste_per_s',
    'vocab9288',
    'is_new_burst_start_Nonproduction_time_norm_1.0_time_window',
    'vocab18221',
    'vocab5507',
    'vocab6803',
    'is_new_activity_streak_start_Input_time_norm_5.0_time_window',
    'vocab224',
    'vocab5139',
    'word_count_delta_time_norm_6.0_time_window',
    'vocab1154',
    'vocab3485',
    'latency_time_1.0_time_window',
    'vocab10664',
    'vocab10250',
    'vocab5098',
    'vocab2920',
    'vocab15361',
    'vocab10220',
    'vocab1157',
    'vocab7637',
    'activity_Input_frac_total_1.0_time_window',
    'vocab4559',
    'vocab15347',
    'vocab6419',
    'vocab2547',
    'is_new_burst_start_Nonproduction_time_norm_2.0_time_window',
    'activity_Paste_frac_total_entropy',
    'vocab13943',
    'n_paragraphs_with_n_sentences_geq6_lt7',
    'vocab4578',
    'vocab10235',
    'vocab2766',
    'vocab8392',
    'vocab795',
    'vocab825',
    'vocab3399',
    'is_new_activity_streak_start_Input_frac_total_1.0_time_window',
    'vocab2977',
    'vocab6447',
    'activity_Nonproduction_time_norm_0.0_time_window'
    ]

In [None]:
def extract(path):
    """
    Load an event-log CSV and order it by essay and event.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV. It must contain `id` and `event_id` columns.

    Returns
    -------
    pandas.DataFrame
        Rows sorted ascending by (`id`, `event_id`), with a fresh 0..n-1 index.
    """

    X = pd.read_csv(path)
    # Drop the pre-sort index labels. Later steps rebuild the frame from a
    # NumPy array (fresh RangeIndex) and re-attach columns by index label, so
    # a shuffled index left over from sorting would silently misalign rows.
    X = (
        X
        .sort_values(["id", "event_id"], ascending=[True, True])
        .reset_index(drop=True)
    )

    return X

In [None]:
def enrich_activity(X, is_training_run):
    """
    One-hot encode `activity` after collapsing 'Move From ...' records to 'Move'.

    Parameters
    ----------
    X : pandas.DataFrame
        Event log with a string `activity` column. Mutated in place before
        encoding: gains `activity_detailed` (the untouched label) and has
        'Move From ...' activities relabelled 'Move'.
    is_training_run : bool
        If truthy, fit the encoder on X and pickle it to
        "pipeline_activity_onehot.pkl"; otherwise load that pickle.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the encoder's indicator columns (float), the
        passthrough columns with their original dtypes, and the original
        `activity` column. It carries the same index as the input.
    """

    # 'Move From' activity recorded with low-level cursor loc details
    # extract bigger-picture 'Move From'
    # QUESTION: what's the difference between Move From, and a cut+paste?
    X['activity_detailed'] = X['activity']
    X.loc[X['activity'].str.contains('Move From'), 'activity'] = 'Move'

    if is_training_run:

        encoder_kwargs = dict(
            categories=[ACTIVITY_CATEGORIES],
            handle_unknown='infrequent_if_exist'
            )
        try:
            # scikit-learn >= 1.2 spelling; `sparse` was removed in 1.4
            encoder = preprocessing.OneHotEncoder(
                sparse_output=False, **encoder_kwargs
                )
        except TypeError:
            # older scikit-learn only knows the `sparse` keyword
            encoder = preprocessing.OneHotEncoder(
                sparse=False, **encoder_kwargs
                )

        pipeline = ColumnTransformer(
            transformers=[('onehot_encode', encoder, ["activity"])],
            remainder='passthrough',
            verbose_feature_names_out=False
            )

        pipeline.fit(X)

        with open("pipeline_activity_onehot.pkl", "wb") as f:
            pickle.dump(pipeline, f)

    else:
        # NOTE: unpickling can execute arbitrary code -- only load a file
        # written by a trusted training run.
        with open("pipeline_activity_onehot.pkl", "rb") as f:
            pipeline = pickle.load(f)

    original_categorical = X[['activity']]

    X_dtypes = X.dtypes.to_dict()
    transformed = pipeline.transform(X)
    # Reuse the incoming index: a default RangeIndex here would make the
    # label-aligned concat below misplace rows whenever X's index is not
    # already 0..n-1.
    X = pd.DataFrame(
        transformed,
        columns=pipeline.get_feature_names_out(),
        index=original_categorical.index
        )
    X = pd.concat([X, original_categorical], axis=1)
    # The transformer output is one mixed-type array, so every column comes
    # back as `object`. Restore the original dtypes and make the new
    # indicator columns numeric instead of leaving them as objects.
    onehot_dtypes = {
        column: float for column in X.columns if column not in X_dtypes
        }
    X = X.astype({**X_dtypes, **onehot_dtypes})

    return X

In [None]:
def scrub_text_change(X):
    """
    Normalise `text_change` so that non-English characters are anonymised.

    Problems with initial text data:

    - Some hex expressions (\\xHH) not decoded. Instead, written literally.
        - Examples: dashes (such as \\x96), slanted quotations & ticks.

    - Some foreign characters (accent a, overring a) not anonymized with generic q.
    Problem confirmed via Kaggle data viewer, for id-event_id cases like
    0916cdad-39 or 9f328eb3-19. Solutions:
        - An Input event cannot include multiple characters:
        foreign character & something else.
        Then,
            - If Input event contains any emdash, overwrite as strictly emdash
            - If Input event contains no emdash & foreign character, overwrite with single q
            - If Move/Remove/Paste/Replace event, replace each foreign
              character with a single q

    Parameters
    ----------
    X : pandas.DataFrame
        Needs string columns `activity` and `text_change`. Mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with `text_change` rewritten.
    """

    X['text_change_original'] = X['text_change']

    # expect this transforms all \xHH literals
    X['text_change'] = (
        X
        ['text_change_original']
        # arrived at utf-8 encode, windows-1252 decode after several iterations.
        # tested latin-1, but not all \xHH instances caught.
        # tested utf-16, just rose errors.
        # windows-1252 leaves bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D undefined;
        # errors="replace" maps them to U+FFFD instead of raising, and the
        # non-ASCII handling below then anonymises them like any other
        # foreign character.
        .apply(
            lambda x: x
            .encode(encoding='utf-8')
            .decode("windows-1252", errors="replace")
        )
    )

    is_text_change_decode_english = (
        X['text_change'].apply(lambda x: x.isascii())
    )

    is_input_event_foreign_any_emdash = (
        (~ is_text_change_decode_english)
        & (X['activity'] == "Input")
        & (X['text_change'].str.contains("—"))
    )
    X.loc[is_input_event_foreign_any_emdash, 'text_change'] = "—"

    # evaluated after the emdash overwrite above, so those rows are excluded
    is_input_event_foreign_no_overwrite = (
        (~ is_text_change_decode_english)
        & (X['activity'] == "Input")
        & (~ X['text_change'].str.contains("—"))
    )
    X.loc[is_input_event_foreign_no_overwrite, 'text_change'] = 'q'

    # given block text change, proceed one character at a time,
    # replacing foreign ones
    def anonymize_non_ascii(x):
        return "".join(x_i if x_i.isascii() else "q" for x_i in x)

    X['text_change'] = np.where(
        X['activity'].str.contains('Move|Remove|Paste|Replace', regex=True),
        X['text_change'].apply(anonymize_non_ascii),
        X['text_change']
    )

    X.drop(columns='text_change_original', inplace=True)

    return X

In [None]:
PAUSE_THRESHOLD_MS = 1000
N_ACTIVITIES_UNTIL_START_WINDOW_CLOSES = 100

def enrich_pauses(X):
    """
    Derive latency and pause columns; the log has no explicit pause records.

    'Latency' is any gap between releasing one key and pressing the next.
    'Pause' is a latency above PAUSE_THRESHOLD_MS, i.e. more than the
    physical-mechanical delay of typing.

    X is modified in place and returned. It needs `id`, `event_id`,
    `down_time` and `up_time`, and gains: `up_time_lag1`, `latency_time`,
    `preceding_pause_time`, `preceding_pause_time_start_window`,
    `total_pause_time`, `rolling_pause_time`, `rolling_pause_time_fraction`.
    """

    X['up_time_lag1'] = X.groupby(['id'])['up_time'].shift(1)
    X['latency_time'] = X['down_time'] - X['up_time_lag1']

    # The first event of an essay has no predecessor, so its "pause" is the
    # time elapsed before the first key press.
    is_first_event = X['event_id'] == 1
    pause = X['latency_time'].mask(is_first_event, X['down_time'])
    # Sub-threshold (including negative) gaps are not real pauses: blank them.
    X['preceding_pause_time'] = pause.mask(pause <= PAUSE_THRESHOLD_MS)

    # Tagging the "initial planning pause" by wall-clock time proved fragile
    # (a first pause can outlast any fixed number of minutes), so the window
    # is defined by event count instead.
    # NOTE(review): this blanks pauses *inside* the first
    # N_ACTIVITIES_UNTIL_START_WINDOW_CLOSES events and keeps later ones,
    # which looks inverted relative to the column name -- confirm intent.
    within_start_window = (
        X['event_id'] <= N_ACTIVITIES_UNTIL_START_WINDOW_CLOSES
    )
    X['preceding_pause_time_start_window'] = (
        X['preceding_pause_time'].mask(within_start_window)
    )

    # Per-essay total, running total, and the running share of the total.
    X['total_pause_time'] = (
        X.groupby(['id'])['preceding_pause_time'].transform('sum')
    )
    X['rolling_pause_time'] = (
        X.groupby(['id'])['preceding_pause_time'].cumsum()
    )
    X['rolling_pause_time_fraction'] = (
        X['rolling_pause_time'].div(X['total_pause_time'])
    )

    return X

In [None]:
SECONDS_PER_BURST = 2

def enrich_time_bursts(X, is_training_run):
    """
    Segment each essay's events into "bursts" separated by long pauses.

    If a pause exceeds SECONDS_PER_BURST seconds, the previous burst has
    ended and a new one starts. Each burst is characterized by one dominant
    activity: the category with the most events in the burst.

    Parameters
    ----------
    X : pandas.DataFrame
        Needs `id`, `event_id`, `down_time`, `up_time`,
        `preceding_pause_time` and one `activity_<category>` column per entry
        of ACTIVITY_CATEGORIES. Mutated in place before encoding.
    is_training_run : bool
        If truthy, fit the burst-type encoder on X and pickle it to
        "pipeline_burst_type_onehot.pkl"; otherwise load that pickle.

    Returns
    -------
    pandas.DataFrame
        A new frame, carrying the input's index, with these added columns:
        `is_new_burst_start`, `burst_id`, `burst_time_start`,
        `burst_time_end`, `burst_time_duration`, `burst_events_<category>`,
        `burst_type`, `burst_type_<category>` (float indicators) and
        `is_new_burst_start_<category>`.
    """

    X['is_new_burst_start'] = (
        X['preceding_pause_time'] > MS_PER_S * SECONDS_PER_BURST
        ).astype(int)
    # the first event of an essay always opens a burst
    X.loc[X['event_id'] == 1, 'is_new_burst_start'] = 1
    # running count of burst starts doubles as a per-essay burst identifier
    X['burst_id'] = (
        X
        .groupby(['id'])
        ['is_new_burst_start']
        .cumsum()
        )
    X['burst_time_start'] = (
        X
        .groupby(['id', 'burst_id'])
        ['down_time']
        .transform('min')
        )
    X['burst_time_end'] = (
        X
        .groupby(['id', 'burst_id'])
        ['up_time']
        .transform('max')
        )
    X['burst_time_duration'] = X['burst_time_end'] - X['burst_time_start']

    # events of each activity category within the burst
    for activity in ACTIVITY_CATEGORIES:

        X['burst_events_' + activity] = (
            X
            .groupby(['id', 'burst_id'])
            ['activity_' + activity]
            .transform('sum')
            ).astype(float)

    # dominant activity = category with the highest event count in the burst
    X['burst_type'] = (
        X
        [['burst_events_' + activity for activity in ACTIVITY_CATEGORIES]]
        .idxmax(axis=1)
        )
    X['burst_type'] = X['burst_type'].str.replace(
        "burst_events_", "", regex=True
        )

    if is_training_run:

        encoder_kwargs = dict(
            categories=[ACTIVITY_CATEGORIES],
            handle_unknown='infrequent_if_exist'
            )
        try:
            # scikit-learn >= 1.2 spelling; `sparse` was removed in 1.4
            encoder = preprocessing.OneHotEncoder(
                sparse_output=False, **encoder_kwargs
                )
        except TypeError:
            # older scikit-learn only knows the `sparse` keyword
            encoder = preprocessing.OneHotEncoder(
                sparse=False, **encoder_kwargs
                )

        pipeline = ColumnTransformer(
            transformers=[('onehot_encode', encoder, ["burst_type"])],
            remainder='passthrough',
            verbose_feature_names_out=False
            )

        pipeline.fit(X)

        with open("pipeline_burst_type_onehot.pkl", "wb") as f:
            pickle.dump(pipeline, f)

    else:
        # NOTE: unpickling can execute arbitrary code -- only load a file
        # written by a trusted training run.
        with open("pipeline_burst_type_onehot.pkl", "rb") as f:
            pipeline = pickle.load(f)

    original_categorical = X['burst_type']
    X_dtypes = X.dtypes.to_dict()
    transformed = pipeline.transform(X)
    # Reuse the incoming index: a default RangeIndex here would make the
    # label-aligned concat below misplace rows whenever X's index is not
    # already 0..n-1.
    X = pd.DataFrame(
        transformed,
        columns=pipeline.get_feature_names_out(),
        index=original_categorical.index
        )
    X = pd.concat([X, original_categorical], axis=1)
    # The transformer output is one mixed-type array, so every column comes
    # back as `object`. Restore the original dtypes and make the new
    # indicator columns numeric instead of leaving them as objects.
    onehot_dtypes = {
        column: float for column in X.columns if column not in X_dtypes
        }
    X = X.astype({**X_dtypes, **onehot_dtypes})

    # flag burst starts separately for each dominant activity
    for activity in ACTIVITY_CATEGORIES:
        X['is_new_burst_start_' + activity] = (
            X['is_new_burst_start'] *
            X['burst_type_' + activity]
            )

    return X

In [None]:
def enrich_activity_streaks(X):
    """
    Mark runs of consecutive events that share the same activity.

    Consecutive activity (independent of time) suggests productive writing flow.

    Parameters
    ----------
    X : pandas.DataFrame
        Needs `id`, `event_id`, `activity` and one `activity_<category>`
        column per entry of ACTIVITY_CATEGORIES. Mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with these added columns: `activity_lag1`,
        `is_new_activity_streak_start`, `is_activity_streak_end`,
        `activity_streak_id`, `activity_streak_length_thin` (streak length on
        the streak's last row, NaN elsewhere) and
        `is_new_activity_streak_start_<category>`.
    """

    X['activity_lag1'] = X.groupby(['id'])['activity'].shift(1)

    X['is_new_activity_streak_start'] = (
        X['activity'] != X['activity_lag1']
        ).astype(int)
    X.loc[X['event_id'] == 1, 'is_new_activity_streak_start'] = 1

    # a row ends its streak when the next row of the same essay starts one;
    # the last row of an essay has no successor and always ends its streak
    X['is_activity_streak_end'] = (
        X
        .groupby(['id'])
        ['is_new_activity_streak_start']
        .shift(-1)
        .fillna(1)
        )

    # running count of streak starts doubles as a per-essay streak identifier
    X['activity_streak_id'] = (
        X
        .groupby(['id'])
        ['is_new_activity_streak_start']
        .cumsum()
    )

    streak_length = (
        X
        .groupby(['id', 'activity_streak_id'])
        ['activity']
        .transform('size')
    )
    # Keep the length only on each streak's final row so downstream
    # aggregations count every streak once. `.where` builds the NaN-holding
    # float column directly, rather than assigning None into an int64 column,
    # which depends on implicit upcasting that pandas has deprecated.
    X['activity_streak_length_thin'] = streak_length.where(
        X['is_activity_streak_end'] != 0
        )

    for activity in ACTIVITY_CATEGORIES:
        X['is_new_activity_streak_start_' + activity] = (
            X["activity_" + activity] * X['is_new_activity_streak_start']
            )

    return X

In [None]:
def enrich_word_count(X):
    """
    Derive word-count change per event, per burst and per activity streak.

    Word count is a primary productivity measure; score is expected to
    increase with it.

    Adds these columns to X in place and returns X:
        word_count_lag1: the writer's word count at the previous event.
        word_count_delta: change in word count versus the previous event
            (null on a writer's first event).
        word_count_delta_burst / word_count_delta_activity_streak:
            delta summed over the event's burst / activity streak.
        *_thin: the same totals kept only on the first event of each
            burst / streak (null elsewhere), so that downstream aggregation
            sees one value per burst / streak.
    """

    X['word_count_lag1'] = X.groupby(['id'])['word_count'].shift(1)
    X['word_count_delta'] = X['word_count'].sub(X['word_count_lag1'])

    # (column suffix, grouping id within writer, flag marking group's first event)
    groupings = (
        ('burst', 'burst_id', 'is_new_burst_start'),
        ('activity_streak', 'activity_streak_id', 'is_new_activity_streak_start'),
    )
    for suffix, group_id, start_flag in groupings:

        total_col = 'word_count_delta_' + suffix
        X[total_col] = (
            X.groupby(['id', group_id])['word_count_delta'].transform('sum')
            )

        # de-duplicate to one value per group: keep the total only where the
        # group starts, null everywhere else
        X[total_col + '_thin'] = X[total_col].where(X[start_flag] != 0)

    return X

In [None]:
def enrich_cursor_position(X):
    """
    Describe cursor movement relative to the previous event and to the
    furthest point the writer has typed so far.

    Theory: one-way cursor movement might be more productive, vs jumping around.

    Adds these columns and returns the enriched frame:
        cursor_position_lag1: the writer's previous cursor position (0 on
            the first event).
        cursor_position_delta: movement since the previous event.
        cursor_position_cummax: running maximum cursor position reached by
            "Input" events only, carried forward over other events (0 until
            the writer's first Input).
        cursor_position_vs_max: current position minus that running maximum.
    """

    previous_position = X.groupby(['id'])['cursor_position'].shift(1)
    X['cursor_position_lag1'] = previous_position.fillna(0)
    X['cursor_position_delta'] = (
        X['cursor_position'].sub(X['cursor_position_lag1'])
        )

    # only typed input counts toward the running maximum: a position gained
    # via copy+paste (perhaps of the essay prompt) doesn't reflect
    # grade-driving output
    typed_input = X['activity'] == "Input"
    X['cursor_position_input'] = np.where(
        typed_input, X["cursor_position"], np.nan
        )

    # cummax hands back a plain (ungrouped) Series, so the carry-forward over
    # non-Input events must be grouped by writer again
    running_max = X.groupby(['id'])['cursor_position_input'].cummax()
    X['cursor_position_cummax'] = (
        running_max.groupby(X['id']).ffill().fillna(0)
        )

    X['cursor_position_vs_max'] = (
        X['cursor_position'].sub(X['cursor_position_cummax'])
        )

    # helper column is not part of the returned frame
    return X.drop(columns='cursor_position_input')

In [None]:
TOTAL_MIN_MAX_EXPECTED = 30
TOTAL_MIN_PLUS_BUFFER = 150 # id 21bbc3f6 case extended to 140 min ... odd
SECONDS_PER_MIN = 60
SECONDS_PER_WINDOW = 30

def enrich_time_windows(X):
    """
    Assign every event to a fixed-length time window and flag overtime events.

    Adds these columns to X in place and returns X:
        window_30s: categorical interval (in ms) containing the event's
            down_time. Windows are SECONDS_PER_WINDOW long and span from 0
            through TOTAL_MIN_PLUS_BUFFER minutes inclusive; a down_time
            outside that span gets a null window.
        is_time_beyond_expected_max: 1 if the event's up_time falls after
            TOTAL_MIN_MAX_EXPECTED minutes, else 0.
    """

    # windows allow for time-sequence features
    # expect that some essays extend beyond 30 min described in 'Data Collection'
    # downstream, **do not tabulate over a writer's unused time windows**!!

    window_ms = SECONDS_PER_WINDOW * MS_PER_S
    buffer_end_ms = TOTAL_MIN_PLUS_BUFFER * SECONDS_PER_MIN * MS_PER_S

    X['window_30s'] = pd.cut(
        X['down_time'],
        # np.arange excludes its stop value: extend by one step so the last
        # window edge lands exactly on the buffer end
        bins=np.arange(0, buffer_end_ms + window_ms, window_ms),
        # bins are right-closed, so without this an event at down_time == 0
        # would fall outside the first window and get a null window
        include_lowest=True
        )

    X['is_time_beyond_expected_max'] = (
        X['up_time'] > TOTAL_MIN_MAX_EXPECTED * SECONDS_PER_MIN * MS_PER_S
    ).astype(int)

    return X

In [None]:
def subset_features(X):
    """
    Keep only the enriched log columns consumed downstream, in a fixed order.

    Raises KeyError if X lacks any of the expected columns.
    """

    activity_names = (
        'Nonproduction', 'Input', 'Remove/Cut', 'Replace', 'Paste', 'Move'
        )

    def per_activity(prefix):
        # one column per activity category, e.g. "burst_type_Input"
        return [prefix + name for name in activity_names]

    columns_kept = (
        # identifiers & time windows
        ["id", "event_id", "is_time_beyond_expected_max", "window_30s"]

        # bursts
        + ["burst_id", "burst_type"]
        + per_activity("burst_type_")
        + ["is_new_burst_start"]
        + per_activity("is_new_burst_start_")
        + ["burst_time_start", "burst_time_end", "burst_time_duration"]
        + per_activity("burst_events_")
        + ["word_count_delta_burst", "word_count_delta_burst_thin"]

        # activity streaks
        + ["activity_streak_id", "is_new_activity_streak_start"]
        + per_activity("is_new_activity_streak_start_")
        + [
            "is_activity_streak_end",
            "activity_streak_length_thin",
            "word_count_delta_activity_streak",
            "word_count_delta_activity_streak_thin",
            ]

        # logged fields & activity one-hots
        + ["down_time", "up_time", "action_time", "activity_detailed", "activity"]
        + per_activity("activity_")
        + ["down_event", "up_event", "text_change", "cursor_position", "word_count"]

        # cursor position
        + ["cursor_position_delta", "cursor_position_vs_max", "cursor_position_cummax"]

        # word count
        + ["word_count_lag1", "word_count_delta"]

        # pauses
        + [
            "up_time_lag1",
            "latency_time",
            "preceding_pause_time",
            "preceding_pause_time_start_window",
            "rolling_pause_time",
            "rolling_pause_time_fraction",
            "total_pause_time",
            ]
        )

    return X[columns_kept]

In [None]:
def concatenate_essay_from_logs(df):
    """
    Rebuild one writer's essay text by replaying the logged text changes.

    Expect df to be *one* author's log, in event order, holding columns
    'id', 'activity', 'activity_detailed', 'cursor_position' and
    'text_change'. Events whose 'activity' is 'Nonproduction' are ignored;
    every other event is dispatched on 'activity_detailed'.

    Returns a one-row DataFrame with columns 'id' and 'essay'.

    Adapted from sources:
        https://www.kaggle.com/code/hiarsl/feature-engineering-sentence-paragraph-features,
        https://www.kaggle.com/code/kawaiicoderuwu/essay-contructor.
    """

    def insert_ending_at(text, cursor_after, inserted):
        # the logged cursor position is taken *after* the event, so the
        # inserted text begins len(inserted) characters before it.
        # ex: "the cat", insert "blue " ending at 9 -> "the blue cat"
        start = cursor_after - len(inserted)
        return text[:start] + inserted + text[start:]

    is_production = df.activity != 'Nonproduction'
    production_events = df.loc[
        is_production,
        ['activity_detailed', 'cursor_position', 'text_change']
        ]

    essay_text = ""
    for activity, cursor_after, text_change in production_events.values:

        if activity == 'Replace':

            # log format: "<removed text> => <added text>"
            replace_parts = text_change.split(' => ')
            removed = replace_parts[0]
            added = replace_parts[1]
            start = cursor_after - len(added)

            # ex: "the blue cat", replace "blue" with "red":
            # keep "the ", add "red", then resume *after* "blue" -> " cat"
            essay_text = (
                essay_text[:start]
                + added
                + essay_text[start + len(removed):]
                )

        elif activity == 'Paste':

            essay_text = insert_ending_at(essay_text, cursor_after, text_change)

        elif activity == 'Remove/Cut':

            # cursor rests where the removed text used to begin: skip over it
            essay_text = (
                essay_text[:cursor_after]
                + essay_text[cursor_after + len(text_change):]
                )

        elif "Move" in activity:

            # log format: "Move From [a, b] To [c, d]";
            # the first 10 characters are "Move From "
            spans = (
                activity[10:]
                .replace("[", "")
                .replace("]", "")
                .split(' To ')
                )
            move_from = [int(bound) for bound in spans[0].split(', ')]
            move_to = [int(bound) for bound in spans[1].split(', ')]
            from_start = move_from[0]
            from_end = move_from[1]
            to_start = move_to[0]

            # note: a move never changes total text length
            if from_start < to_start:

                # block moves toward the end.
                # ex: "the blue cat ran", move "blue" -> "the cat blue ran"
                to_end = move_to[1]
                essay_text = (
                    essay_text[:from_start]              # before impacted window
                    + essay_text[from_end:to_end]        # text the block jumps over
                    + essay_text[from_start:from_end]    # the moved block
                    + essay_text[to_end:]                # after impacted window
                    )

            elif from_start > to_start:

                # block moves toward the start.
                # ex: "the cat ran fast", move "ran" -> "ran the cat fast"
                essay_text = (
                    essay_text[:to_start]                # before impacted window
                    + essay_text[from_start:from_end]    # the moved block
                    + essay_text[to_start:from_start]    # text the block jumps over
                    + essay_text[from_end:]              # after impacted window
                    )

        else:

            # any other activity (ex: Input) inserts its logged text
            essay_text = insert_ending_at(essay_text, cursor_after, text_change)

    return pd.DataFrame({'id': df['id'].unique(), 'essay': [essay_text]})

In [None]:
def enrich_logs(X, is_training_run):
    """
    Run every event-level enrichment step over the raw logs, in dependency
    order, then keep only the columns used downstream.

    is_training_run is forwarded to the steps that take it
    (enrich_activity, enrich_time_bursts) and also gates text scrubbing.
    Progress is printed after each step.
    """

    X = enrich_activity(X, is_training_run)
    print("Enriched activity")

    # live test data raise Exception during decode-encode attempt.
    # still, higher quality model should follow from
    # higher-quality train data
    if is_training_run:
        X = scrub_text_change(X)

    remaining_steps = (
        ("pauses", enrich_pauses),
        ("time bursts", lambda logs: enrich_time_bursts(logs, is_training_run)),
        ("activity streaks", enrich_activity_streaks),
        ("word count", enrich_word_count),
        ("cursor position", enrich_cursor_position),
        ("time windows", enrich_time_windows),
    )
    for label, enrich_step in remaining_steps:
        X = enrich_step(X)
        print("Enriched " + label)

    return subset_features(X)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def vectorize_essay_text(
    X, is_training_run, vocabulary=PRETRAINED_TEXT_VOCABULARY
    ):
    """
    Count 1- to 4-gram occurrences in each writer's reconstructed essay.

    Given higher-order ngram, expect large vocabulary for vectorizer.
    Might prefer pre-trained vocabulary with known phrase-index mappings:
    where indexes have been pre-screened for importance in feature selection.

    Vectorizer choice, in priority order:
        1. a truthy `vocabulary`: use that fixed vocabulary (nothing is
           fitted or pickled);
        2. training run: fit on these essays and pickle the vectorizer to
           "pipeline_text_vectorizer.pkl";
        3. otherwise: load the vectorizer pickled by an earlier training run.

    Returns a dense DataFrame indexed by writer id, with one column
    'vocab{i}' per vocabulary index i.
    """

    essays_by_writer = [
        concatenate_essay_from_logs(writer_log)
        for _, writer_log in X.groupby('id')
        ]
    essays_text = pd.concat(essays_by_writer, axis=0).reset_index(drop=True)

    corpus = essays_text['essay'].tolist()

    if vocabulary:

        pipeline = CountVectorizer(
            input='content', ngram_range=(1, 4), vocabulary=vocabulary
            )

    elif is_training_run:

        pipeline = CountVectorizer(input='content', ngram_range=(1, 4))
        pipeline.fit(corpus)

        with open("pipeline_text_vectorizer.pkl", "wb") as f:
            pickle.dump(pipeline, f)

    else:
        with open("pipeline_text_vectorizer.pkl", "rb") as f:
            pipeline = pickle.load(f)

    ngram_counts = pipeline.transform(corpus).toarray()
    essay_vectorized = pd.DataFrame(
        ngram_counts,
        index=essays_text['id'].values,
        columns=['vocab' + str(i) for i in range(ngram_counts.shape[1])]
        )

    return essay_vectorized

In [None]:
def vectorize_events(
    X, is_training_run, vocabulary=PRETRAINED_EVENTS_VOCABULARY
    ):
    """
    Count how often each logged down_event occurs per writer.

    A keylog "event" differs from an activity. Event examples include:
    leftclick, rightclick, capslock, arrow{direction}, ...
    Why calculate? Competition has found value in these features.

    Vectorizer choice, in priority order:
        1. a truthy `vocabulary`: use that fixed vocabulary (nothing is
           fitted or pickled);
        2. training run: fit on these logs and pickle the vectorizer to
           "pipeline_events_vectorizer.pkl";
        3. otherwise: load the vectorizer pickled by an earlier training run.

    Returns a dense DataFrame indexed by writer id, with one column
    'event{i}' per vocabulary index i.
    """

    # one space-separated "document" of events per writer
    X_events = (
        X
        .groupby('id')
        ['down_event']
        .agg(down_event_seq=" ".join)
        .reset_index()
        )

    corpus = X_events['down_event_seq'].tolist()

    if vocabulary:

        pipeline = CountVectorizer(
            input='content', ngram_range=(1,1), vocabulary=vocabulary
            )

    elif is_training_run:

        pipeline = CountVectorizer(input='content', ngram_range=(1,1))
        pipeline.fit(corpus)

        with open("pipeline_events_vectorizer.pkl", "wb") as f:
            pickle.dump(pipeline, f)

    else:
        with open("pipeline_events_vectorizer.pkl", "rb") as f:
            pipeline = pickle.load(f)

    event_counts = pipeline.transform(corpus).toarray()
    events_vectorized = pd.DataFrame(
        event_counts,
        index=X_events['id'].values,
        columns=['event' + str(i) for i in range(event_counts.shape[1])]
        )

    return events_vectorized
    

In [None]:
def _add_binned_count_features(essays_text, values_col, total_col, bins, prefix):
    """
    Histogram each essay's array in `values_col` over half-open [low, high) bins.

    For every bin, adds to `essays_text` in place:
        '{prefix}_geq{low}_lt{high}': count of array entries within the bin;
        the same name + '_frac': that count divided by `total_col`.

    Returns the added column names in creation order.
    """

    added_columns = []
    for geq_low, lt_high in bins:

        bin_var = f'{prefix}_geq{geq_low}_lt{lt_high}'

        essays_text[bin_var] = (
            essays_text[values_col]
            .apply(
                lambda x, low=geq_low, high=lt_high:
                    ( (x >= low) & (x < high) ).sum()
                )
            )
        essays_text[bin_var + "_frac"] = (
            essays_text[bin_var] / essays_text[total_col]
            )

        added_columns += [bin_var, bin_var + "_frac"]

    return added_columns


def aggregate_essay_text_features(X):
    """
    Aggregates covering final writing product, not writing process narrowly.

    Rebuilds each writer's essay from the logs, then derives counts of
    paragraphs, sentences, words and punctuation, plus histograms of
    sentences per paragraph, words per sentence and characters per word.

    Parameters
    ----------
    X : pandas.DataFrame
        Event log holding the columns concatenate_essay_from_logs reads.

    Returns
    -------
    pandas.DataFrame
        One row per writer, indexed by 'id'.
    """

    # raw strings: "\." and friends are invalid escapes in a plain literal
    sentence_end_pattern = r"[\.]+|[?]+|[!]+"

    essays_text = pd.concat(
        [concatenate_essay_from_logs(x) for _, x in X.groupby('id')], axis=0
        ).reset_index(drop=True)

    # a run of consecutive newlines counts as one paragraph break.
    # blank pieces (from leading / trailing newlines) are not paragraphs;
    # an essay with no text at all still counts as one (empty) paragraph.
    essays_text['paragraphs'] = (
        essays_text['essay']
        .str.split("[\n]+", regex=True)
        .apply(lambda paragraphs: [p for p in paragraphs if p.strip()] or [""])
        )
    # count the paragraphs themselves, not the breaks between them, so the
    # per-paragraph fractions below share the denominator they were built on
    essays_text['n_paragraphs'] = essays_text['paragraphs'].apply(len)
    essays_text['n_sentences_by_paragraph'] = (
        essays_text['paragraphs']
        .apply(lambda paragraphs: np.array([
            len(re.findall(sentence_end_pattern, p))
            for p in paragraphs
            ])
            )
        )
    # for bounds guidance, see overall distribution
    varnames_n_paragraphs_by_n_sentences_bin = _add_binned_count_features(
        essays_text,
        values_col='n_sentences_by_paragraph',
        total_col='n_paragraphs',
        bins=[
            (0, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7),
            (7, 10), (10, 20), (20, 50)
            ],
        prefix='n_paragraphs_with_n_sentences'
        )


    # sentences split can leave last hanging ' ',
    # if not scrubbed by search for 'q'
    # (presumably every real letter in the logged text is written as 'q' --
    # TODO confirm against the data description)
    essays_text['sentences'] = (
        essays_text['essay']
        .str.split(sentence_end_pattern, regex=True)
        .apply(lambda sentences: [s for s in sentences if 'q' in s])
    )
    essays_text['n_sentences'] = essays_text['sentences'].apply(len)

    essays_text['words_by_sentence'] = (
        essays_text['sentences']
        .apply(lambda sentences: [s.split() for s in sentences])
    )
    essays_text['i_words_by_sentence'] = (
        essays_text['words_by_sentence']
        .apply(lambda sentences: np.array([len(s) for s in sentences]))
    )

    # for bounds guidance, see overall distribution
    varnames_n_sentences_by_word_count_bin = _add_binned_count_features(
        essays_text,
        values_col='i_words_by_sentence',
        total_col='n_sentences',
        bins=[
            (0, 5), (5, 10), (10, 15), (15, 20), (20, 25), (25, 30),
            (30, 50), (50, 5000)
            ],
        prefix='n_sentences_words'
        )


    essays_text['words'] = essays_text['essay'].str.split(" +", regex=True)
    essays_text["word_count_reconstructed"] = essays_text["words"].apply(len)
    essays_text["words_length"] = (
        essays_text["words"]
        .apply(lambda x: np.array([len(a) for a in x]))
    )

    # for bounds guidance, see distribution of word lengths
    varnames_i_words_by_length_bin = _add_binned_count_features(
        essays_text,
        values_col='words_length',
        total_col='word_count_reconstructed',
        bins=[
            (0, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8),
            # "incomprehensibilities" is a reasonable, long (21-char) word
            (8, 25),
            (25, 500)
            ],
        prefix='words_length'
        )


    essays_text['n_thought_delimiting_punctuation'] = (
        essays_text
        ['essay']
        .str
        .count(r"[\.]+|[?]+|[!]+|[,]+|[-]+|[;]+|[:]+|[—]+")
        )
    essays_text["words_per_thought_delimiting_punctuation_avg"] = (
        essays_text["word_count_reconstructed"] /
        essays_text['n_thought_delimiting_punctuation']
    )
    essays_text['n_commas'] = essays_text['essay'].str.count("[,]")
    essays_text['n_dashes'] = essays_text['essay'].str.count("[-]")
    essays_text['n_semicolons'] = essays_text['essay'].str.count("[;]")
    essays_text['n_questions'] = essays_text['essay'].str.count("[?]")
    essays_text['n_exclaims'] = essays_text['essay'].str.count("[!]")

    essays_text['n_parenthetical_punctuation'] = (
        essays_text
        ['essay']
        .str
        .count(r"\(|\)|\[|\]|\*|{|}")
    )

    essays_text['n_quant_punctuation'] = (
        essays_text['essay'].str.count(r"=|>|<|\$|\%|\+")
        )

    essays_text['n_apostrophe'] = essays_text['essay'].str.count("'")

    essays_text['n_quotes'] = essays_text['essay'].str.count("\"")

    essays_text['n_shortening_punctuation'] = (
        essays_text['essay'].str.count("&|@")
        )

    essays_text['n_characters'] = essays_text['essay'].str.len()


    for var in ['i_words_by_sentence', 'words_length']:
        essays_text[f"{var}_mean"] = essays_text[var].apply(lambda x: x.mean())
        essays_text[f"{var}_p50"] = (
            essays_text[var].apply(lambda x: np.nanquantile(x, 0.5))
            )
        essays_text[f"{var}_stddev"] = essays_text[var].apply(lambda x: x.std())


    aggregates_essay_text = essays_text[[
        'id',
        'n_paragraphs',
        'n_sentences',

        'n_thought_delimiting_punctuation',
        "words_per_thought_delimiting_punctuation_avg",
        'n_parenthetical_punctuation',
        'n_quant_punctuation',
        'n_apostrophe',
        'n_quotes',
        'n_shortening_punctuation',
        "n_commas",
        "n_dashes",
        "n_semicolons",
        "n_questions",
        "n_exclaims",

        "n_characters",

        "i_words_by_sentence_mean",
        "words_length_mean",
        "i_words_by_sentence_p50",
        "words_length_p50",
        "i_words_by_sentence_stddev",
        "words_length_stddev"
        ]

        + varnames_n_paragraphs_by_n_sentences_bin

        + varnames_n_sentences_by_word_count_bin

        # for word lengths, only the fractions are kept
        + [x for x in varnames_i_words_by_length_bin if '_frac' in x]

        ]
    aggregates_essay_text = aggregates_essay_text.set_index('id')

    return aggregates_essay_text

In [None]:
# How a log field is best summarized depends on what it measures:
#   - a quantity that accumulates is summed;
#       - for discrete events, frequency per unit time is also meaningful
#   - a quantity whose distribution is interesting is summarized;
#       - a continuous quantity is histogrammed to describe the full distribution

# discrete events: summed per writer, and also expressed as rates
event_vars_sum = [
    *[f"activity_{category}" for category in ACTIVITY_CATEGORIES],
    'is_new_burst_start',
    *[f"is_new_burst_start_{category}" for category in ACTIVITY_CATEGORIES],
    *[f"is_new_activity_streak_start_{category}" for category in ACTIVITY_CATEGORIES],
    'word_count_delta',
]

# continuous quantities that accumulate: summed per writer
conti_vars_sum = ["preceding_pause_time", "latency_time"]

# quantities described by their distribution (discretized, then counted by bin)
distribution_vars = [
    'latency_time',
    'preceding_pause_time',
    'cursor_position_delta',
    'word_count_delta_burst_thin',
    'word_count_delta_activity_streak_thin',
    'activity_streak_length_thin',
    'cursor_position_vs_max',
]


def aggregate_no_time_dependence_measures(X, is_training_run):
    """
    Aggregate measures irrespective of time dependence.
    Ex: sum of inputs over entire essay.

    Parameters
    ----------
    X : pandas.DataFrame
        Enriched event log, one row per event. Must hold 'id', 'up_time',
        'is_time_beyond_expected_max', 'preceding_pause_time_start_window',
        every column named in event_vars_sum and distribution_vars, and the
        columns read by aggregate_essay_text_features.
    is_training_run : bool
        True: fit the discretizer on X and pickle it to
        "pipeline_no_time_dep_discretizer.pkl". False: load that pickle.

    Returns
    -------
    pandas.DataFrame
        One row per writer, indexed by id, combining: sums of event and
        continuous variables, per-bin counts of the discretized distribution
        variables, distribution summaries, lognormal fit parameters, essay
        text aggregates, per-second event rates, keystroke speed and pause
        time fraction.
    """

    # discretizing conti var allows sum of vars, as though they were events.
    # because discretization expands columns via one-hot,
    # reduce dataset to small-as-possible.
    # extracting non-float id allows ColumnTransformer's properly typed numpy
    X_attributes = X[['id']]
    X_to_sum = X[event_vars_sum + distribution_vars]
    X_orig_to_sum = X_to_sum[conti_vars_sum].copy()

    # if nulls not explicitly handled, discretizer raises Exception.
    # distribution vars get a -1 sentinel, which is binned like any other
    # value. event columns pass through the pipeline untouched and are summed
    # afterwards, so their nulls must contribute 0 rather than the sentinel.
    X_to_discretize = (
        X_to_sum
        .fillna({var: -1 for var in distribution_vars})
        .fillna(0)
        )

    if is_training_run:

        pipeline = ColumnTransformer(
            transformers=[(
                'discretizer',
                preprocessing.KBinsDiscretizer(
                    n_bins=10,
                    encode='onehot-dense',
                    strategy='quantile'
                    ),
                distribution_vars
            )],
            remainder='passthrough',
            verbose_feature_names_out=False
            )

        pipeline.fit(X_to_discretize)
        with open("pipeline_no_time_dep_discretizer.pkl", "wb") as f:
            pickle.dump(pipeline, f)

    else:
        # NOTE: unpickling executes arbitrary code -- only load a pickle
        # written by a training run of this notebook
        with open("pipeline_no_time_dep_discretizer.pkl", "rb") as f:
            pipeline = pickle.load(f)

    # transform returns a bare array: restore X's index so the column-wise
    # concat below lines rows up by label even if X's index is not 0..n-1
    X_to_sum = pd.DataFrame(
        pipeline.transform(X_to_discretize),
        columns=pipeline.get_feature_names_out(),
        index=X.index
        )
    del X_to_discretize
    X_to_sum = pd.concat([X_attributes, X_to_sum, X_orig_to_sum], axis=1)

    # with distribution_vars discretized, everything sums
    sums_over_time = X_to_sum.groupby('id').sum()
    # a writer with removals but no inputs would otherwise yield inf
    sums_over_time['delete_insert_ratio'] = (
        sums_over_time['activity_Remove/Cut'] /
        sums_over_time['activity_Input']
        ).replace(np.inf, np.nan)
    del X_to_sum


    # string aggregator names: 'std' is the sample standard deviation
    expr = {}
    for var in distribution_vars:
        expr[f"{var}_mean"] = (var, 'mean')
        expr[f"{var}_p50"] = (var, 'median')
        expr[f"{var}_stddev"] = (var, 'std')
    expr['preceding_pause_time_max'] = ('preceding_pause_time', 'max')
    expr['initial_pause_time_max'] = ('preceding_pause_time_start_window', 'max')
    expr["total_time"] = ('up_time', 'max')
    expr['is_time_beyond_expected_max'] = ('is_time_beyond_expected_max', 'max')

    distribution_summaries = X.groupby('id').agg(**expr)
    distribution_summaries['is_initial_pause_max_pause'] = (
        distribution_summaries['preceding_pause_time_max'] ==
        distribution_summaries['initial_pause_time_max']
        ).astype(int)


    aggregates_essay_text = aggregate_essay_text_features(X)


    # literature finds information in pauses' lognorm distribution
    subject_ids = []
    mle_by_subject = []
    for subject_id, X_subject in X.groupby('id'):

        mle_by_var = {}
        for var in ['preceding_pause_time', 'latency_time']:
            shape, location, scale = lognorm.fit(X_subject[var].dropna())
            mle_by_var[f"{var}_lognorm_shape"] = shape
            mle_by_var[f"{var}_lognorm_location"] = location
            mle_by_var[f"{var}_lognorm_scale"] = scale

        subject_ids.append(subject_id)
        mle_by_subject.append(mle_by_var)

    distr_params_over_time = (
        pd.DataFrame(mle_by_subject, index=subject_ids).fillna(-1)
        )


    aggregates_over_time = pd.merge(
        sums_over_time,
        distribution_summaries,
        how='left',
        left_index=True,
        right_index=True
        )

    aggregates_over_time = pd.merge(
        aggregates_over_time,
        distr_params_over_time,
        how='left',
        left_index=True,
        right_index=True
        )

    aggregates_over_time = pd.merge(
        aggregates_over_time,
        aggregates_essay_text,
        how='left',
        left_index=True,
        right_index=True
        )


    # total_time is in ms: scale event counts per ms up to per second
    for var in event_vars_sum:

        aggregates_over_time[var + '_per_s'] = MS_PER_S * (
            (aggregates_over_time[var] / aggregates_over_time['total_time'])
            )

    aggregates_over_time = (
        aggregates_over_time
        .assign(
            keystroke_speed = lambda x: (x.activity_Input + x['activity_Remove/Cut']) / x.total_time,
            pause_time_fraction = lambda x: x.preceding_pause_time / x.total_time
            )
        )


    return aggregates_over_time

In [None]:
def aggregate_time_variability_measures(
    X, aggregates_over_time, is_training_run
    ):
    """
    Tabulate author's measures by fixed time window (ex: 30-second increments),
    and derive features from that by-time window distribution.

    Use over-time aggregates to normalize select by-time window tabulations.

    Parameters
    ----------
    X : pandas.DataFrame
        Enriched event log holding 'id', 'window_30s' and every column named
        in event_vars_sum and conti_vars_sum.
    aggregates_over_time : pandas.DataFrame
        Per-writer aggregates indexed by id, holding each event_vars_sum and
        conti_vars_sum total plus a '{var}_per_s' rate per event variable.
    is_training_run : bool
        True: fit the discretizer on these windows and pickle it to
        "pipeline_time_dep_discretizer.pkl". False: load that pickle.

    Returns
    -------
    pandas.DataFrame
        One row per writer, indexed by id, with: '*_time_window' counts of
        windows per discretized bin, '*_entropy' of each window share, and
        '*_ttrend' correlation of each window sum with the window index.
    """

    # need to sum events, conti vars by fixed-time window.
    # ensure a writer's fixed-time windows are all used -- drop excess ones.
    # for events, normalize by overall average event rates, & overall sums.
    # for conti var, normalize by overall sums.

    # then, over time windows, compute percentiles. this is novel for event vars,
    # which lack percentiles over all time. p90_time_window

    # observed=False is load-bearing: windows in which a writer logged
    # nothing must still appear (as zeros) between that writer's active windows
    sums_by_window = (
        X
        .groupby(['id', 'window_30s'], observed=False)
        [event_vars_sum + conti_vars_sum]
        .sum()
        .astype(float)
        .fillna(0)
        .reset_index(drop=False)
    )
    sums_by_window['delete_insert_ratio'] = (
        sums_by_window['activity_Remove/Cut'] /
        sums_by_window['activity_Input']
        ).replace(np.inf, np.nan)


    # every categorical time window tabulates for every writer.
    # instead, per writer, truncate to time windows up to the last one
    # actually used.
    sums_by_window['has_activity'] = (
        sums_by_window
        [['activity_' + x for x in ACTIVITY_CATEGORIES]].sum(axis=1)
        > 0
    )
    sums_by_window['idx_window_by_id'] = (
        sums_by_window
        .groupby('id')
        .cumcount()
    )
    sums_by_window['idx_has_activity'] = np.where(
        sums_by_window['has_activity'],
        sums_by_window['idx_window_by_id'],
        np.nan
        )
    sums_by_window['idx_activity_max'] = (
        sums_by_window
        .groupby(['id'])
        ['idx_has_activity']
        .transform('max')
    )
    sums_by_window = (
        sums_by_window
        .loc[sums_by_window['idx_window_by_id'] <= sums_by_window['idx_activity_max']]
        .drop(columns=['has_activity', 'idx_has_activity', 'idx_activity_max'])
        # dropping rows leaves gaps in the index: renumber so that frames
        # rebuilt from bare arrays further down align row-for-row
        .reset_index(drop=True)
    )


    # for variability measure more comparable between writers, de-mean by writer.
    # Ex: higher-throughput writer incurs higher stddev,
    # because values have higher magnitude.

    # join method allows for merge on one index column, of multiple possible
    per_s_columns = [x + '_per_s' for x in event_vars_sum]
    sums_by_window = sums_by_window.join(
        aggregates_over_time[per_s_columns],
        on='id',
        how='left'
        )
    for var in event_vars_sum:
        # a writer's overall rate of 0 must become a null divisor (then a
        # neutral 1), hence an explicit NaN rather than replace(0, None)
        expected_per_window = (
            sums_by_window[var + '_per_s'].replace(0, np.nan)
            * SECONDS_PER_WINDOW
            )
        sums_by_window[var + '_time_norm'] = (
            sums_by_window[var] / expected_per_window
            ).fillna(1)
    sums_by_window = sums_by_window.drop(columns=per_s_columns)

    total_vars = event_vars_sum + conti_vars_sum
    sums_over_time_ren = aggregates_over_time[total_vars].add_suffix('_total')
    sums_by_window = sums_by_window.join(sums_over_time_ren, on='id', how='left')
    for var in total_vars:
        sums_by_window[var + '_frac_total'] = (
            sums_by_window[var] /
            sums_by_window[var + '_total'].replace(0, np.nan)
            ).fillna(1)
    sums_by_window = sums_by_window.drop(
        columns=[x + '_total' for x in total_vars]
        )


    distr_vars = (
        event_vars_sum
        + conti_vars_sum
        + [var + '_time_norm' for var in event_vars_sum]
        + [var + '_frac_total' for var in event_vars_sum]
        + [var + '_frac_total' for var in conti_vars_sum]
        )
    X_attributes = sums_by_window[['id']]
    # if nulls not explicitly handled, discretizer raises Exception
    X_to_sum = sums_by_window[distr_vars].fillna(-1)
    if is_training_run:

        pipeline = ColumnTransformer(
            transformers=[(
                'discretizer',
                preprocessing.KBinsDiscretizer(
                    n_bins=10,
                    encode='onehot-dense',
                    strategy='quantile'
                    ),
                distr_vars
            )],
            remainder='passthrough',
            verbose_feature_names_out=False
            )

        pipeline.fit(X_to_sum)
        with open("pipeline_time_dep_discretizer.pkl", "wb") as f:
            pickle.dump(pipeline, f)

    else:
        # NOTE: unpickling executes arbitrary code -- only load a pickle
        # written by a training run of this notebook
        with open("pipeline_time_dep_discretizer.pkl", "rb") as f:
            pipeline = pickle.load(f)

    # transform returns a bare array: give it the windows' own index, so each
    # discretized row stays attached to its writer id in the concat
    X_to_sum = pd.DataFrame(
        pipeline.transform(X_to_sum),
        columns=pipeline.get_feature_names_out(),
        index=sums_by_window.index
        )
    X_to_sum = pd.concat([X_attributes, X_to_sum], axis=1)

    # with distr_vars discretized, everything sums:
    # per writer, the number of windows that fell in each bin
    distr_summaries = X_to_sum.groupby('id').sum()
    distr_summaries.columns = [
        x + '_time_window' for x in distr_summaries.columns
        ]


    entropy_by_window = (
        sums_by_window
        .groupby(['id'])
        [[var for var in sums_by_window.columns if 'frac_total' in var]]
        .agg(lambda x: entropy(x.value_counts()))
        )
    entropy_by_window.columns = [
        x + '_entropy'
        for x in entropy_by_window.columns
        ]


    trend_by_window = (
        sums_by_window
        .sort_values(['id', 'idx_window_by_id'])
        .drop(columns=['window_30s'])
        .groupby(['id'])
        [['idx_window_by_id'] + event_vars_sum + conti_vars_sum]
        .corr()
        )
    trend_by_window = trend_by_window.fillna(0)
    # extract correlations strictly with time index
    trend_by_window = trend_by_window.xs('idx_window_by_id', level=1)
    trend_by_window.columns = [x + "_ttrend" for x in trend_by_window.columns]


    vari_by_window = pd.merge(
        distr_summaries,
        entropy_by_window,
        how='left',
        left_index=True,
        right_index=True
        )


    vari_by_window = pd.merge(
        vari_by_window,
        trend_by_window,
        how='left',
        left_index=True,
        right_index=True
        )


    return vari_by_window

In [None]:
def feature_transform_pipeline(X_logs, is_training_run):
    """Turn raw logs into one wide feature table.

    The logs are enriched once, then four feature blocks are derived from the
    enriched logs: vectorized essay text, vectorized events, aggregates with
    no time dependence, and time-window variability measures (the latter also
    receives the aggregates). The blocks are left-joined index-on-index onto
    the aggregates, in the order: variability, text, events.

    Parameters
    ----------
    X_logs
        Raw logs, passed straight to ``enrich_logs``.
    is_training_run
        Forwarded unchanged to every stage (presumably toggles fitting versus
        re-using fitted transformers -- confirm against the stage functions).

    Returns
    -------
    The merged feature table, indexed like the aggregates block.
    """
    logs_enriched = enrich_logs(X_logs, is_training_run)

    # Stage order is kept as-is in case the stages have side effects.
    text_features = vectorize_essay_text(logs_enriched, is_training_run)
    event_features = vectorize_events(logs_enriched, is_training_run)
    overall_aggregates = aggregate_no_time_dependence_measures(
        logs_enriched, is_training_run
        )
    window_variability = aggregate_time_variability_measures(
        logs_enriched, overall_aggregates, is_training_run
        )

    # Left joins: rows of the aggregates block define the output rows.
    X_transform = overall_aggregates
    for feature_block in (window_variability, text_features, event_features):
        X_transform = pd.merge(
            X_transform,
            feature_block,
            how='left',
            left_index=True,
            right_index=True
            )

    return X_transform

In [None]:
# Load the full training logs. They may be too large to process in a single
# batch; the commented-out code below splits them by id into two chunks.
X_train_logs = extract(PATH_TRAIN_LOGS)

# X_train_logs_groups = [x for _, x in X_train_logs.groupby('id')]
# del X_train_logs

# X_train_logs_chunk1 = X_train_logs_groups[0:1200]
# X_train_logs_chunk2 = X_train_logs_groups[1200:]
# del X_train_logs_groups

# X_train_logs_chunk1 = pd.concat(X_train_logs_chunk1, axis=0)
# X_train_logs_chunk2 = pd.concat(X_train_logs_chunk2, axis=0).reset_index(drop=True)


In [None]:
# X_train_chunk1 = feature_transform_pipeline(X_train_logs_chunk1, True)
# del X_train_logs_chunk1

In [None]:
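# chunk2 is transformed with is_training_run=False so that it reuses the
# pipeline fitted on chunk1: if the pipeline were re-trained, rare cases in
# chunk2 could yield discretized bins that chunk1 never produced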
# X_train_chunk2 = feature_transform_pipeline(X_train_logs_chunk2, False)
# del X_train_logs_chunk2

In [None]:
# X_train = pd.concat([X_train_chunk1, X_train_chunk2], axis=0)
# del X_train_chunk1, X_train_chunk2

In [None]:
# Preferred path: transform all logs in one pass (no chunking), as a training
# run, then release the raw logs to free memory.
X_train = feature_transform_pipeline(X_train_logs, is_training_run=True)
del X_train_logs

In [None]:
# A column whose standard deviation is exactly zero is constant across rows
# and gives a model nothing to learn from, so drop it.
feature_std = X_train.std()
has_zero_var_col = [
    column for column, std_value in feature_std.items()
    if std_value == 0
    ]
X_train = X_train.drop(columns=has_zero_var_col)

In [None]:
# Every feature value must be populated before modelling.
# The built-in all() applied to a DataFrame iterates over its column labels
# (non-empty strings, hence always truthy) and never looks at the values, so
# the null mask is reduced with pandas instead.
cols_with_nulls = X_train.columns[X_train.isnull().any()].tolist()
assert not cols_with_nulls, (
    f"X_train has nulls in {len(cols_with_nulls)} column(s), "
    f"e.g. {cols_with_nulls[:20]}"
)

In [None]:
# Build one fixed train/test split so that other scripts can share it.
from sklearn.model_selection import train_test_split

# Outcomes keyed by id, with the score column renamed to "y".
outcomes = (
    pd.read_csv(PATH_TRAIN_OUTCOMES)
    .set_index("id")
    .rename(columns={"score": "y"})
)

# Attach the outcome to each feature row (left join keeps every feature row).
XY = X_train.merge(outcomes, how="left", left_index=True, right_index=True)

# Seeded split: 33% of rows are held out as the test set.
X, X_test, y, y_test = train_test_split(
    XY.drop(columns="y"),
    XY["y"],
    test_size=0.33,
    random_state=777,
)

In [None]:
# Decide which feature columns to keep: either the list selected offline
# (FEATURES_PRESELECTED) or the top features ranked by permutation importance
# of a random forest fitted on the training split.
if FEATURES_PRESELECTED:
    features_keep = FEATURES_PRESELECTED

else:

    # expect large universe of possible features --
    # then, optuna runs very slowly, model fitting generally is an issue.
    # that's besides concerns of noise features.
    # use random forest for feature selection.

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.inspection import permutation_importance

    N_TOP_FEATURES_KEEP = 1000
    # Fixed seed: without it the forest and the permutation shuffles differ on
    # every run, so the ranking (and the persisted feature set) is not
    # reproducible even though the train/test split is seeded.
    FEATURE_SELECTION_SEED = 777

    model = RandomForestRegressor(
        n_estimators=500,
        max_features="sqrt",
        max_depth=None,
        random_state=FEATURE_SELECTION_SEED,
    )
    model.fit(X, y.values)

    result = permutation_importance(
        model,
        X,
        y,
        n_repeats=5,
        n_jobs=-1,
        random_state=FEATURE_SELECTION_SEED,
    )
    # Importances are computed on X, so label them with X's own columns.
    feature_imp = pd.DataFrame({
        'feature': X.columns,
        'score': result.importances_mean
        }).sort_values('score', ascending=False).reset_index(drop=True)

    feature_imp.to_csv("feature_selection_importances.csv", index=False)

    # Plain list, matching the type of FEATURES_PRESELECTED in the other branch.
    features_keep = feature_imp['feature'].iloc[0:N_TOP_FEATURES_KEEP].tolist()
    # features_keep = X.columns

In [None]:
# Restrict both splits to the same selected columns, in the same order.
selected_columns = list(features_keep)
X = X.loc[:, selected_columns]
X_test = X_test.loc[:, selected_columns]

In [None]:
# Persist the train and test splits (features and targets) as pickles.
processed_outputs = (
    (X, "./data/processed/X_train.pkl"),
    (y, "./data/processed/y_train.pkl"),
    (X_test, "./data/processed/X_test.pkl"),
    (y_test, "./data/processed/y_test.pkl"),
)
for split_data, pickle_path in processed_outputs:
    split_data.to_pickle(pickle_path)