In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
import pickle
import re
from scipy.stats import lognorm, skew, kurtosis, entropy
import json

In [None]:
# Unit conversion factor: milliseconds in one second.
MS_PER_S = 1_000

# Relative locations of the training inputs (logs CSV and scores CSV).
PATH_TRAIN_OUTCOMES = "./data/external/train_scores.csv"
PATH_TRAIN_LOGS = "./data/external/train_logs.csv"

In [None]:
# Activity labels in a fixed order; this order drives the one-hot encoders'
# categories and every "one column per activity" loop in this notebook.
ACTIVITY_CATEGORIES = [
    'Nonproduction',
    'Input',
    'Remove/Cut',
    'Replace',
    'Paste',
    'Move',
]

In [None]:
# Vocabulary of the text vectorizer, stored as JSON.
# The file might be obtained from a fitted pipeline via:
# with open('text_vectorizer_vocabulary.txt', 'w') as f:
#     json.dump(pipeline.vocabulary_, f)

# PRETRAINED_TEXT_VOCABULARY = None # if no pre-trained file
# The context manager closes the file handle as soon as the JSON is parsed,
# instead of leaving it open until garbage collection.
with open('text_vectorizer_vocabulary.txt') as vocabulary_file:
    PRETRAINED_TEXT_VOCABULARY = json.load(vocabulary_file)

In [None]:
# Vocabulary of the events vectorizer, stored as JSON.
# The file might be obtained from a fitted pipeline via:
# with open('events_vectorizer_vocabulary.txt', 'w') as f:
#     json.dump(pipeline.vocabulary_, f)

# PRETRAINED_EVENTS_VOCABULARY = None # if no pre-trained file
# The context manager closes the file handle as soon as the JSON is parsed,
# instead of leaving it open until garbage collection.
with open('events_vectorizer_vocabulary.txt') as vocabulary_file:
    PRETRAINED_EVENTS_VOCABULARY = json.load(vocabulary_file)

In [None]:
# With large vectorizer vocabularies, it is expedient to run training and
# feature selection offline, then deploy the resulting feature list below.
# FEATURES_PRESELECTED = None
FEATURES_PRESELECTED = [
    'word_count_delta',
    'n_characters',
    'activity_Input',
    'n_commas',
    'cursor_position_delta_2.0',
    'cursor_position_vs_max_4.0',
    'vocab0',
    'word_count_delta_per_s',
    'vocab11332',
    'word_count_delta_burst_thin_1.0',
    'activity_streak_length_thin_0.0',
    'latency_time_1.0',
    'keystroke_speed',
    'activity_Input_per_s',
    'n_thought_delimiting_punctuation',
    'vocab2064',
    'vocab6061',
    'vocab12884',
    'latency_time_3.0',
    'preceding_pause_time_0.0',
    'vocab9612',
    'vocab15673',
    'word_count_delta_activity_streak_thin_1.0',
    'latency_time_5.0',
    'vocab7894',
    'vocab14317',
    'latency_time_mean',
    'latency_time_0.0',
    'vocab1398',
    'latency_time_4.0',
    'event40',
    'preceding_pause_time_p50',
    'words_length_geq8_lt25_frac',
    'n_sentences_words_geq20_lt25',
    'vocab555',
    'vocab913',
    'vocab4085',
    'vocab11333',
    'latency_time_p50',
    'vocab1085',
    'vocab9613',
    'vocab4984',
    'vocab7895',
    'word_count_delta_frac_total_entropy',
    'words_length_mean',
    'n_sentences_words_geq30_lt50',
    'is_new_activity_streak_start_Input_per_s',
    'preceding_pause_time_lognorm_scale',
    'cursor_position_vs_max_2.0',
    'vocab1244',
    'n_sentences_words_geq25_lt30',
    'vocab9794',
    'vocab9966',
    'vocab6062',
    'word_count_delta_burst_thin_mean',
    'n_dashes',
    'is_new_activity_streak_start_Nonproduction_per_s',
    'n_sentences',
    'vocab736',
    'word_count_delta_activity_streak_thin_0.0',
    'vocab15835',
    'latency_time_lognorm_scale',
    'is_new_activity_streak_start_Input_frac_total_entropy',
    'words_length_stddev',
    'is_new_activity_streak_start_Remove/Cut_frac_total_entropy',
    'vocab16975',
    'vocab4086',
    'is_new_activity_streak_start_Nonproduction_frac_total_entropy',
    'vocab3133',
    'n_sentences_words_geq25_lt30_frac',
    'activity_Remove/Cut_per_s',
    'n_sentences_words_geq30_lt50_frac',
    'vocab370',
    'latency_time_2.0',
    'activity_Remove/Cut_frac_total_entropy',
    'vocab1544',
    'is_new_activity_streak_start_Nonproduction',
    'event39',
    'preceding_pause_time',
    'cursor_position_vs_max_3.0',
    'vocab6252',
    'activity_Input_frac_total_entropy',
    'vocab10291',
    'latency_time',
    'vocab2608',
    'vocab4820',
    'n_paragraphs',
    'vocab12885',
    'vocab2238',
    'vocab10442',
    'vocab5152',
    'n_apostrophe',
    'vocab1',
    'vocab13498',
    'vocab14318',
    'vocab4646',
    'n_sentences_words_geq5_lt10_frac',
    'words_per_thought_delimiting_punctuation_avg',
    'i_words_by_sentence_mean',
    'vocab14473',
    'vocab3294',
    'is_new_activity_streak_start_Replace_frac_total_entropy',
    'is_new_activity_streak_start_Input',
    'event9',
    'pause_time_fraction',
    'vocab15674',
    'event17',
    'cursor_position_vs_max_stddev',
    'vocab2420',
    'cursor_position_delta_stddev',
    'vocab11503',
    'activity_Remove/Cut',
    'vocab6602',
    'vocab2065',
    'vocab16004',
    'vocab11670',
    'vocab6429',
    'word_count_delta_burst_thin_p50',
    'vocab17907',
    'n_paragraphs_with_n_sentences_geq0_lt2_frac',
    'vocab4277',
    'latency_time_stddev',
    'latency_time_6.0',
    'word_count_delta_burst_thin_stddev',
    'vocab14775',
    'vocab12104',
    'vocab5587',
    'is_new_activity_streak_start_Remove/Cut',
    'i_words_by_sentence_stddev',
    'vocab10136',
    'cursor_position_vs_max_1.0',
    'vocab6909',
    'vocab10591',
    'words_length_geq4_lt5_frac',
    'n_sentences_words_geq15_lt20',
    'vocab8076',
    'vocab178',
    'activity_Replace_frac_total_entropy',
    'activity_Nonproduction_frac_total_entropy',
    'words_length_geq3_lt4_frac',
    'vocab3445',
    'cursor_position_vs_max_mean',
    'vocab13364',
    'i_words_by_sentence_p50',
    'vocab13208',
    'vocab943',
    'is_new_activity_streak_start_Replace_per_s',
    'vocab7925',
    'is_new_activity_streak_start_Remove/Cut_per_s',
    'n_sentences_words_geq10_lt15_frac',
    'vocab2963',
    'words_length_geq7_lt8_frac',
    'vocab7911',
    'vocab11970',
    'cursor_position_vs_max_0.0',
    'latency_time_lognorm_shape',
    'n_sentences_words_geq20_lt25_frac',
    'vocab13049',
    'cursor_position_delta_1.0',
    'vocab13620',
    'vocab8433',
    'vocab10292',
    'vocab8591',
    'vocab914',
    'vocab7205',
    'latency_time_frac_total_entropy',
    'word_count_delta_activity_streak_thin_stddev',
    'word_count_delta_activity_streak_thin_mean',
    'vocab12367',
    'n_paragraphs_with_n_sentences_geq0_lt2',
    'vocab2790',
    'latency_time_9.0',
    'vocab10728',
    'vocab5446',
    'vocab2609',
    'vocab14638',
    'preceding_pause_time_lognorm_location',
    'n_questions',
    'activity_streak_length_thin_mean',
    'vocab8741',
    'vocab3593',
    'preceding_pause_time_lognorm_shape',
    'words_length_geq0_lt2_frac',
    'n_quotes',
    'is_new_burst_start_Input_ttrend',
    'vocab5301',
    'activity_Replace_per_s',
    'vocab556',
    'vocab986',
    'vocab3134',
    'event26',
    'vocab6766',
    'vocab11824',
    'vocab8259',
    'is_new_activity_streak_start_Remove/Cut_ttrend',
    'vocab2421',
    'is_new_burst_start_Nonproduction_ttrend',
    'vocab4308',
    'vocab1086',
    'preceding_pause_time_mean',
    'vocab371',
    'activity_Remove/Cut_ttrend',
    'vocab7462',
    'vocab7060',
    'vocab7969',
    'latency_time_7.0',
    'activity_streak_length_thin_stddev',
    'n_sentences_words_geq15_lt20_frac',
    'word_count_delta_ttrend',
    'vocab4459',
    'activity_Replace',
    'vocab2791',
    'activity_Nonproduction',
    'preceding_pause_time_max',
    'is_new_activity_streak_start_Input_ttrend',
    'vocab8891',
    'delete_insert_ratio',
    'vocab12596',
    'vocab2623',
    'words_length_geq2_lt3_frac',
    'preceding_pause_time_ttrend',
    'vocab6767',
    'words_length_geq5_lt6_frac',
    'preceding_pause_time_stddev',
    'vocab9967',
    'is_new_activity_streak_start_Nonproduction_ttrend',
    'activity_Input_ttrend',
    'vocab4087',
    'is_new_burst_start_Nonproduction_frac_total_entropy',
    'words_length_geq6_lt7_frac',
    'cursor_position_delta_mean',
    'vocab6138',
    'vocab2111',
    'vocab15018',
    'activity_Nonproduction_per_s',
    'is_new_burst_start_Remove/Cut_per_s',
    'vocab2964',
    'vocab599',
    'is_new_burst_start_Remove/Cut_frac_total_entropy',
    'is_new_burst_start_ttrend',
    'preceding_pause_time_frac_total_entropy',
    'is_new_burst_start',
    'vocab737',
    'latency_time_ttrend',
    'vocab7984',
    'vocab15242',
    'vocab16144',
    'vocab6430',
    'vocab7333',
    'vocab8012',
    'vocab2066',
    'vocab4147',
    'vocab6603',
    'vocab8106',
    'is_new_activity_streak_start_Replace',
    'latency_time_8.0',
    'vocab10137',
    'vocab4115',
    'vocab4821',
    'vocab4190',
    'vocab2496',
    'is_new_burst_start_Input',
    'vocab5015',
    'vocab117',
    'n_sentences_words_geq5_lt10',
    'vocab6474',
    'is_new_burst_start_Input_per_s',
    'is_new_burst_start_Nonproduction_per_s',
    'is_new_burst_start_per_s',
    'vocab1412',
    'vocab9643',
    'event10',
    'vocab193',
    'vocab2467',
    'vocab226',
    'vocab17105',
    'vocab12489',
    'vocab270',
    'vocab13864',
    'vocab11404',
    'vocab16380',
    'vocab6092',
    'vocab16263',
    'vocab10969',
    'initial_pause_time_max',
    'n_paragraphs_with_n_sentences_geq6_lt7_frac',
    'vocab4130',
    'word_count_delta_burst_thin_0.0',
    'activity_streak_length_thin_p50',
    'activity_Nonproduction_ttrend',
    'latency_time_lognorm_location',
    'n_paragraphs_with_n_sentences_geq3_lt4_frac',
    'vocab6281',
    'is_new_burst_start_Input_frac_total_entropy',
    'vocab4460',
    'vocab7954',
    'vocab768',
    'vocab1245',
    'vocab2269',
    'vocab209',
    'cursor_position_delta_0.0',
    'vocab9996',
    'vocab9274',
    'vocab6253',
    'vocab10852',
    'total_time',
    'vocab14902',
    'n_paragraphs_with_n_sentences_geq5_lt6_frac',
    'vocab11347',
    'vocab4508',
    'vocab13758',
    'vocab8091',
    'vocab4476',
    'vocab7476',
    'vocab15139',
    'vocab958',
    'vocab4177',
    'vocab3149',
    'vocab9795',
    'vocab6617',
    'vocab2482',
    'vocab4647',
    'n_paragraphs_with_n_sentences_geq7_lt10',
    'vocab2096',
    'vocab2821',
    'vocab15431',
    'vocab9982',
    'vocab11377',
    'vocab4690',
    'is_new_burst_start_frac_total_entropy',
    'vocab3837',
    'is_new_burst_start_Nonproduction',
    'vocab9686',
    'vocab4101',
    'vocab7896',
    'is_new_burst_start_Remove/Cut_ttrend',
    'vocab387',
    'vocab4353',
    'vocab4324',
    'vocab9853',
    'vocab929',
    'vocab9658',
    'n_sentences_words_geq50_lt5000_frac',
    'vocab1545',
    'vocab4536',
    'vocab2180',
    'vocab2511',
    'vocab972',
    'n_paragraphs_with_n_sentences_geq4_lt5_frac',
    'vocab2638',
    'vocab431',
    'vocab9672',
    'vocab445',
    'vocab4662',
    'vocab4218',
    'vocab6063',
    'vocab4163',
    'vocab4706',
    'vocab10023',
    'vocab4491',
    'vocab11534',
    'vocab3295',
    'vocab9700',
    'vocab13116',
    'is_new_burst_start_Remove/Cut',
    'vocab12105',
    'vocab12245',
    'vocab6077',
    'vocab2423',
    'vocab4835',
    'vocab8121',
    'vocab570',
    'n_paragraphs_with_n_sentences_geq5_lt6',
    'vocab753',
    'vocab1259',
    'n_sentences_words_geq10_lt15',
    'vocab17243',
    'vocab2081',
    'vocab299',
    'vocab2849',
    'vocab7090',
    'vocab9628',
    'vocab1399',
    'vocab4278',
    'vocab12901',
    'vocab241',
    'vocab4205',
    'vocab9714',
    'vocab2297',
    'n_paragraphs_with_n_sentences_geq3_lt4',
    'vocab89',
    'n_paragraphs_with_n_sentences_geq6_lt7',
    'vocab8135',
    'vocab1001',
    'vocab13972',
    'vocab2525',
    'vocab4523',
    'vocab7926',
    'vocab7940',
    'vocab1315',
    'vocab6369',
    'vocab6268',
    'vocab15850',
    'vocab4592',
    'vocab1559',
    'vocab12368',
    'vocab402',
    'vocab6154',
    'vocab2835',
    'activity_Replace_ttrend',
    'vocab11390',
    'vocab5029',
    'vocab16976',
    'vocab74',
    'vocab1289',
    'vocab3204',
    'vocab8276',
    'vocab14639',
    'vocab2652',
    'n_paragraphs_with_n_sentences_geq7_lt10_frac',
    'vocab9839',
    'vocab2437',
    'vocab9824',
    'vocab417',
    'vocab1303',
    'vocab16513',
    'vocab179',
    'vocab13499',
    'vocab11363',
    'vocab13223',
    'vocab6311',
    'vocab6780',
    'vocab1115',
    'vocab5601',
    'vocab6445',
    'vocab59',
    'vocab811',
    'vocab2452',
    'event3',
    'vocab4294',
    'vocab8077',
    'vocab8290',
    'vocab6108',
    'vocab1101',
    'vocab2283',
    'vocab8349',
    'vocab7998',
    'vocab2667',
    'vocab11839',
    'vocab459',
    'vocab10167',
    'vocab18794',
    'vocab6489',
    'vocab10306',
    'vocab9027',
    'vocab5830',
    'is_new_activity_streak_start_Replace_ttrend',
    'vocab2254',
    'vocab229',
    'vocab4461',
    'n_paragraphs_with_n_sentences_geq4_lt5',
    'vocab5302',
    'vocab6558',
    'vocab20',
    'vocab15689',
    'event41',
    'vocab9728',
    'vocab5167',
    'vocab1144',
    'vocab13238',
    'n_sentences_words_geq0_lt5_frac',
    'vocab11420',
    'vocab16613',
    'vocab4879',
    'vocab32',
    'vocab2167',
    'vocab5000',
    'vocab7076',
    'vocab16701',
    'vocab11714',
    'vocab2239',
    'vocab14776',
    'vocab1247',
    'vocab11825',
    'vocab4367',
    'vocab2807',
    'vocab671',
    'event4',
    'vocab255',
    'vocab2126',
    'vocab1587',
    'vocab8742',
    'vocab8892',
    'vocab6169',
    'vocab45',
    'vocab6123',
    'vocab2612',
    'vocab11083',
    'vocab4479',
    'vocab16044',
    'vocab17908',
    'vocab4719',
    'vocab6459',
    'vocab16032',
    'vocab11334',
    'vocab8335',
    'vocab2',
    'vocab8149',
    'vocab614',
    'vocab2994',
    'vocab2979',
    'vocab628',
    'vocab210',
    'vocab6794',
    'vocab4551',
    'vocab4850',
    'vocab6340',
    'vocab769',
    'vocab4892',
    'vocab10443',
    'vocab5044',
    'vocab14558',
    'vocab8606',
    'vocab16145',
    'vocab2311',
    'n_paragraphs_with_n_sentences_geq10_lt20_frac',
    'vocab2440',
    'vocab10457',
    'vocab5588',
    'vocab10592',
    'vocab4116',
    'vocab3036',
    'vocab10321',
    'vocab5724',
    'vocab8163',
    'vocab6910',
    'vocab14346',
    'vocab4339',
    'vocab212',
    'vocab10970',
    'vocab11671',
    'vocab4164',
    'vocab1015',
    'vocab3729',
    'vocab585',
    'vocab12915',
    'vocab3191',
    'vocab9644',
    'vocab8304',
    'vocab227',
    'vocab4325',
    'vocab9614',
    'vocab8107',
    'vocab4131',
    'vocab2966',
    'vocab16381',
    'vocab1574',
    'vocab9741',
    'vocab17',
    'vocab374',
    'vocab4492',
    'vocab782',
    'vocab15865',
    'vocab6923',
    'vocab7206',
    'vocab6093',
    'n_paragraphs_with_n_sentences_geq2_lt3_frac',
    'vocab375',
    'vocab1157',
    'vocab658',
    'vocab3624',
    'vocab3050',
    'vocab6296',
    'vocab14332',
    'vocab10010',
    'vocab256',
    'vocab4732',
    'is_time_beyond_expected_max',
    'vocab3165',
    'vocab3062',
    'vocab5330',
    'vocab10153',
    'vocab1129',
    'vocab4118',
    'vocab9809',
    'vocab15718',
    'vocab2270',
    'vocab6198',
    'vocab2340',
    'vocab15704',
    'vocab8260',
    'vocab6632',
    'vocab4665',
    'vocab1688',
    'vocab4280',
    'vocab11433',
    'vocab6432',
    'vocab4463',
    'vocab13142',
    'vocab14652',
    'vocab3594',
    'vocab13050',
    'vocab6460',
    'vocab4676',
    'vocab3310',
    'vocab11686',
    'vocab4104',
    'n_paragraphs_with_n_sentences_geq10_lt20',
    'vocab12929',
    'vocab4988',
    'is_new_activity_streak_start_Input_frac_total_0.0_time_window',
    'vocab6977',
    'vocab8437',
    'vocab376',
    'preceding_pause_time_0.0_time_window',
    'vocab18018',
    'vocab2865',
    'vocab4493',
    'vocab4985',
    'vocab5343',
    'vocab272',
    'vocab7245',
    'vocab559',
    'vocab8756',
    'vocab284',
    'vocab11985',
    'vocab2639',
    'vocab103',
    'vocab2140',
    'vocab1274',
    'vocab797',
    'vocab2682',
    'vocab10036',
    'vocab33',
    'vocab3609',
    'vocab8203',
    'is_new_burst_start_Input_frac_total_0.0_time_window',
    'vocab4117',
    'vocab8592',
    'vocab8178',
    'vocab5072',
    'vocab18156',
    'vocab14489',
    'vocab373',
    'vocab242',
    'vocab3178',
    'vocab392',
    'vocab2453',
    'n_semicolons',
    'vocab13404',
    'vocab3324',
    'vocab5499',
    'vocab4354',
    'vocab14790',
    'is_new_activity_streak_start_Input_0.0_time_window',
    'n_sentences_words_geq50_lt5000',
    'vocab12956',
    'n_parenthetical_punctuation',
    'vocab4565',
    'vocab9646',
    'vocab10361',
    'vocab11548',
    'vocab372',
    'vocab473',
    'vocab11612',
    'vocab4103',
    'vocab9880',
    'vocab2454',
    'vocab13267',
    'vocab403',
    'vocab11601',
    'vocab4119',
    'vocab2425',
    'vocab9634',
    'vocab14360',
    'vocab3136',
    'vocab586',
    'vocab19293',
    'vocab2552',
    'vocab13378',
    'vocab8434',
    'vocab9157',
    'event5',
    'vocab12943',
    'vocab486',
    'activity_Input_frac_total_0.0_time_window',
    'vocab62',
    'vocab13279',
    'vocab3474',
    'vocab243',
    'vocab15796',
    'vocab16018',
    'vocab406',
    'vocab5221',
    'vocab10335',
    'n_paragraphs_with_n_sentences_geq2_lt3',
    'vocab213',
    'vocab432',
    'vocab11971',
    'vocab6674',
    'vocab4132',
    'vocab6503',
    'vocab13063',
    'vocab2610',
    'vocab4297',
    'vocab194',
    'vocab4679',
    'vocab2325',
    'vocab3022',
    'vocab10181',
    'vocab10619',
    'vocab418',
    'vocab389',
    'vocab739',
    'is_new_activity_streak_start_Nonproduction_0.0_time_window',
    'vocab14066',
    'vocab8263',
    'vocab15759',
    'vocab13635',
    'vocab4105',
    'vocab13076',
    'vocab8293',
    'vocab6095',
    'activity_Input_time_norm_0.0_time_window',
    'vocab2271',
    'vocab14388',
    'vocab4309',
    'vocab2084',
    'vocab214',
    'vocab8491',
    'vocab15920',
    'vocab4381',
    'vocab6325',
    'vocab5181',
    'vocab11520',
    'activity_Remove/Cut_time_norm_5.0_time_window',
    'vocab1208',
    'vocab211',
    'vocab9631',
    'vocab77',
    'vocab6937',
    'vocab7334',
    'vocab3446',
    'vocab1426',
    'vocab4311',
    'vocab754',
    'latency_time_1.0_time_window',
    'vocab230',
    'vocab12886',
    'activity_Remove/Cut_time_norm_0.0_time_window',
    'vocab16264',
    'vocab14374',
    'vocab5058',
    'vocab2469',
    'vocab12117',
    'vocab6436',
    'vocab946',
    'vocab1600',
    'is_new_activity_streak_start_Nonproduction_time_norm_0.0_time_window',
    'vocab15836',
    'vocab4312',
    'vocab15878',
    'vocab9673',
    'vocab6341',
    'vocab1466',
    'vocab13391',
    'vocab11701',
    'vocab2438',
    'vocab2468',
    'vocab405',
    'vocab257',
    'latency_time_frac_total_5.0_time_window',
    'vocab8108',
    'vocab420',
    'vocab10983',
    'vocab12131',
    'vocab2097',
    'is_new_burst_start_0.0_time_window',
    'vocab8464',
    'event2',
    'vocab6433',
    'vocab2697',
    'vocab181',
    'vocab3730',
    'vocab7061',
    'vocab2611',
    'vocab684',
    'vocab434',
    'vocab10207',
    'vocab9893',
    'vocab14516',
    'vocab1440',
    'preceding_pause_time_1.0_time_window',
    'vocab183',
    'vocab4178',
    'vocab4823',
    'vocab2424',
    'vocab2654',
    'vocab6282',
    'vocab9028',
    'vocab2483',
    'is_new_activity_streak_start_Nonproduction_1.0_time_window',
    'vocab4102',
    'is_new_burst_start_time_norm_0.0_time_window',
    'vocab2967',
    'vocab561',
    'vocab9630',
    'vocab1265',
    'vocab4945',
    'vocab5316',
    'vocab6619',
    'vocab9867',
    'vocab6447',
    'vocab6950',
    'vocab3152',
    'vocab4092',
    'vocab1611',
    'vocab421',
    'vocab6647',
    'vocab2794',
    'vocab4865',
    'vocab587',
    'vocab10742',
    'vocab2298',
    'vocab4465',
    'vocab9675',
    'vocab6080',
    'vocab7232',
    'is_new_burst_start_frac_total_0.0_time_window',
    'vocab560',
    'vocab4466',
    'vocab4089',
    'vocab5153',
    'vocab2455',
    'vocab9376',
    'is_new_activity_streak_start_Input_time_norm_0.0_time_window',
    'vocab198',
    'vocab1340',
    'word_count_delta_3.0_time_window',
    'vocab313',
    'latency_time_2.0_time_window',
    'vocab2284',
    'latency_time_frac_total_6.0_time_window',
    'is_new_burst_start_frac_total_4.0_time_window',
    'vocab2422',
    'vocab14666',
    'vocab944',
    'vocab4824',
    'vocab4342',
    'vocab8449',
    'vocab2083',
    'vocab2274',
    'vocab558',
    'vocab14502',
    'activity_Remove/Cut_time_norm_6.0_time_window',
    'vocab5460',
    'vocab4150',
    'event15',
    'vocab4298',
    'vocab10606',
    'vocab10091',
    'vocab4478',
    'vocab2287',
    'vocab6112',
    'vocab4987',
    'vocab4509',
    'vocab3205',
    'vocab2497',
    'vocab6272',
    'vocab2824',
    'vocab2427',
    'is_new_activity_streak_start_Input_2.0_time_window',
    'n_sentences_words_geq0_lt5',
    'preceding_pause_time_frac_total_2.0_time_window',
    'vocab11867',
    'vocab9040',
    'vocab2428',
    'vocab2241',
    'activity_Replace_time_norm_1.0_time_window',
    'vocab8648',
    'vocab4166',
    'is_new_burst_start_Remove/Cut_time_norm_1.0_time_window',
    'vocab11563',
    'vocab407',
    'vocab6354',
    'vocab13103',
    'vocab7941',
    'vocab4477',
    'vocab6448',
    'vocab6141',
    'vocab2112',
    'activity_Remove/Cut_3.0_time_window',
    'vocab3008',
    'vocab4494',
    'cursor_position_vs_max_p50',
    'vocab2461',
    'vocab10445',
    'vocab6096',
    'vocab390',
    'vocab11445',
    'vocab4329',
    'vocab10866',
    'vocab2981',
    'vocab4866',
    'vocab15019',
    'is_new_burst_start_Nonproduction_0.0_time_window',
    'vocab2796',
    'vocab2456',
    'vocab9632',
    'vocab2353',
    'vocab9920',
    'vocab13209',
    'vocab6822',
    'is_new_activity_streak_start_Paste_time_norm_1.0_time_window',
    'vocab15675',
    'vocab4281',
    'vocab9826',
    'vocab9972',
    'vocab182',
    'vocab2485',
    'vocab4310',
    'vocab4395',
    'activity_Nonproduction_frac_total_4.0_time_window',
    'vocab5447',
    'vocab35',
    'vocab197',
    'words_length_p50',
    'vocab6300',
    'vocab14804',
    'vocab2850',
    'vocab6809',
    'activity_Nonproduction_0.0_time_window',
    'vocab784',
    'vocab2155',
    'vocab4650',
    'vocab4648',
    'is_new_burst_start_Replace_time_norm_1.0_time_window',
    'latency_time_frac_total_1.0_time_window',
    'vocab2970',
    'is_new_burst_start_Move_0.0_time_window',
    'vocab4088',
    'vocab3511',
    'vocab2099',
    'vocab7915',
    'vocab2982',
    'vocab314',
    'vocab2640',
    'latency_time_frac_total_2.0_time_window',
    'vocab8136',
    'is_new_burst_start_Paste_0.0_time_window',
    'is_new_activity_streak_start_Remove/Cut_frac_total_5.0_time_window',
    'vocab8291',
    'vocab2272',
    'vocab6067',
    'vocab2811',
    'vocab11853',
    'vocab2285',
    'vocab447',
    'vocab3192',
    'is_new_burst_start_Nonproduction_1.0_time_window',
    'vocab15932',
    'latency_time_6.0_time_window',
    'vocab4462',
    'vocab4495',
    'vocab9275',
    'event13',
    'vocab3351',
    'vocab9645',
    'vocab10037',
    'vocab7970',
    'vocab2526',
    'vocab4120',
    'vocab1039',
    'vocab4148',
    'vocab5195',
    'vocab6461',
    'vocab185',
    'vocab6475',
    'vocab5725',
    'vocab271',
    'vocab2642',
    'vocab8906',
    'is_new_activity_streak_start_Paste_frac_total_0.0_time_window',
    'activity_Nonproduction_frac_total_0.0_time_window',
    'vocab3138',
    'is_new_activity_streak_start_Remove/Cut_frac_total_0.0_time_window',
    'vocab8621',
    'vocab8025',
    'vocab6124',
    'vocab6283',
    'word_count_delta_1.0_time_window',
    'vocab2539',
    'vocab8477',
    'vocab14903',
    'vocab13621',
    'vocab5474',
    'vocab4165',
    'vocab799',
    'vocab4851',
    'vocab2273'
 ]

In [None]:
def extract(path):
    """
    Load an event log CSV and order it by writer, then by event.

    Parameters
    ----------
    path : str, path object or file-like object
        Handed unchanged to ``pandas.read_csv``. The data must contain
        ``id`` and ``event_id`` columns.

    Returns
    -------
    pandas.DataFrame
        All rows, sorted ascending by ``id`` and then ``event_id``, carrying
        a fresh 0..n-1 index.
    """

    X = pd.read_csv(path)
    # ignore_index=True: later steps in this notebook rebuild frames from
    # arrays (fresh RangeIndex) and then concatenate on the index. Keeping the
    # pre-sort labels would silently misalign rows whenever the file is not
    # already sorted.
    X = X.sort_values(
        ["id", "event_id"], ascending=[True, True], ignore_index=True
    )

    return X

In [None]:
def enrich_activity(X, is_training_run):
    """
    One-hot encode ``activity`` into ``activity_<category>`` columns.

    The detailed label is first preserved in ``activity_detailed`` and every
    'Move From ...' variant is collapsed to plain 'Move' (both written onto
    the caller's frame in place).

    Parameters
    ----------
    X : pandas.DataFrame
        Event log with a string ``activity`` column.
    is_training_run : bool
        True: fit the encoding pipeline on ``X`` and pickle it to
        ``pipeline_activity_onehot.pkl``.
        False: load the previously pickled pipeline from that file.

    Returns
    -------
    pandas.DataFrame
        New frame holding the one-hot columns, the remaining input columns
        (original dtypes restored), and the collapsed ``activity`` column.
        Row labels are those of the input.
    """

    # 'Move From' activity recorded with low-level cursor loc details
    # extract bigger-picture 'Move'
    # QUESTION: what's the difference between Move From, and a cut+paste?
    X['activity_detailed'] = X['activity']
    X.loc[X['activity'].str.contains('Move From'), 'activity'] = 'Move'

    if is_training_run:

        encoder_options = {
            'categories': [ACTIVITY_CATEGORIES],
            'handle_unknown': 'infrequent_if_exist',
        }
        try:
            # scikit-learn >= 1.2 spelling; the old `sparse` keyword was
            # removed in 1.4.
            encoder = preprocessing.OneHotEncoder(
                sparse_output=False, **encoder_options
                )
        except TypeError:
            # older scikit-learn only knows `sparse`
            encoder = preprocessing.OneHotEncoder(
                sparse=False, **encoder_options
                )

        pipeline = ColumnTransformer(
            transformers=[('onehot_encode', encoder, ["activity"])],
            remainder='passthrough',
            verbose_feature_names_out=False
            )

        pipeline.fit(X)

        with open("pipeline_activity_onehot.pkl", "wb") as f:
            pickle.dump(pipeline, f)

    else:
        # NOTE: unpickling can execute arbitrary code -- only load a pipeline
        # file that this notebook (or another trusted run) produced.
        with open("pipeline_activity_onehot.pkl", "rb") as f:
            pipeline = pickle.load(f)

    row_index = X.index
    original_categorical = X[['activity']]

    # transform() yields a plain array, so dtypes and index must be restored
    X_dtypes = X.dtypes.to_dict()
    transformed = pipeline.transform(X)
    X = pd.DataFrame(
        transformed,
        columns=pipeline.get_feature_names_out(),
        # keep the caller's row labels: the concat below aligns on the index,
        # and a fresh RangeIndex would misalign (and NaN-pad) rows whenever
        # the input index is not already 0..n-1
        index=row_index
        )
    X = pd.concat([X, original_categorical], axis=1)
    X = X.astype(X_dtypes)

    return X

In [None]:
def scrub_text_change(X):
    """
    Clean the ``text_change`` column in place and return the same frame.

    Known problems with the raw text:

    - Some hex expressions (\\xHH) arrive undecoded, written literally.
      Examples: emdash (\\x96), slanted quotations & ticks.
    - Some foreign characters (accent a, overring a) were not anonymized
      with the generic q. Confirmed via the Kaggle data viewer for
      id-event_id cases like 0916cdad-39 or 9f328eb3-19.

    Repair rules, applied after re-decoding the text:

    - An Input event cannot hold several characters (a foreign character
      plus something else), so:
        - non-ASCII Input containing an emdash becomes exactly the emdash
        - any other non-ASCII Input becomes a single q
    - For Move / Remove / Paste / Replace events, every non-ASCII character
      is replaced by q, one character at a time.
    """

    X['text_change_original'] = X['text_change']

    def redecode(raw):
        # utf-8 encode + windows-1252 decode was settled on after several
        # iterations: latin-1 missed some \xHH instances, utf-16 just raised.
        return raw.encode(encoding='utf-8').decode("windows-1252")

    # expected to transform all \xHH literals
    X['text_change'] = X['text_change_original'].map(redecode)

    has_foreign = ~X['text_change'].map(lambda text: text.isascii())
    foreign_input = has_foreign & (X['activity'] == "Input")
    mentions_emdash = X['text_change'].str.contains("—")

    # Input events: collapse to the emdash if one is present, else to q
    X.loc[foreign_input & mentions_emdash, 'text_change'] = "—"
    X.loc[foreign_input & ~mentions_emdash, 'text_change'] = 'q'

    # block text changes: walk the text, swapping each foreign character
    anonymized = X['text_change'].map(
        lambda text: "".join(
            char if char.isascii() else "q" for char in text
        )
    )
    is_block_edit = X['activity'].str.contains(
        'Move|Remove|Paste|Replace', regex=True
    )
    X['text_change'] = np.where(is_block_edit, anonymized, X['text_change'])

    del X['text_change_original']

    return X

In [None]:
# a latency must exceed this many milliseconds to count as a real pause
PAUSE_THRESHOLD_MS = 1000
# the "start window" covers each writer's first N events
N_ACTIVITIES_UNTIL_START_WINDOW_CLOSES = 100

def enrich_pauses(X):
    """
    Must infer pauses, as no explicit record indicates.
    'Latency' implies, any time delta between keystrokes.
    'Pause' implies, a 'significant' time delta, not just physical-mechanical
    requirement of typing.

    Expects columns ``id``, ``event_id``, ``down_time`` and ``up_time``, with
    rows ordered by event within each ``id``. Adds, in place:

    - ``up_time_lag1``: previous event's ``up_time`` for the same ``id``
    - ``latency_time``: ``down_time`` minus ``up_time_lag1``
    - ``preceding_pause_time``: the latency when it exceeds
      ``PAUSE_THRESHOLD_MS``, else NaN; for ``event_id == 1`` the event's own
      ``down_time`` stands in for the missing latency
    - ``preceding_pause_time_start_window``: ``preceding_pause_time`` for the
      first ``N_ACTIVITIES_UNTIL_START_WINDOW_CLOSES`` events only
    - ``total_pause_time``, ``rolling_pause_time``,
      ``rolling_pause_time_fraction``: per-``id`` sum, running sum, and their
      ratio

    Returns the same (mutated) frame.
    """

    X['up_time_lag1'] = X.groupby(['id'])['up_time'].shift(1)
    X['latency_time'] = X['down_time'] - X['up_time_lag1']

    X['preceding_pause_time'] = X['latency_time']
    # first record lacks preceding_pause_time (time before first key press)
    X.loc[X['event_id'] == 1, 'preceding_pause_time'] = X['down_time']
    # latencies at or below the threshold -- including negative ones --
    # are interpreted as no real pause
    has_no_real_pause = X['preceding_pause_time'] <= PAUSE_THRESHOLD_MS
    X.loc[has_no_real_pause, 'preceding_pause_time'] = np.nan

    # not obvious how to tag "initial planning pause"
    # tried "first 5 minutes", but when that pause is 10 minutes, that fails.
    # first XX minutes is fragile
    # first XX events may help -- what's your extent of pause before *action*?
    X['preceding_pause_time_start_window'] = X['preceding_pause_time']
    # Keep pauses INSIDE the start window and blank everything after it.
    # The previous `<=` mask did the opposite, erasing exactly the initial
    # pauses this column is meant to capture.
    # NOTE(review): this changes the feature's values; any model fitted on
    # the old values needs retraining.
    X.loc[
        X['event_id'] > N_ACTIVITIES_UNTIL_START_WINDOW_CLOSES,
        'preceding_pause_time_start_window'
        ] = np.nan

    X['total_pause_time'] = (
        X
        .groupby(['id'])
        ['preceding_pause_time']
        .transform('sum')
        )
    # NaN (non-pause) rows stay NaN in the running sum
    X['rolling_pause_time'] = (
        X
        .groupby(['id'])
        ['preceding_pause_time']
        .cumsum()
        )
    X['rolling_pause_time_fraction'] = (
        X['rolling_pause_time'] / X['total_pause_time']
        )

    return X

In [None]:
# a pause longer than this many seconds ends the current burst
SECONDS_PER_BURST = 2

def enrich_time_bursts(X, is_training_run):
    """
    If pause exceeds threshold duration, a "burst" has ended.
    A burst is characterized by one dominant activity.

    A new burst starts at ``event_id == 1`` and at every event whose
    ``preceding_pause_time`` exceeds ``SECONDS_PER_BURST`` seconds. Each burst
    is typed by the activity with the most events in it; ties go to the
    earliest entry of ``ACTIVITY_CATEGORIES``.

    Parameters
    ----------
    X : pandas.DataFrame
        Needs ``id``, ``event_id``, ``down_time``, ``up_time``,
        ``preceding_pause_time`` and one ``activity_<category>`` column per
        entry of ``ACTIVITY_CATEGORIES``. Burst columns are first written onto
        this frame in place.
    is_training_run : bool
        True: fit the burst-type encoding pipeline and pickle it to
        ``pipeline_burst_type_onehot.pkl``.
        False: load the previously pickled pipeline from that file.

    Returns
    -------
    pandas.DataFrame
        New frame (input row labels preserved) with ``is_new_burst_start``,
        ``burst_id``, ``burst_time_start``/``_end``/``_duration``,
        ``burst_events_<activity>``, ``burst_type``,
        ``burst_type_<activity>`` and ``is_new_burst_start_<activity>``.
    """

    X['is_new_burst_start'] = (
        X['preceding_pause_time'] > MS_PER_S * SECONDS_PER_BURST
        ).astype(int)
    X.loc[X['event_id'] == 1, 'is_new_burst_start'] = 1
    # running count of burst starts numbers the bursts within each id
    X['burst_id'] = (
        X
        .groupby(['id'])
        ['is_new_burst_start']
        .cumsum()
        )
    X['burst_time_start'] = (
        X
        .groupby(['id', 'burst_id'])
        ['down_time']
        .transform('min')
        )
    X['burst_time_end'] = (
        X
        .groupby(['id', 'burst_id'])
        ['up_time']
        .transform('max')
        )
    X['burst_time_duration'] = X['burst_time_end'] - X['burst_time_start']


    # events of each activity within the burst, repeated on every burst row
    for activity in ACTIVITY_CATEGORIES:

        X['burst_events_' + activity] = (
            X
            .groupby(['id', 'burst_id'])
            ['activity_' + activity]
            .transform('sum')
            ).astype(float)

    # dominant activity = column holding the largest event count
    X['burst_type'] = (
        X
        [['burst_events_' + activity for activity in ACTIVITY_CATEGORIES]]
        .idxmax(axis=1)
        )
    X['burst_type'] = X['burst_type'].str.replace(
        "burst_events_", "", regex=True
        )


    if is_training_run:

        encoder_options = {
            'categories': [ACTIVITY_CATEGORIES],
            'handle_unknown': 'infrequent_if_exist',
        }
        try:
            # scikit-learn >= 1.2 spelling; the old `sparse` keyword was
            # removed in 1.4.
            encoder = preprocessing.OneHotEncoder(
                sparse_output=False, **encoder_options
                )
        except TypeError:
            # older scikit-learn only knows `sparse`
            encoder = preprocessing.OneHotEncoder(
                sparse=False, **encoder_options
                )

        pipeline = ColumnTransformer(
            transformers=[('onehot_encode', encoder, ["burst_type"])],
            remainder='passthrough',
            verbose_feature_names_out=False
            )

        pipeline.fit(X)

        with open("pipeline_burst_type_onehot.pkl", "wb") as f:
            pickle.dump(pipeline, f)

    else:
        # NOTE: unpickling can execute arbitrary code -- only load a pipeline
        # file that this notebook (or another trusted run) produced.
        with open("pipeline_burst_type_onehot.pkl", "rb") as f:
            pipeline = pickle.load(f)

    row_index = X.index
    original_categorical = X['burst_type']

    # transform() yields a plain array, so dtypes and index must be restored
    X_dtypes = X.dtypes.to_dict()
    transformed = pipeline.transform(X)
    X = pd.DataFrame(
        transformed,
        columns=pipeline.get_feature_names_out(),
        # keep the caller's row labels: the concat below aligns on the index,
        # and a fresh RangeIndex would misalign (and NaN-pad) rows whenever
        # the input index is not already 0..n-1
        index=row_index
        )
    X = pd.concat([X, original_categorical], axis=1)
    X = X.astype(X_dtypes)

    # burst starts split out by the type of the burst they open
    for activity in ACTIVITY_CATEGORIES:
        X['is_new_burst_start_' + activity] = (
            X['is_new_burst_start'] *
            X['burst_type_' + activity]
            )

    return X

In [None]:
def enrich_activity_streaks(X):
    """
    Flag runs of identical consecutive activity within each writer's log.
    Consecutive activity (independent of time) suggests productive writing flow.

    Adds columns to X in place and returns X:
    activity_lag1, is_new_activity_streak_start, is_activity_streak_end,
    activity_streak_id, activity_streak_length_thin, plus one
    is_new_activity_streak_start_<activity> column per activity category.
    Requires 'id', 'event_id', 'activity' and the activity_<activity> columns.
    """

    previous_activity = X.groupby(['id'])['activity'].shift(1)
    X['activity_lag1'] = previous_activity

    # a streak starts whenever the activity differs from the writer's previous
    # event; a writer's first event has no predecessor, so it compares unequal
    starts_streak = (X['activity'] != X['activity_lag1']).astype(int)
    X['is_new_activity_streak_start'] = starts_streak
    X.loc[X['event_id'] == 1, 'is_new_activity_streak_start'] = 1

    # an event ends its streak when the writer's next event starts a new one;
    # the writer's last event has no successor and always ends a streak
    next_event_starts_streak = (
        X.groupby(['id'])['is_new_activity_streak_start'].shift(-1)
    )
    X['is_activity_streak_end'] = next_event_starts_streak.fillna(1)

    # running count of streak starts numbers the streaks within each writer
    X['activity_streak_id'] = (
        X.groupby(['id'])['is_new_activity_streak_start'].cumsum()
    )

    streak_sizes = X.groupby(['id', 'activity_streak_id']).transform('size')
    X['activity_streak_length_thin'] = streak_sizes
    # keep the length only on each streak's final event ("thin"),
    # so downstream aggregation sees one value per streak
    is_not_streak_end = X['is_activity_streak_end'] == 0
    X.loc[is_not_streak_end, 'activity_streak_length_thin'] = None

    for activity in ACTIVITY_CATEGORIES:
        start_col = 'is_new_activity_streak_start_' + activity
        X[start_col] = (
            X["activity_" + activity] * X['is_new_activity_streak_start']
            )

    return X

In [None]:
def enrich_word_count(X):
    """
    Derive word-count changes per event, per burst and per activity streak.
    Word count is a primary productivity measure; expect score to increase
    with word count.

    Adds columns to X in place and returns X. Requires 'id', 'word_count',
    'burst_id', 'is_new_burst_start', 'activity_streak_id' and
    'is_new_activity_streak_start'.
    """

    previous_count = X.groupby(['id'])['word_count'].shift(1)
    X['word_count_lag1'] = previous_count
    X['word_count_delta'] = X['word_count'] - X['word_count_lag1']

    # (total column, grouping key within writer, flag of the row keeping the total)
    rollups = [
        ('word_count_delta_burst', 'burst_id', 'is_new_burst_start'),
        (
            'word_count_delta_activity_streak',
            'activity_streak_id',
            'is_new_activity_streak_start'
        ),
    ]

    for total_col, group_key, start_flag in rollups:

        X[total_col] = (
            X.groupby(['id', group_key])['word_count_delta'].transform('sum')
            )

        # de-duplication: the "_thin" copy keeps the group total only on rows
        # where the start flag is non-zero -- easier for downstream aggregation
        X[total_col + '_thin'] = X[total_col].where(X[start_flag] != 0)

    return X

In [None]:
def enrich_cursor_position(X):
    """
    Describe cursor movement per event.
    Theory: one-way cursor movement might be more productive, vs jumping around.

    Adds to X (in place): cursor_position_lag1, cursor_position_delta,
    cursor_position_cummax and cursor_position_vs_max, plus a temporary
    cursor_position_input column. Returns a frame with the temporary
    column dropped.
    """

    previous_position = X.groupby(['id'])['cursor_position'].shift(1)
    # a writer's first event has no predecessor: treat the cursor as starting at 0
    X['cursor_position_lag1'] = previous_position.fillna(0)
    X['cursor_position_delta'] = (
        X['cursor_position'] - X['cursor_position_lag1']
        )

    # only typed input counts toward the running maximum: if cursor position
    # increases due to copy+paste (perhaps of essay prompt),
    # that doesn't reflect grade-driving output
    is_typed_input = X['activity'] == "Input"
    X['cursor_position_input'] = np.where(
        is_typed_input, X["cursor_position"], np.nan
        )

    running_max = X.groupby(['id'])['cursor_position_input'].cummax()
    X['cursor_position_cummax'] = running_max

    # non-input events carry the last known maximum forward within the writer;
    # events before the writer's first input get 0
    carried_forward = X.groupby(['id'])['cursor_position_cummax'].ffill()
    X['cursor_position_cummax'] = carried_forward.fillna(0)

    X['cursor_position_vs_max'] = (
        X['cursor_position'] - X['cursor_position_cummax']
        )

    return X.drop(columns='cursor_position_input')

In [None]:
TOTAL_MIN_MAX_EXPECTED = 30
TOTAL_MIN_PLUS_BUFFER = 150 # id 21bbc3f6 case extended to 140 min ... odd
SECONDS_PER_MIN = 60
SECONDS_PER_WINDOW = 30

def enrich_time_windows(X):
    """
    Assign each event to a fixed-width time window, enabling time-sequence
    features, and flag events that finish after the expected session length.

    Adds to X in place and returns X:
    - window_30s: interval (categorical) containing the event's down_time.
      Intervals are right-closed, so a down_time of exactly 0, or one past
      the last bin edge, gets no window (NaN).
    - is_time_beyond_expected_max: 1 when up_time exceeds
      TOTAL_MIN_MAX_EXPECTED minutes, else 0.

    Expect that some essays extend beyond the 30 min described in
    'Data Collection'. Downstream, **do not tabulate over a writer's unused
    time windows**!!
    """

    ms_per_window = SECONDS_PER_WINDOW * MS_PER_S
    ms_buffered_horizon = TOTAL_MIN_PLUS_BUFFER * SECONDS_PER_MIN * MS_PER_S
    ms_expected_max = TOTAL_MIN_MAX_EXPECTED * SECONDS_PER_MIN * MS_PER_S

    window_edges = np.arange(0, ms_buffered_horizon, ms_per_window)
    X['window_30s'] = pd.cut(X['down_time'], bins=window_edges)

    ran_over_time = X['up_time'] > ms_expected_max
    X['is_time_beyond_expected_max'] = ran_over_time.astype(int)

    return X

In [None]:
def subset_features(X):
    """
    Keep only the enriched log columns consumed downstream, in a fixed order.

    Returns a new DataFrame; raises KeyError if any expected column is
    missing from X.
    """

    # same order as the one-hot activity columns throughout the notebook
    activities = (
        "Nonproduction", "Input", "Remove/Cut", "Replace", "Paste", "Move"
    )

    def per_activity(prefix):
        return [prefix + activity for activity in activities]

    columns_kept = (
        ["id", "event_id", "is_time_beyond_expected_max", "window_30s"]

        # time bursts
        + ["burst_id", "burst_type"]
        + per_activity("burst_type_")
        + ["is_new_burst_start"]
        + per_activity("is_new_burst_start_")
        + ["burst_time_start", "burst_time_end", "burst_time_duration"]
        + per_activity("burst_events_")
        + ["word_count_delta_burst", "word_count_delta_burst_thin"]

        # activity streaks
        + ["activity_streak_id", "is_new_activity_streak_start"]
        + per_activity("is_new_activity_streak_start_")
        + [
            "is_activity_streak_end",
            "activity_streak_length_thin",
            "word_count_delta_activity_streak",
            "word_count_delta_activity_streak_thin",
        ]

        # raw log fields and one-hot activity
        + ["down_time", "up_time", "action_time", "activity_detailed", "activity"]
        + per_activity("activity_")
        + ["down_event", "up_event", "text_change", "cursor_position", "word_count"]

        # cursor position
        + ["cursor_position_delta", "cursor_position_vs_max", "cursor_position_cummax"]

        # word count
        + ["word_count_lag1", "word_count_delta"]

        # pauses
        + [
            "up_time_lag1",
            "latency_time",
            "preceding_pause_time",
            "preceding_pause_time_start_window",
            "rolling_pause_time",
            "rolling_pause_time_fraction",
            "total_pause_time",
        ]
    )

    return X[columns_kept]

In [None]:
def concatenate_essay_from_logs(df):
    """
    Rebuild one author's essay by replaying the text-producing log events.
    Expect df to be *one* author's log, in event order.

    Reads 'activity' (to skip 'Nonproduction' events), 'activity_detailed'
    (which carries the "Move From [a, b] To [c, d]" intervals),
    'cursor_position' (the cursor *after* each event) and 'text_change'.
    Returns a one-row DataFrame with columns 'id' and 'essay'.

    Adapted from sources:
        https://www.kaggle.com/code/hiarsl/feature-engineering-sentence-paragraph-features,
        https://www.kaggle.com/code/kawaiicoderuwu/essay-contructor.
    """

    def parse_interval(bounds):
        # "4, 8" -> [4, 8]
        return [int(bound) for bound in bounds.split(', ')]

    text_events = df.loc[
        (df.activity != 'Nonproduction'),
        ['activity_detailed', 'cursor_position', 'text_change']
        ]

    essay_text = ""
    for activity, cursor_after, change in text_events.values:

        if activity == 'Replace':
            # log format: "<removed text> => <added text>"
            parts = change.split(' => ')
            removed, added = parts[0], parts[1]
            start = cursor_after - len(added)
            # essay: "the blue cat", replace "blue" with "red":
            # keep "the ", add "red", resume after the removed "blue" at " cat"
            essay_text = (
                essay_text[:start]
                + added
                + essay_text[start + len(removed):]
                )

        elif activity == 'Remove/Cut':
            # the cursor already sits where the removed text used to begin
            essay_text = (
                essay_text[:cursor_after]
                + essay_text[cursor_after + len(change):]
                )

        elif "Move" in activity:
            # strip the 10-character "Move From " prefix, then read both intervals
            endpoints = (
                activity[10:]
                .replace("[", "")
                .replace("]", "")
                .split(' To ')
                )
            moved_from = parse_interval(endpoints[0])
            moved_to = parse_interval(endpoints[1])

            # note: a move never changes the total text length
            if moved_from[0] < moved_to[0]:
                # "the blue cat ran", move "blue" to "the cat blue ran"
                essay_text = (
                    # all text preceding the move-impacted window
                    essay_text[:moved_from[0]]
                    # skip where the moved block _was_, up to the window's end
                    + essay_text[moved_from[1]:moved_to[1]]
                    # the moved block
                    + essay_text[moved_from[0]:moved_from[1]]
                    # all text following the move-impacted window
                    + essay_text[moved_to[1]:]
                    )
            elif moved_from[0] > moved_to[0]:
                # "the cat ran fast", move "ran" to "ran the cat fast"
                essay_text = (
                    # all text preceding the move-impacted window
                    essay_text[:moved_to[0]]
                    # the moved block
                    + essay_text[moved_from[0]:moved_from[1]]
                    # text the block jumped over, still within the window
                    + essay_text[moved_to[0]:moved_from[0]]
                    # all text following the move-impacted window
                    + essay_text[moved_from[1]:]
                    )
            # equal start positions: nothing to do

        else:
            # 'Paste', 'Input' and anything else: splice the new text in
            # where it started. essay: "the cat", add "blue " -> "the blue cat"
            start = cursor_after - len(change)
            essay_text = essay_text[:start] + change + essay_text[start:]

    return pd.DataFrame({'id': df['id'].unique(), 'essay': [essay_text]})

In [None]:
def enrich_logs(X, is_training_run):
    """
    Run every per-event enrichment step over the raw logs, in dependency
    order, and return the columns kept by subset_features.

    is_training_run is forwarded to the steps that fit (True) or reload
    (False) a persisted transformer; it also gates the text scrubbing step.
    Progress is printed after each step.
    """

    enriched = enrich_activity(X, is_training_run)
    print("Enriched activity")

    # live test data raise Exception during decode-encode attempt.
    # still, higher quality model should follow from
    # higher-quality train data
    if is_training_run:
        enriched = scrub_text_change(enriched)

    remaining_steps = (
        ("pauses", lambda logs: enrich_pauses(logs)),
        ("time bursts", lambda logs: enrich_time_bursts(logs, is_training_run)),
        ("activity streaks", lambda logs: enrich_activity_streaks(logs)),
        ("word count", lambda logs: enrich_word_count(logs)),
        ("cursor position", lambda logs: enrich_cursor_position(logs)),
        ("time windows", lambda logs: enrich_time_windows(logs)),
    )
    for label, step in remaining_steps:
        enriched = step(enriched)
        print("Enriched " + label)

    return subset_features(enriched)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def vectorize_essay_text(
    X, is_training_run, vocabulary=PRETRAINED_TEXT_VOCABULARY
    ):
    """
    Count 1- to 4-gram occurrences in each writer's reconstructed essay.

    Given higher-order ngram, expect large vocabulary for vectorizer.
    Might prefer pre-trained vocabulary with known phrase-index mappings:
    where indexes have been pre-screened for importance in feature selection.

    Vectorizer choice:
    - truthy `vocabulary`: use that fixed vocabulary, nothing is persisted;
    - else on a training run: fit on these essays and pickle the vectorizer
      to "pipeline_text_vectorizer.pkl";
    - else: load the pickled vectorizer.

    Returns a DataFrame indexed by writer id with one 'vocab<i>' column per
    vocabulary index i.
    """

    essays = pd.concat(
        [
            concatenate_essay_from_logs(writer_log)
            for _, writer_log in X.groupby('id')
        ],
        axis=0
        ).reset_index(drop=True)
    documents = essays['essay'].to_list()

    if vocabulary:
        vectorizer = CountVectorizer(
            input='content', ngram_range=(1, 4), vocabulary=vocabulary
            )

    elif is_training_run:
        vectorizer = CountVectorizer(input='content', ngram_range=(1, 4))
        vectorizer.fit(documents)
        with open("pipeline_text_vectorizer.pkl", "wb") as handle:
            pickle.dump(vectorizer, handle)

    else:
        with open("pipeline_text_vectorizer.pkl", "rb") as handle:
            vectorizer = pickle.load(handle)

    ngram_counts = vectorizer.transform(documents).toarray()

    return pd.DataFrame(
        ngram_counts,
        index=essays['id'].values,
        columns=[
            'vocab' + str(position)
            for position in range(ngram_counts.shape[1])
            ]
        )

In [None]:
def vectorize_events(
    X, is_training_run, vocabulary=PRETRAINED_EVENTS_VOCABULARY
    ):
    """
    Count each writer's down_event tokens.

    A keylog "event" differs from an activity. Event examples include:
    leftclick, rightclick, capslock, arrow{direction}, ...
    Why calculate? Competition has found value in these features.

    Vectorizer choice mirrors vectorize_essay_text: a truthy `vocabulary`
    wins; else a training run fits and pickles to
    "pipeline_events_vectorizer.pkl"; else that pickle is loaded.

    Returns a DataFrame indexed by writer id with one 'event<i>' column per
    vocabulary index i.

    NOTE(review): CountVectorizer's default tokenizer keeps only tokens of
    two or more word characters, so single-character events are presumably
    not counted -- confirm this is intended.
    """

    # one space-separated "document" of events per writer
    events_by_writer = (
        X
        .groupby('id')
        ['down_event']
        .agg(down_event_seq=" ".join)
        .reset_index()
        )
    documents = events_by_writer['down_event_seq'].to_list()

    if vocabulary:
        vectorizer = CountVectorizer(
            input='content', ngram_range=(1,1), vocabulary=vocabulary
            )

    elif is_training_run:
        vectorizer = CountVectorizer(input='content', ngram_range=(1,1))
        vectorizer.fit(documents)
        with open("pipeline_events_vectorizer.pkl", "wb") as handle:
            pickle.dump(vectorizer, handle)

    else:
        with open("pipeline_events_vectorizer.pkl", "rb") as handle:
            vectorizer = pickle.load(handle)

    event_counts = vectorizer.transform(documents).toarray()

    return pd.DataFrame(
        event_counts,
        index=events_by_writer['id'].values,
        columns=[
            'event' + str(position)
            for position in range(event_counts.shape[1])
            ]
        )
    

In [None]:
def _tabulate_bins(frame, values_col, bins, name_template, denominator_col):
    """
    For each half-open [low, high) bin, count per row how many entries of the
    array stored in `values_col` fall in the bin, and that count's share of
    `denominator_col`. Columns are added to `frame` in place.

    Returns the created column names: count column, then its "_frac" twin,
    bin by bin.
    """
    created = []
    for geq_low, lt_high in bins:
        bin_var = name_template.format(low=geq_low, high=lt_high)
        frame[bin_var] = (
            frame[values_col]
            .apply(lambda x: ( (x >= geq_low) & (x < lt_high) ).sum() )
            )
        frame[bin_var + "_frac"] = frame[bin_var] / frame[denominator_col]
        created += [bin_var, bin_var + "_frac"]
    return created


def aggregate_essay_text_features(X):
    """
    Aggregates covering final writing product, not writing process narrowly.

    Rebuilds each writer's essay from the logs in X, then derives paragraph,
    sentence, word and punctuation statistics.

    Returns a DataFrame indexed by 'id'.

    Behavior change vs. the previous version: n_paragraphs used to count
    newline *runs* (paragraph breaks), under-counting every multi-paragraph
    essay by one and letting the paragraph "_frac" features exceed 1.
    It now counts the non-blank paragraphs themselves (minimum 1).
    Models fitted on the old values of n_paragraphs / the paragraph
    "_frac" features should be re-fitted.
    """

    # raw strings: "\." and friends are invalid escapes in ordinary literals
    sentence_end = r"[\.]+|[?]+|[!]+"
    paragraph_break = r"[\n]+"

    essays_text = pd.concat(
        [concatenate_essay_from_logs(x) for _, x in X.groupby('id')], axis=0
        ).reset_index(drop=True)

    # two consecutive newlines constitute one effective break.
    # blank pieces (from leading/trailing newlines) are not paragraphs;
    # an essay without any text still counts as a single (empty) paragraph
    essays_text['paragraphs'] = (
        essays_text['essay']
        .str.split(paragraph_break, regex=True)
        .apply(lambda pieces: [p for p in pieces if p.strip()] or [""])
    )
    essays_text['n_paragraphs'] = essays_text['paragraphs'].apply(len)
    essays_text['n_sentences_by_paragraph'] = (
        essays_text['paragraphs']
        .apply(lambda paragraphs: np.array([
            len(re.findall(sentence_end, p))
            for p in paragraphs
            ])
            )
        )
    # for bounds guidance, see overall distribution
    varnames_n_paragraphs_by_n_sentences_bin = _tabulate_bins(
        essays_text,
        values_col='n_sentences_by_paragraph',
        bins=[
            (0, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7),
            (7, 10), (10, 20), (20, 50)
            ],
        name_template='n_paragraphs_with_n_sentences_geq{low}_lt{high}',
        denominator_col='n_paragraphs'
        )


    # sentences split can leave last hanging ' ',
    # if not scrubbed by search for 'q'
    essays_text['sentences'] = (
        essays_text['essay']
        .str.split(sentence_end, regex=True)
        .apply(lambda sentences: [s for s in sentences if 'q' in s])
    )
    essays_text['n_sentences'] = essays_text['sentences'].apply(len)

    essays_text['words_by_sentence'] = (
        essays_text['sentences']
        .apply(lambda sentences: [s.split() for s in sentences])
    )
    essays_text['i_words_by_sentence'] = (
        essays_text['words_by_sentence']
        .apply(lambda sentences: np.array([len(s) for s in sentences]))
    )

    # for bounds guidance, see overall distribution
    varnames_n_sentences_by_word_count_bin = _tabulate_bins(
        essays_text,
        values_col='i_words_by_sentence',
        bins=[
            (0, 5), (5, 10), (10, 15), (15, 20), (20, 25), (25, 30),
            (30, 50), (50, 5000)
            ],
        name_template='n_sentences_words_geq{low}_lt{high}',
        denominator_col='n_sentences'
        )


    essays_text['words'] = essays_text['essay'].str.split(" +", regex=True)
    essays_text["word_count_reconstructed"] = essays_text["words"].apply(len)
    essays_text["words_length"] = (
        essays_text["words"]
        .apply(lambda x: np.array([len(a) for a in x]))
    )

    # for bounds guidance, see distribution of word lengths
    varnames_i_words_by_length_bin = _tabulate_bins(
        essays_text,
        values_col='words_length',
        bins=[
            (0, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8),
            # "incomprehensible" is a reasonable, long (21-char) word
            (8, 25),
            (25, 500)
            ],
        name_template='words_length_geq{low}_lt{high}',
        denominator_col='word_count_reconstructed'
        )


    essays_text['n_thought_delimiting_punctuation'] = (
        essays_text
        ['essay']
        .str
        .count(r"[\.]+|[?]+|[!]+|[,]+|[-]+|[;]+|[:]+|[—]+")
        )
    essays_text["words_per_thought_delimiting_punctuation_avg"] = (
        essays_text["word_count_reconstructed"] /
        essays_text['n_thought_delimiting_punctuation']
    )
    essays_text['n_commas'] = essays_text['essay'].str.count("[,]")
    essays_text['n_dashes'] = essays_text['essay'].str.count("[-]")
    essays_text['n_semicolons'] = essays_text['essay'].str.count("[;]")
    essays_text['n_questions'] = essays_text['essay'].str.count("[?]")
    essays_text['n_exclaims'] = essays_text['essay'].str.count("[!]")

    essays_text['n_parenthetical_punctuation'] = (
        essays_text
        ['essay']
        .str
        .count(r"\(|\)|\[|\]|\*|{|}")
    )

    essays_text['n_quant_punctuation'] = (
        essays_text['essay'].str.count(r"=|>|<|\$|\%|\+")
        )

    essays_text['n_apostrophe'] = essays_text['essay'].str.count("'")

    essays_text['n_quotes'] = essays_text['essay'].str.count('"')

    essays_text['n_shortening_punctuation'] = (
        essays_text['essay'].str.count("&|@")
        )

    essays_text['n_characters'] = essays_text['essay'].str.len()


    for var in ['i_words_by_sentence', 'words_length']:
        essays_text[f"{var}_mean"] = essays_text[var].apply(lambda x: x.mean())
        essays_text[f"{var}_p50"] = (
            essays_text[var].apply(lambda x: np.nanquantile(x, 0.5))
            )
        essays_text[f"{var}_stddev"] = essays_text[var].apply(lambda x: x.std())


    aggregates_essay_text = essays_text[[
        'id',
        'n_paragraphs',
        'n_sentences',

        'n_thought_delimiting_punctuation',
        "words_per_thought_delimiting_punctuation_avg",
        'n_parenthetical_punctuation',
        'n_quant_punctuation',
        'n_apostrophe',
        'n_quotes',
        'n_shortening_punctuation',
        "n_commas",
        "n_dashes",
        "n_semicolons",
        "n_questions",
        "n_exclaims",

        "n_characters",

        "i_words_by_sentence_mean",
        "words_length_mean",
        "i_words_by_sentence_p50",
        "words_length_p50",
        "i_words_by_sentence_stddev",
        "words_length_stddev"
        ]

        + varnames_n_paragraphs_by_n_sentences_bin

        + varnames_n_sentences_by_word_count_bin

        # for word lengths only the shares are kept, not the raw counts
        + [x for x in varnames_i_words_by_length_bin if '_frac' in x]

        ]
    aggregates_essay_text = aggregates_essay_text.set_index('id')

    return aggregates_essay_text

In [None]:
# How a log field is summarized depends on the kind of quantity it holds:
#   - a quantity that accumulates is summed;
#     for discrete events, frequency per unit time is meaningful as well
#   - a quantity whose distribution is interesting is summarized;
#     a continuous one is described completely by histogramming

# event indicators / counts: summed per writer (and per time window)
event_vars_sum = [
    *[f'activity_{x}' for x in ACTIVITY_CATEGORIES],
    'is_new_burst_start',
    *[f'is_new_burst_start_{x}' for x in ACTIVITY_CATEGORIES],
    *[f'is_new_activity_streak_start_{x}' for x in ACTIVITY_CATEGORIES],
    'word_count_delta',
]

# continuous durations that also make sense as totals
conti_vars_sum = ["preceding_pause_time", "latency_time"]

# variables summarized through their distribution (quantile bins, mean, ...)
distribution_vars = [
    'latency_time',
    'preceding_pause_time',
    'cursor_position_delta',
    'word_count_delta_burst_thin',
    'word_count_delta_activity_streak_thin',
    'activity_streak_length_thin',
    'cursor_position_vs_max',
]


def aggregate_no_time_dependence_measures(X, is_training_run):
    """
    Aggregate measures irrespective of time dependence, one row per writer.
    Ex: sum of inputs over entire essay.

    Parameters
    ----------
    X : pandas.DataFrame
        Enriched event logs (output of enrich_logs).
    is_training_run : bool
        True: fit the quantile discretizer on X and pickle it to
        "pipeline_no_time_dep_discretizer.pkl". False: load that pickle.

    Returns
    -------
    pandas.DataFrame
        Indexed by id. Holds, in order: per-writer sums (event indicators,
        one-hot quantile bins of distribution_vars, raw pause/latency totals,
        delete_insert_ratio); mean / median / stddev of distribution_vars and
        a few pause / time summaries; lognormal fit parameters of pause and
        latency times; essay text aggregates; per-second event rates;
        keystroke_speed and pause_time_fraction.
    """

    # discretizing conti var allows sum of vars, as though they were events.
    # because discretization expands columns via one-hot,
    # reduce dataset to small-as-possible.
    # extracting non-float id allows ColumnTransformer's properly typed numpy
    X_attributes = X[['id']]
    X_to_sum = X[event_vars_sum + distribution_vars]
    X_orig_to_sum = X_to_sum[conti_vars_sum].copy()

    if is_training_run:

        pipeline = ColumnTransformer(
            transformers=[(
                'discretizer',
                preprocessing.KBinsDiscretizer(
                    n_bins=10,
                    encode='onehot-dense',
                    strategy='quantile'
                    ),
                distribution_vars
            )],
            remainder='passthrough',
            verbose_feature_names_out=False
            )

        # if nulls not explicitly handled, Exception raises
        pipeline.fit(X_to_sum.fillna(-1))
        with open("pipeline_no_time_dep_discretizer.pkl", "wb") as f:
            pickle.dump(pipeline, f)

    else:
        # NOTE(review): unpickling runs arbitrary code; only load artifacts
        # written by a trusted training run of this notebook
        with open("pipeline_no_time_dep_discretizer.pkl", "rb") as f:
            pipeline = pickle.load(f)

    # follow pipeline fit nulls treatment.
    # the transformer returns a bare array: re-attach X's index so the
    # column-wise concat below pairs rows by label even when X does not
    # carry a default 0..n-1 index
    X_to_sum = pd.DataFrame(
        pipeline.transform(X_to_sum.fillna(-1)),
        columns=pipeline.get_feature_names_out(),
        index=X.index
        )
    X_to_sum = pd.concat([X_attributes, X_to_sum, X_orig_to_sum], axis=1)

    # with distribution_vars discretized, everything sums.
    # string alias: same result as the builtin `sum` pandas used to translate,
    # without the deprecation
    sums_over_time = X_to_sum.groupby('id').agg('sum')
    # no inputs at all would give inf; treat as missing, consistent with the
    # per-window ratio in aggregate_time_variability_measures
    sums_over_time['delete_insert_ratio'] = (
        sums_over_time['activity_Remove/Cut'] /
        sums_over_time['activity_Input']
        ).replace(np.inf, np.nan)
    del X_to_sum


    expr = {}
    for var in distribution_vars:
        expr[f"{var}_mean"] = (var, 'mean')
        # aliases keep what pandas computed for np.median / np.std
        # (NaN-skipping median, sample stddev)
        expr[f"{var}_p50"] = (var, 'median')
        expr[f"{var}_stddev"] = (var, 'std')
    expr['preceding_pause_time_max'] = ('preceding_pause_time', 'max')
    expr['initial_pause_time_max'] = ('preceding_pause_time_start_window', 'max')
    expr["total_time"] = ('up_time', 'max')
    expr['is_time_beyond_expected_max'] = ('is_time_beyond_expected_max', 'max')

    distribution_summaries = X.groupby('id').agg(**expr)
    distribution_summaries['is_initial_pause_max_pause'] = (
        distribution_summaries['preceding_pause_time_max'] ==
        distribution_summaries['initial_pause_time_max']
        ).astype(int)


    aggregates_essay_text = aggregate_essay_text_features(X)


    # literature finds information in pauses' lognorm distribution
    subject_ids = []
    mle_by_subject = []
    for subject_id, X_subject in X.groupby('id'):

        mle_by_var = {}
        for var in ['preceding_pause_time', 'latency_time']:
            shape, location, scale = lognorm.fit(X_subject[var].dropna())
            mle_by_var[f"{var}_lognorm_shape"] = shape
            mle_by_var[f"{var}_lognorm_location"] = location
            mle_by_var[f"{var}_lognorm_scale"] = scale

        subject_ids.append(subject_id)
        mle_by_subject.append(mle_by_var)

    distr_params_over_time = (
        pd.DataFrame(mle_by_subject, index=subject_ids).fillna(-1)
        )


    aggregates_over_time = sums_over_time
    for per_writer_block in [
        distribution_summaries,
        distr_params_over_time,
        aggregates_essay_text
        ]:
        aggregates_over_time = pd.merge(
            aggregates_over_time,
            per_writer_block,
            how='left',
            left_index=True,
            right_index=True
            )


    # total_time is in milliseconds: scale to events per second
    for var in event_vars_sum:

        aggregates_over_time[var + '_per_s'] = MS_PER_S*(
            (aggregates_over_time[var] / aggregates_over_time['total_time'])
            )

    aggregates_over_time = (
        aggregates_over_time
        .assign(
            keystroke_speed = lambda x: (x.activity_Input + x['activity_Remove/Cut']) / x.total_time,
            pause_time_fraction = lambda x: x.preceding_pause_time / x.total_time
            )
        )


    return aggregates_over_time

In [None]:
def aggregate_time_variability_measures(
    X, aggregates_over_time, is_training_run
    ):
    """
    Tabulate author's measures by fixed time window (ex: 30-second increments),
    and derive features from that by-time window distribution.

    Use over-time aggregates to normalize select by-time window tabulations.

    Parameters
    ----------
    X : pandas.DataFrame
        Enriched event logs; needs 'id', 'window_30s' and every column named
        in event_vars_sum and conti_vars_sum.
    aggregates_over_time : pandas.DataFrame
        Per-writer aggregates indexed by id; needs the event_vars_sum and
        conti_vars_sum totals plus a '<var>_per_s' rate per event variable.
    is_training_run : bool
        True: fit the quantile discretizer and pickle it to
        "pipeline_time_dep_discretizer.pkl". False: load that pickle.

    Returns
    -------
    pandas.DataFrame
        Indexed by id: counts of the writer's windows per quantile bin
        ('..._time_window'), entropy of each window-share variable
        ('..._entropy'), and correlation of each summed variable with the
        window index ('..._ttrend').
    """

    # need to sum events, conti vars by fixed-time window.
    # ensure a writer's fixed-time windows are all used -- drop excess ones.
    # for events, normalize by overall average event rates, & overall sums.
    # for conti var, normalize by overall sums.

    # then, over time windows, compute percentiles. this is novel for event vars,
    # which lack percentiles over all time. p90_time_window

    summed_vars = event_vars_sum + conti_vars_sum

    # observed=False is spelled out: the trimming below relies on every
    # window category being tabulated for every writer
    sums_by_window = (
        X
        .groupby(['id', 'window_30s'], observed=False)
        [summed_vars]
        .agg('sum')
        .astype(float)
        .fillna(0)
        .reset_index(drop=False)
    )
    sums_by_window['delete_insert_ratio'] = (
        sums_by_window['activity_Remove/Cut'] /
        sums_by_window['activity_Input']
        ).replace(np.inf, np.nan)


    # every categorical time window ever observed across data
    # tabulates for every writer. instead, per writer, truncate to time windows
    # actually used.
    sums_by_window['has_activity'] = (
        sums_by_window
        [['activity_' + x for x in ACTIVITY_CATEGORIES]].sum(axis=1)
        > 0
    )
    sums_by_window['idx_window_by_id'] = (
        sums_by_window
        .groupby('id')
        .cumcount()
    )
    sums_by_window['idx_has_activity'] = np.where(
        sums_by_window['has_activity'],
        sums_by_window['idx_window_by_id'],
        np.nan
        )
    sums_by_window['idx_activity_max'] = (
        sums_by_window
        .groupby(['id'])
        ['idx_has_activity']
        .transform('max')
    )
    sums_by_window = (
        sums_by_window
        .loc[sums_by_window['idx_window_by_id'] <= sums_by_window['idx_activity_max']]
        .drop(columns=['has_activity', 'idx_has_activity', 'idx_activity_max'])
        # dropping rows leaves gaps in the index; renumber so nothing
        # downstream can mistake row labels for row positions
        .reset_index(drop=True)
    )


    # for variability measure more comparable between writers, de-mean by writer.
    # Ex: higher-throughput writer incurs higher stddev,
    # because values have higher magnitude.

    # join method allows for merge on one index column, of multiple possible
    rate_cols = [x + '_per_s' for x in event_vars_sum]
    sums_by_window = sums_by_window.join(
        aggregates_over_time[rate_cols],
        on='id',
        how='left'
        )
    for var in event_vars_sum:
        # events expected in one window at the writer's overall rate.
        # a zero rate becomes NaN (np.nan, not None: replace(0, None) is
        # version-dependent in pandas), and the undefined ratio defaults to 1
        expected_per_window = (
            sums_by_window[var + '_per_s'].replace(0, np.nan)
            * SECONDS_PER_WINDOW
            )
        sums_by_window[var + '_time_norm'] = (
            sums_by_window[var] / expected_per_window
            ).fillna(1)
    sums_by_window = sums_by_window.drop(columns=rate_cols)

    total_cols = [x + '_total' for x in summed_vars]
    sums_over_time_ren = aggregates_over_time[summed_vars].copy()
    sums_over_time_ren.columns = total_cols
    sums_by_window = sums_by_window.join(sums_over_time_ren, on='id', how='left')
    for var in summed_vars:
        sums_by_window[var + '_frac_total'] = (
            sums_by_window[var] /
            sums_by_window[var + '_total'].replace(0, np.nan)
            ).fillna(1)
    sums_by_window = sums_by_window.drop(columns=total_cols)


    distr_vars = (
        event_vars_sum
        + conti_vars_sum
        + [var + '_time_norm' for var in event_vars_sum]
        + [var + '_frac_total' for var in event_vars_sum]
        + [var + '_frac_total' for var in conti_vars_sum]
        )
    X_attributes = sums_by_window[['id']]
    X_to_sum = sums_by_window[distr_vars]
    if is_training_run:

        pipeline = ColumnTransformer(
            transformers=[(
                'discretizer',
                preprocessing.KBinsDiscretizer(
                    n_bins=10,
                    encode='onehot-dense',
                    strategy='quantile'
                    ),
                distr_vars
            )],
            remainder='passthrough',
            verbose_feature_names_out=False
            )

        # if nulls not explicitly handled, Exception raises
        pipeline.fit(X_to_sum.fillna(-1))
        with open("pipeline_time_dep_discretizer.pkl", "wb") as f:
            pickle.dump(pipeline, f)

    else:
        # NOTE(review): unpickling runs arbitrary code; only load artifacts
        # written by a trusted training run of this notebook
        with open("pipeline_time_dep_discretizer.pkl", "rb") as f:
            pipeline = pickle.load(f)

    # follow pipeline fit nulls treatment.
    # the transformer returns a bare array: give it the same index as the
    # ids it is concatenated with. previously it received a fresh 0..n-1
    # index while the ids kept the post-trimming (gappy) one, so the
    # column-wise concat paired ids with the wrong rows
    X_to_sum = pd.DataFrame(
        pipeline.transform(X_to_sum.fillna(-1)),
        columns=pipeline.get_feature_names_out(),
        index=sums_by_window.index
        )
    X_to_sum = pd.concat([X_attributes, X_to_sum], axis=1)

    # with distribution_vars discretized, everything sums
    distr_summaries = X_to_sum.groupby('id').agg('sum')
    distr_summaries.columns = [
        x + '_time_window' for x in distr_summaries.columns
        ]


    entropy_by_window = (
        sums_by_window
        .groupby(['id'])
        [[var for var in sums_by_window.columns if 'frac_total' in var]]
        .agg(lambda x: entropy(x.value_counts()))
        )
    entropy_by_window.columns = [
        x + '_entropy'
        for x in entropy_by_window.columns
        ]


    trend_by_window = (
        sums_by_window
        .sort_values(['id', 'idx_window_by_id'])
        .drop(columns=['window_30s'])
        .groupby(['id'])
        [['idx_window_by_id'] + summed_vars]
        .corr()
        )
    trend_by_window = trend_by_window.fillna(0)
    # extract correlations strictly with time index
    trend_by_window = trend_by_window.xs('idx_window_by_id', level=1)
    trend_by_window.columns = [x + "_ttrend" for x in trend_by_window.columns]


    vari_by_window = pd.merge(
        distr_summaries,
        entropy_by_window,
        how='left',
        left_index=True,
        right_index=True
        )

    vari_by_window = pd.merge(
        vari_by_window,
        trend_by_window,
        how='left',
        left_index=True,
        right_index=True
        )


    return vari_by_window

In [None]:
def feature_transform_pipeline(X_logs, is_training_run):
    """
    Run every feature-building stage on the logs and join the results.

    Parameters
    ----------
    X_logs : log records; passed straight to `enrich_logs`.
    is_training_run : flag forwarded unchanged to every stage
        (the notebook calls this with True for the train subset and
        False for the test subset).

    Returns
    -------
    DataFrame whose rows are those of the no-time-dependence aggregates,
    widened by index-aligned left merges with the time-variability
    measures, the text vectorization and the events vectorization
    (in that order).
    """
    logs_enriched = enrich_logs(X_logs, is_training_run)

    # Stage order is kept as-is: text, events, plain aggregates, then the
    # window measures (which also consume the plain aggregates).
    text_features = vectorize_essay_text(logs_enriched, is_training_run)
    event_features = vectorize_events(logs_enriched, is_training_run)
    base_features = aggregate_no_time_dependence_measures(
        logs_enriched, is_training_run
        )
    window_features = aggregate_time_variability_measures(
        logs_enriched, base_features, is_training_run
        )

    # Left merges on the index: the base aggregates decide which rows survive.
    X_transform = base_features
    for feature_block in (window_features, text_features, event_features):
        X_transform = pd.merge(
            X_transform,
            feature_block,
            how='left',
            left_index=True,
            right_index=True
            )

    return X_transform

In [None]:
# hypothesis: deriving feature transform pipeline from entire data,
# then splitting and conducting feature selection,
# promotes data leakage (overfitting).
# consider few validation cases with rare one-hot or vectorized feature,
# and also anomalous outcome.
# model may overweight relationship between those rare features & anomalous outcome.

# overfit happens when a model overweights an X-y relationship.
# current process:
# - over the entire train data, fit and apply the raw feature transforms
#   (nonparametric transformers, like vectorizers, will learn from the validation set)
# - split train data
# - feature selection: permutation importance strictly on the train split
# - hypertune: use all features (some may exist only because of validation rows,
#   yet they leak into the train split's feature set)
#   example: 10-letter words exist only in the validation set

# we know there are material outliers in the validation set.
# suppose a couple of rare features occur on those outliers.
# those features are made available during training because the feature
# transform pipeline was fit on the full train batch [this would not have been
# the case if we had fit the pipeline only on the train subset].
# then, in hyperparameter tuning, the optimization routine thinks it has found
# signal when use of a rare feature decreases validation set loss.
# however, that rare feature would not even have been available for model fit
# had the pipeline been fit strictly on the train subset.

In [None]:
from sklearn.model_selection import train_test_split

X_logs = extract(PATH_TRAIN_LOGS)

# The split is made on ids, not on log rows, and the feature pipeline is
# later fit on the train ids only: fitting it on everything and splitting
# afterwards risks data leakage.
# The fixed random_state keeps the split consistent for other scripts.
ids = X_logs[['id']].drop_duplicates().reset_index(drop=True)
ids_train, ids_test = (
    id_subset.reset_index(drop=True)
    for id_subset in train_test_split(ids, test_size=0.33, random_state=777)
)

# Inner merge on the shared 'id' column keeps only the rows of each subset.
X_train_logs = X_logs.merge(ids_train, how='inner').reset_index(drop=True)
X_test_logs = X_logs.merge(ids_test, how='inner').reset_index(drop=True)
del X_logs

In [None]:
# Train subset goes through the pipeline with is_training_run=True.
X_train = feature_transform_pipeline(
    X_logs=X_train_logs,
    is_training_run=True,
)
del X_train_logs  # raw train logs are not needed past this point

In [None]:
# Test subset goes through the same pipeline with is_training_run=False.
X_test = feature_transform_pipeline(
    X_logs=X_test_logs,
    is_training_run=False,
)
del X_test_logs  # raw test logs are not needed past this point

In [None]:
# A column with zero standard deviation is constant and cannot inform a model.
# Only X_train is pruned here; X_test is narrowed later, when both frames
# are subset to `features_keep`.
zero_var_mask = X_train.std() == 0
has_zero_var_col = zero_var_mask[zero_var_mask].index.tolist()
X_train = X_train.drop(columns=has_zero_var_col)

In [None]:
# Sanity check: the training feature matrix must contain no missing values.
# The builtin `all(df)` iterates over column *labels*, so it would be truthy
# for any frame with non-empty labels; inspect the cell values instead.
cols_with_nulls = X_train.columns[X_train.isnull().any()].tolist()
assert not cols_with_nulls, (
    f"X_train has nulls in {len(cols_with_nulls)} column(s), "
    f"e.g. {cols_with_nulls[:10]}"
)

In [None]:
# Outcome table: indexed by id, with the score column renamed to "y".
y = (
    pd.read_csv(PATH_TRAIN_OUTCOMES)
    .set_index("id")
    .rename(columns={"score": "y"})
)

# Left merge on the index so each target series has the same rows as its
# feature matrix; only the "y" column of the merged frame is kept.
y_train = X_train.merge(y, how="left", left_index=True, right_index=True)['y']
y_test = X_test.merge(y, how="left", left_index=True, right_index=True)['y']

del y

In [None]:
if FEATURES_PRESELECTED:
    # Feature list was chosen offline; skip the expensive selection below.
    features_keep = FEATURES_PRESELECTED

else:

    # Expect a large universe of possible features -- then optuna runs very
    # slowly and model fitting generally becomes an issue, on top of the
    # concern about noise features.
    # Use a random forest plus permutation importance for feature selection.

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.inspection import permutation_importance

    N_TOP_FEATURES_KEEP = 1000
    # Seed both stochastic steps so the selected feature set is reproducible
    # across runs (same seed value as the train/test split).
    FEATURE_SELECTION_SEED = 777

    model = RandomForestRegressor(
        n_estimators=500,
        max_features="sqrt",
        max_depth=None,
        random_state=FEATURE_SELECTION_SEED,
    )
    model.fit(X_train, y_train.values)

    # Importance is measured strictly on the train subset.
    result = permutation_importance(
        model,
        X_train,
        y_train,
        n_repeats=5,
        n_jobs=-1,
        random_state=FEATURE_SELECTION_SEED,
    )
    feature_imp = (
        pd.DataFrame({
            'feature': X_train.columns,
            'score': result.importances_mean
            })
        .sort_values('score', ascending=False)
        .reset_index(drop=True)
    )

    # Persist the full ranking so the selection can be reused offline.
    feature_imp.to_csv("feature_selection_importances.csv", index=False)

    # Plain list, matching the type of the preselected branch.
    features_keep = feature_imp['feature'].iloc[0:N_TOP_FEATURES_KEEP].tolist()

In [None]:
# Restrict both feature matrices to the same selected columns, in the same order.
X_train, X_test = (
    feature_matrix[features_keep]
    for feature_matrix in (X_train, X_test)
)

In [None]:
# Persist the processed train/test features and targets for downstream scripts.
for _processed, _destination in (
    (X_train, "./data/processed/X_train.pkl"),
    (y_train, "./data/processed/y_train.pkl"),
    (X_test, "./data/processed/X_test.pkl"),
    (y_test, "./data/processed/y_test.pkl"),
):
    _processed.to_pickle(_destination)