In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


In [95]:
# Load data into DataFrames
train_logs = pd.read_csv('input/train_logs.csv')
test_logs = pd.read_csv('input/test_logs.csv')
train_scores = pd.read_csv('input/train_scores.csv')
train_essays = pd.read_csv('output/train_essays.csv')

In [87]:
# Establish some variables and functions for testing and evaluating

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

rs = 0
def round_scores(scores):
    scores = ((scores * 2) // 1) * .5
    scores = np.where(scores < 0, 0.0, scores)
    scores = np.where(scores > 6, 6.0, scores)
    return scores

We will extract several different kinds of features and add them to separate dataframes
1. Word frequency (using TF-IDF scores) from the "words" of the reconstructed essays
2. Aggregations of events, text changes, &c along with event index gaps
3. Gap info

In [85]:
# Begin with "word" analysis

from sklearn.feature_extraction.text import CountVectorizer

def get_tdidf(essay_df):
    # Get counts of all 1, 2, and 3-grams
    count_vectorizer = CountVectorizer(ngram_range=(1, 3))
    X_tokenizer_train = count_vectorizer.fit_transform(essay_df['essay']).todense()

    # Create a matrix of 1,0 values to keep track of which grams appear in which essays at least once
    X_tokenizer_reduced = X_tokenizer_train.copy()
    for i in reversed(range(X_tokenizer_reduced.shape[1])):
        X_tokenizer_reduced[:,i] = np.where(X_tokenizer_reduced[:,i] > 0, 1, 0)

    # Create tdidf DataFrame
    col_names = [f'word_group_{i}' for i in range(X_tokenizer_train.shape[1])]
    X_tokenizer_final = pd.DataFrame(X_tokenizer_train.copy(), columns=col_names)
    idfs = []
    for i in range(X_tokenizer_final.shape[1]):
        idfs.append(np.log(X_tokenizer_final.shape[0] / np.sum(X_tokenizer_reduced[:,i])))
    def compute_tfidf(row):
        sum = np.sum([row[col] for col in col_names])
        for i in range(X_tokenizer_train.shape[1]):
            row[f'word_group_{i}'] = row[f'word_group_{i}'] * idfs[i] / sum
        return row
    X_tokenizer_final = X_tokenizer_final.apply(compute_tfidf, axis = 1)
    return X_tokenizer_final


In [86]:
train_ngram_tdidf = get_tdidf(train_essays)
train_ngram_tdidf.to_csv('output/train_ngram_tdidf.csv')
train_ngram_tdidf.head(3)

Unnamed: 0,word_group_0,word_group_1,word_group_2,word_group_3,word_group_4,word_group_5,word_group_6,word_group_7,word_group_8,word_group_9,...,word_group_3385,word_group_3386,word_group_3387,word_group_3388,word_group_3389,word_group_3390,word_group_3391,word_group_3392,word_group_3393,word_group_3394
0,0.0,0.0,0.0,0.0,0.000152,0.000928,0.000472,0.001125,0.000711,0.00077,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000141,0.000753,0.000767,0.000609,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,8.4e-05,0.000512,0.00026,0.000207,0.000392,0.000849,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
# Establish baseline model prediction

X, y = train_ngram_tdidf, train_scores['score']
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=rs)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.25, random_state=rs)

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_predict_train = round_scores(model.predict(X_train))
y_predict_valid = round_scores(model.predict(X_valid))
print('MAE: training set')
print(mean_squared_error(y_train, y_predict_train, squared=False))
print('MAE: validation set')
print(mean_squared_error(y_valid, y_predict_valid, squared=False))

MAE: training set
0.4933771910329651
MAE: validation set
0.7763586758208265


Next, evaluate gap info

In [193]:
def get_iki_info(logs_df):
    logs_df['prev_up_time'] = logs_df.groupby('id')['up_time'].shift(1).fillna(logs_df['down_time'])
    logs_df['inter_keystroke_intvl'] = (abs(logs_df['down_time'] - logs_df['prev_up_time'])) / 1000
    group = logs_df.groupby('id')['inter_keystroke_intvl']
    total_ikis = group.count()
   
    data =  pd.DataFrame({
        'id': logs_df['id'].unique(),
        'longest_iki': group.max(),
        'median_iki': group.median(),
        'mean_iki': group.mean(),
        'initial_pause': group.apply(lambda ikis: max(ikis.values[:10])),
        'start_time': logs_df.groupby('id')['down_time'].first()
    }).reset_index(drop=True)

    ms_thresholds = [100, 300, 500, 1000, 2000, 10000, 30000, 60000]
    for ms in ms_thresholds:
        def get_runs(ikis):
            runs = []
            curr_run = 0
            for i, val in enumerate(ikis.values[1:]):
                if val > ms / 1000 or i == len(ikis.values) - 2:
                    if curr_run >= 2:
                        runs.append(curr_run)
                    curr_run = 0
                else:
                    curr_run += 1
            return runs
        runs_series = group.apply(get_runs)
        # count total number of ikis in runs (not number of runs)
        data[f'{ms}ms_run_count'] = runs_series.apply(lambda runs: np.sum(runs)).values
        data[f'{ms}ms_long_run_count'] = runs_series.apply(lambda runs: np.sum([n for n in runs if n >= 10])).values
        # count longest run
        data[f'{ms}ms_max_run'] = runs_series.apply(lambda runs: max(runs)).values
        # count portion of ikis under threshold
        num_ikis = group.apply(lambda x: len([n for n in x.values if n < ms / 1000])).values
        data[f'portion_ikis_under_{ms}ms'] = num_ikis / total_ikis.values
    return data.drop('id', axis=1)
iki_info = get_iki_info(train_logs[['id', 'up_time', 'down_time']])
iki_info.head(3)


Unnamed: 0,longest_iki,median_iki,mean_iki,initial_pause,start_time,100ms_run_count,100ms_long_run_count,100ms_max_run,portion_ikis_under_100ms,300ms_run_count,...,10000ms_max_run,portion_ikis_under_10000ms,30000ms_run_count,30000ms_long_run_count,30000ms_max_run,portion_ikis_under_30000ms,60000ms_run_count,60000ms_long_run_count,60000ms_max_run,portion_ikis_under_60000ms
0,154.136,0.062,0.610944,101.609,4526,1618,245.0,24,0.686742,2065,...,389,0.991396,2549,2549,1338,0.998045,2552,2552,2500,0.999218
1,145.899,0.061,0.620108,1.696,30623,1603,640.0,61,0.689487,2047,...,770,0.988183,2443,2431,1459,0.99674,2449,2449,2321,0.998778
2,153.886,0.04,0.360506,16.736,4441,3333,2300.0,39,0.821809,3794,...,681,0.995164,4123,4123,766,0.99734,4130,4130,2528,0.999033


In [196]:
# Establish baseline model prediction

X, y = iki_info, train_scores['score']
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=rs)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.25, random_state=rs)

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_predict_train = round_scores(model.predict(X_train))
y_predict_valid = round_scores(model.predict(X_valid))
print('MAE: training set')
print(mean_squared_error(y_train, y_predict_train, squared=False))
print('MAE: validation set')
print(mean_squared_error(y_valid, y_predict_valid, squared=False))

MAE: training set
0.6096114999216363
MAE: validation set
0.7899290184890109
