In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


In [391]:
# Load data into DataFrames
train_logs = pd.read_csv('input/train_logs.csv')
test_logs = pd.read_csv('input/test_logs.csv')
train_scores = pd.read_csv('input/train_scores.csv')
train_essays = pd.read_csv('output/train_essays.csv')

In [198]:
# Establish some variables and functions for testing and evaluating

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

rs = 0

We will extract several different kinds of features and add them to separate dataframes
1. Word frequency (using TF-IDF scores) from the "words" of the reconstructed essays
2. Aggregations of events, text changes, &c along with event index gaps
3. Gap info

In [300]:
# Begin with "word" analysis

from sklearn.feature_extraction.text import CountVectorizer

def get_tdidf(essay_df):
    # Get counts of all 1, 2, and 3-grams
    count_vectorizer = CountVectorizer(ngram_range=(1, 3))
    X_tokenizer_train = count_vectorizer.fit_transform(essay_df['essay']).todense()
    group_names = count_vectorizer.get_feature_names_out()
    # Create a matrix of 1,0 values to keep track of which grams appear in which essays at least once
    X_tokenizer_reduced = X_tokenizer_train.copy()
    for i in reversed(range(X_tokenizer_reduced.shape[1])):
        X_tokenizer_reduced[:,i] = np.where(X_tokenizer_reduced[:,i] > 0, 1, 0)

    # Create tdidf DataFrame
    col_names = [f'word_group_{group_names[i]}' for i in range(X_tokenizer_train.shape[1])]
    X_tokenizer_final = pd.DataFrame(X_tokenizer_train.copy(), columns=col_names)
    idfs = []
    for i in range(X_tokenizer_final.shape[1]):
        idfs.append(np.log(X_tokenizer_final.shape[0] / np.sum(X_tokenizer_reduced[:,i])))
    def compute_tfidf(row):
        sum = np.sum([row[col] for col in col_names])
        for i in range(X_tokenizer_train.shape[1]):
            row[f'word_group_{group_names[i]}'] = row[f'word_group_{group_names[i]}'] * idfs[i] / sum
        return row
    X_tokenizer_final = X_tokenizer_final.apply(compute_tfidf, axis = 1)
    return X_tokenizer_final


In [304]:
train_ngram_tdidf = get_tdidf(train_essays)
train_ngram_tdidf.to_csv('output/train_ngram_tdidf.csv')
train_ngram_tdidf.head(3)

Unnamed: 0,word_group__qqqq,word_group__qqqq qqqqqqqqq,word_group__qqqq qqqqqqqqq qqqqqqqqqqq__,word_group_qq,word_group_qq qq,word_group_qq qq qq,word_group_qq qq qqq,word_group_qq qq qqqq,word_group_qq qq qqqqq,word_group_qq qq qqqqqq,...,word_group_qqqqqâ qqqqqq qq,word_group_qåäqqqqqqqqqq,word_group_qåäqqqqqqqqqq qqq,word_group_qåäqqqqqqqqqq qqq qqqqq,word_group_äq,word_group_äq qq,word_group_äq qq qq,word_group_ëqqqqqqqqq,word_group_ëqqqqqqqqq qqq,word_group_ëqqqqqqqqq qqq qqqqqqq
0,0.0,0.0,0.0,0.0,0.000152,0.000928,0.000472,0.001125,0.000711,0.00077,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000141,0.000753,0.000767,0.000609,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,8.4e-05,0.000512,0.00026,0.000207,0.000392,0.000849,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [200]:
# Establish baseline model prediction

X, y = train_ngram_tdidf, train_scores['score']
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=rs)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.25, random_state=rs)

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_predict_train = model.predict(X_train)
y_predict_valid = model.predict(X_valid)
print('MSQE: training set')
print(mean_squared_error(y_train, y_predict_train, squared=False))
print('MSQE: validation set')
print(mean_squared_error(y_valid, y_predict_valid, squared=False))

MSQE: training set
0.4011591562826782
MSQE: validation set
0.7049318672765424


Next, evaluate gap info

In [193]:
def get_iki_info(logs_df):
    logs_df['prev_up_time'] = logs_df.groupby('id')['up_time'].shift(1).fillna(logs_df['down_time'])
    logs_df['inter_keystroke_intvl'] = (abs(logs_df['down_time'] - logs_df['prev_up_time'])) / 1000
    group = logs_df.groupby('id')['inter_keystroke_intvl']
    total_ikis = group.count()
   
    data =  pd.DataFrame({
        'id': logs_df['id'].unique(),
        'longest_iki': group.max(),
        'median_iki': group.median(),
        'mean_iki': group.mean(),
        'initial_pause': group.apply(lambda ikis: max(ikis.values[:10])),
        'start_time': logs_df.groupby('id')['down_time'].first()
    }).reset_index(drop=True)

    ms_thresholds = [100, 300, 500, 1000, 2000, 10000, 30000, 60000]
    for ms in ms_thresholds:
        def get_runs(ikis):
            runs = []
            curr_run = 0
            for i, val in enumerate(ikis.values[1:]):
                if val > ms / 1000 or i == len(ikis.values) - 2:
                    if curr_run >= 2:
                        runs.append(curr_run)
                    curr_run = 0
                else:
                    curr_run += 1
            return runs
        runs_series = group.apply(get_runs)
        # count total number of ikis in runs (not number of runs)
        data[f'{ms}ms_run_count'] = runs_series.apply(lambda runs: np.sum(runs)).values
        data[f'{ms}ms_long_run_count'] = runs_series.apply(lambda runs: np.sum([n for n in runs if n >= 10])).values
        # count longest run
        data[f'{ms}ms_max_run'] = runs_series.apply(lambda runs: max(runs)).values
        # count portion of ikis under threshold
        num_ikis = group.apply(lambda x: len([n for n in x.values if n < ms / 1000])).values
        data[f'portion_ikis_under_{ms}ms'] = num_ikis / total_ikis.values
    return data.drop('id', axis=1)
iki_info = get_iki_info(train_logs[['id', 'up_time', 'down_time']])
iki_info.head(3)


Unnamed: 0,longest_iki,median_iki,mean_iki,initial_pause,start_time,100ms_run_count,100ms_long_run_count,100ms_max_run,portion_ikis_under_100ms,300ms_run_count,...,10000ms_max_run,portion_ikis_under_10000ms,30000ms_run_count,30000ms_long_run_count,30000ms_max_run,portion_ikis_under_30000ms,60000ms_run_count,60000ms_long_run_count,60000ms_max_run,portion_ikis_under_60000ms
0,154.136,0.062,0.610944,101.609,4526,1618,245.0,24,0.686742,2065,...,389,0.991396,2549,2549,1338,0.998045,2552,2552,2500,0.999218
1,145.899,0.061,0.620108,1.696,30623,1603,640.0,61,0.689487,2047,...,770,0.988183,2443,2431,1459,0.99674,2449,2449,2321,0.998778
2,153.886,0.04,0.360506,16.736,4441,3333,2300.0,39,0.821809,3794,...,681,0.995164,4123,4123,766,0.99734,4130,4130,2528,0.999033


In [203]:
# Establish baseline model prediction

X, y = iki_info, train_scores['score']
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=rs)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.25, random_state=rs)

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_predict_train = model.predict(X_train)
y_predict_valid = model.predict(X_valid)
print('MSQE: training set')
print(mean_squared_error(y_train, y_predict_train, squared=False))
print('MSQE: validation set')
print(mean_squared_error(y_valid, y_predict_valid, squared=False))

MSQE: training set
0.5364813295373154
MSQE: validation set
0.7253544112320706


Next, extract specific fragments in the writing that may indicate either close attention to detail, complex grammar/punctuation/style or, alternatively, carelessness

In [243]:
import re

patterns = [
    # good patterns:
    r'[\.|\?|\!] {2}[^ ]', # two spaces after period
    r'[q|\'|\"]--[q|\'|\"]', # em-dash
    r'[q|\'|\"] -- [q|\'|\"]', # AP style guide em-dash
    r'[q|\'|\"]--[q|\'|\"][^\.]+[q|\'|\"]--[q|\'|\"][^\.]', # em-dash-separated sub-clause within sentence
    r'[q|\'|\"] -- [q|\'|\"][^\.]+[q|\'|\"] -- [q|\'|\"][^\.]',
    r'q-q', # hyphenated words
    r'q\'', # posessives or contraction
    r'\.\.\.', # elipses
    # bad patterns:
    r'[q|,|\'|\"|-]  +[q|,|\'|\"|-]', # two or more spaces in middle of sentence
    r'[ |\n][\.|\?|\!|,|;|:|\(|\)|\'][ |\n]', # punctuation surrounded by whitespace
    r' \n', # unnecessary trailing space before line break
    r'q\n' # word character immediately followed by linebreak
]

def extract_patterns(essays_df):
   
    group = essays_df.groupby('id')['essay']

    data =  pd.DataFrame({
        'id': essays_df['id'].unique(),
    }).reset_index(drop=True)
    def apply_pattern_to_group(pattern):
        def find_pattern(g):
            essay = g.values[0]
            m = re.findall(pattern, essay)
            return len(m)
        return find_pattern
    for i, pattern in enumerate(patterns):
        col = f'pattern_{i}'
        data[col] = group.apply(apply_pattern_to_group(pattern)).values
    return data.drop('id', axis=1)
pattern_info = extract_patterns(train_essays)
pattern_info.head(2)



Unnamed: 0,pattern_0,pattern_1,pattern_2,pattern_3,pattern_4,pattern_5,pattern_6,pattern_7,pattern_8,pattern_9,pattern_10,pattern_11
0,10,0,0,0,0,0,1,0,1,1,1,1
1,0,0,0,0,0,1,2,0,2,1,4,0


In [244]:
# Establish baseline model prediction

X, y = pattern_info, train_scores['score']
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=rs)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.25, random_state=rs)

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_predict_train = model.predict(X_train)
y_predict_valid = model.predict(X_valid)
print('MSQE: training set')
print(mean_squared_error(y_train, y_predict_train, squared=False))
print('MSQE: validation set')
print(mean_squared_error(y_valid, y_predict_valid, squared=False))

MSQE: training set
0.8011612792925569
MSQE: validation set
0.8202367315364079


In [277]:
def q1(x):
    return x.quantile(0.25)
def q3(x):
    return x.quantile(0.75)
def iqr(x):
    return x.quantile(0.75) - x.quantile(.25)

AGGREGATIONS = ['count', 'mean', 'std', 'min', 'max', 'first', 'last', 'sem', q1, 'median', q3, iqr, 'skew', pd.Series.kurtosis, 'sum']

def analyze_sentences(essay_df):
    sen_df = pd.DataFrame()
    sen_df['id'] = essay_df['id']
    sen_df['sentence'] = essay_df['essay'].apply(lambda x: re.split(r'\.|\?|\!|\n',str(x)))
    sen_df = sen_df.explode('sentence')
    sen_df['sentence'] = sen_df['sentence'].apply(lambda x: x.replace('\n','').strip())
    sen_df['snt_len'] = sen_df['sentence'].apply(lambda x: len(x))
    sen_df['snt_q_count'] = sen_df['sentence'].apply(lambda x: x.count('q'))
    sen_df = sen_df[sen_df['snt_q_count'] > 0]
    sen_df['snt_word_count'] = sen_df['sentence'].apply(lambda x: len(x.split(' ')))
    data =  data =  pd.DataFrame({
        'id': essay_df['id'].unique(),
    }).reset_index(drop=True)
    for cat in ['snt_len', 'snt_q_count', 'snt_word_count']:
        aggs = AGGREGATIONS if cat == 'snt_len' else AGGREGATIONS[1:]
        for agg in aggs:
            agg_name = str(agg)
            if agg_name[0] == '<':
                agg_name = agg_name.split(' ')[1]
            col = f'{cat}__{agg_name}'
            data[col] = sen_df.groupby('id')[cat].agg(agg).values
    return data.rename(columns={'snt_len__count':'sentence__count'}).drop('id', axis=1)

def analyze_paragraphs(essay_df):
    par_df = pd.DataFrame()
    par_df['id'] = essay_df['id']
    par_df['p'] = essay_df['essay'].apply(lambda x: str(x).split('\n'))
    par_df = par_df.explode('p')
    for ch in ['.', '(', ')', ',', '\'', '"', ':', ';', '?', '!', ' -', '--']:
        par_df['p'] = par_df['p'].apply(lambda x: x.replace(ch,'').strip())
    par_df['par_len'] = par_df['p'].apply(lambda x: len(x))
    par_df['par_q_count'] = par_df['p'].apply(lambda x: x.count('q'))
    par_df = par_df[par_df['par_q_count'] > 0]
    par_df['par_word_count'] = par_df['p'].apply(lambda x: len(x.split(' ')))
    data =  data =  pd.DataFrame({
        'id': essay_df['id'].unique(),
    }).reset_index(drop=True)
    for cat in ['par_len', 'par_q_count', 'par_word_count']:
        aggs = AGGREGATIONS if cat == 'par_len' else AGGREGATIONS[1:]
        for agg in aggs:
            agg_name = str(agg)
            if agg_name[0] == '<':
                agg_name = agg_name.split(' ')[1]
            col = f'{cat}__{agg_name}'
            data[col] = par_df.groupby('id')[cat].agg(agg).values
    return data.rename(columns={'par_len__count':'paragraph__count'}).drop('id', axis=1)

def analyze_words(essay_df):
    word_df = pd.DataFrame()
    word_df['id'] = essay_df['id']
    word_df['word'] = essay_df['essay'].apply(lambda x: str(x).split())
    word_df = word_df.explode('word')
    word_df['word_q_count'] = word_df['word'].apply(lambda x: x.count('q'))
    word_df = word_df[word_df['word_q_count'] > 0]
    data =  data =  pd.DataFrame({
        'id': essay_df['id'].unique(),
    }).reset_index(drop=True)
    for agg in AGGREGATIONS:
        agg_name = str(agg)
        if agg_name[0] == '<':
            agg_name = agg_name.split(' ')[1]
        col = f'word_q_count__{agg_name}'
        data[col] = word_df.groupby('id')['word_q_count'].agg(agg).values
    return data.rename(columns={'word_q_count__count':'word__count'}).drop('id', axis=1)

agg_info = pd.concat([
    analyze_sentences(train_essays), 
    analyze_paragraphs(train_essays), 
    analyze_words(train_essays)
    ], axis=1).fillna(0)
agg_info.shape

(2471, 101)

In [292]:
# Establish baseline model prediction

X, y = agg_info, train_scores['score']
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=rs)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.25, random_state=rs)

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_predict_train = model.predict(X_train)
y_predict_valid = model.predict(X_valid)
print('MSQE: training set')
print(mean_squared_error(y_train, y_predict_train, squared=False))
print('MSQE: validation set')
print(mean_squared_error(y_valid, y_predict_valid, squared=False))

MSQE: training set
0.45476942843953505
MSQE: validation set
0.6516732347351654


Get count data for down events, text changes and activities

In [360]:
down_events = train_logs.groupby('down_event')['event_id'].agg('count').sort_values(ascending=False).index[:50]
voc = [tc for tc in train_logs.text_change.unique() if 'q' not in tc and '=>' not in tc and tc not in down_events]
text_changes = train_logs[train_logs.text_change.isin(voc)].groupby('text_change')['event_id'].agg('count').sort_values(ascending=False).index[:4]
activities = train_logs.groupby('activity')['event_id'].agg('count').sort_values(ascending=False).index[:5]
activities, down_events, text_changes

(Index(['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste'], dtype='object', name='activity'),
 Index(['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick',
        'ArrowLeft', '.', ',', 'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', ''',
        'Delete', 'Unidentified', 'Control', '"', '-', '?', ';', '=', 'Tab',
        '/', 'Rightclick', ':', '(', ')', '\', 'ContextMenu', 'End', '!',
        'Meta', 'Alt', '[', 'c', 'v', 'NumLock', 'Insert', 'Home', 'z',
        'AudioVolumeDown', 'F2', 'a', 'x', 'AudioVolumeUp', '$', '>', ']', '*'],
       dtype='object', name='down_event'),
 Index([' ', 'NoChange', '\n', '. '], dtype='object', name='text_change'))

In [368]:
def get_counts(logs_df):
    data = pd.DataFrame()
    data['id'] = logs_df['id'].unique().tolist()
    g_total_events = logs_df.groupby('id')['event_id'].agg('count').values
    for event in down_events:
        d = logs_df.groupby('id')['down_event'].value_counts().unstack(fill_value=0)
        if event in d.columns:
            g = d.loc[:,event].tolist()
            data[f'down_event_{event}_count'] = g / g_total_events
    for text_change in text_changes:
        text_change = text_change if text_change != '. ' else '.'
        d = logs_df.groupby('id')['text_change'].value_counts().unstack(fill_value=0)
        if text_change in d.columns:
            g = d.loc[:,text_change].tolist()
            data[f'text_change_{text_change}_count'] = g / g_total_events
    for activity in activities:
        d = logs_df.groupby('id')['activity'].value_counts().unstack(fill_value=0)
        if activity in d.columns:
            g = d.loc[:,activity].tolist()
            data[f'activity_{activity}_count'] = g / g_total_events
    return data.drop('id', axis=1)
other_counts_info = get_counts(train_logs)

In [369]:
# Establish baseline model prediction

X, y = other_counts_info, train_scores['score']
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=rs)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.25, random_state=rs)

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_predict_train = model.predict(X_train)
y_predict_valid = model.predict(X_valid)
print('MSQE: training set')
print(mean_squared_error(y_train, y_predict_train, squared=False))
print('MSQE: validation set')
print(mean_squared_error(y_valid, y_predict_valid, squared=False))

MSQE: training set
0.5727571744193612
MSQE: validation set
0.7224463327320648


Now examine changes in word count, time and cursor position after different numbers of events

In [386]:
gaps = [2, 5, 10, 20, 50, 100, 200, 500]

def q1(x):
    return x.quantile(0.25)
def q3(x):
    return x.quantile(0.75)
def iqr(x):
    return x.quantile(0.75) - x.quantile(.25)

AGGREGATIONS = ['mean', 'std', 'min', 'max', 'first', 'last', 'sem', q1, 'median', q3, iqr, 'skew', pd.Series.kurtosis, 'sum']

metrics = ['down_time', 'cursor_position', 'word_count']

def get_gaps_info(logs_df):
    data = pd.DataFrame()
    data['id'] = logs_df['id'].unique().tolist()
    temp = logs_df.copy()
    for metric in metrics:
        for gap in gaps:
            temp[f'gap_{gap}_{metric}'] = temp[metric] - temp.groupby('id')[metric].shift(gap)
            for agg in AGGREGATIONS:
                agg_name = str(agg)
                if agg_name[0] == '<':
                    agg_name = agg_name.split(' ')[1]
                data[f'gap_{gap}_{metric}__{agg_name}'] = temp.groupby('id')[f'gap_{gap}_{metric}'].agg(agg).values
    return data.drop('id', axis=1)
gap_info = get_gaps_info(train_logs)
gap_info = gap_info.fillna(0)
gap_info.head(3)

Unnamed: 0,gap_2_down_time__mean,gap_2_down_time__std,gap_2_down_time__min,gap_2_down_time__max,gap_2_down_time__first,gap_2_down_time__last,gap_2_down_time__sem,gap_2_down_time__q1,gap_2_down_time__median,gap_2_down_time__q3,...,gap_500_word_count__first,gap_500_word_count__last,gap_500_word_count__sem,gap_500_word_count__q1,gap_500_word_count__median,gap_500_word_count__q3,gap_500_word_count__iqr,gap_500_word_count__skew,gap_500_word_count__Series.kurt,gap_500_word_count__sum
0,1399.497065,6198.512184,46.0,162229.0,102045.0,19815.0,122.628667,250.0,344.0,674.5,...,42.0,46.0,0.214883,42.0,54.0,60.0,18.0,-0.246747,-1.263949,106495.0
1,1433.109706,7173.079166,53.0,155377.0,2083.0,115062.0,144.858969,256.0,344.0,616.0,...,82.0,36.0,0.287947,62.0,65.0,83.0,21.0,0.137884,-0.62187,135714.0
2,850.31882,5817.295774,67.0,154879.0,16894.0,1602.0,90.476524,161.0,196.0,284.0,...,48.0,62.0,0.282312,42.0,52.0,61.0,19.0,-1.263675,0.76293,175993.0


In [387]:
# Establish baseline model prediction

X, y = gap_info, train_scores['score']
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=rs)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.25, random_state=rs)

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_predict_train = model.predict(X_train)
y_predict_valid = model.predict(X_valid)
print('MSQE: training set')
print(mean_squared_error(y_train, y_predict_train, squared=False))
print('MSQE: validation set')
print(mean_squared_error(y_valid, y_predict_valid, squared=False))

MSQE: training set
0.44476798525890426
MSQE: validation set
0.6864302451195533


In [404]:
feature_dfs = [
    ('pattern info', pattern_info),
    ('n-gram tdidf', train_ngram_tdidf),
    ('iki info', iki_info),
    ('agg ingo', agg_info),
    ('other counts info', other_counts_info),
    ('gap info', gap_info)
]
feature_importance_dfs = []
for name, df in feature_dfs:
    models = []
    train_msqe_scores = []
    valid_msqe_scores = []
    for rs in [1, 2, 3, 4, 5]:
        X, y = df, train_scores['score']
        X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=rs)
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.25, random_state=rs)
        model = GradientBoostingRegressor()
        model.fit(X_train, y_train)
        y_predict_train = model.predict(X_train)
        y_predict_valid = model.predict(X_valid)
        models.append(model)
        train_msqe_scores.append(mean_squared_error(y_train, y_predict_train, squared=False))
        valid_msqe_scores.append(mean_squared_error(y_valid, y_predict_valid, squared=False))
    print(f'{name} -- train score: {round(np.mean(train_msqe_scores), 3)}, valid score: {round(np.mean(valid_msqe_scores), 3)}')
    feature_importances_values = np.asarray([model.feature_importances_ for model in models]).mean(axis=0)
    feature_importance_df = pd.DataFrame({'name': df.columns, 'importance': feature_importances_values})
    feature_importance_dfs.append((name, feature_importance_df.sort_values('importance', ascending=False)))

pattern info -- train score: 0.792, valid score: 0.856
n-gram tdidf -- train score: 0.405, valid score: 0.693
iki info -- train score: 0.536, valid score: 0.714
agg ingo -- train score: 0.459, valid score: 0.656
other counts info -- train score: 0.568, valid score: 0.745
gap info -- train score: 0.439, valid score: 0.671


pattern info: 10
n-gram tdidf: 37
iki info: 27
agg info: 43
other counts info: 20
gap info: 37

In [440]:
feature_numbers = [10, 37, 27, 43, 20, 37]
train_feats = pd.DataFrame()
for i in range(len(feature_numbers)):
    cols = feature_importance_dfs[i][1]['name'][:feature_numbers[i]]
    for col in cols:
        train_feats[col] = feature_dfs[i][1][col]
train_feats.shape

(2471, 174)

In [470]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import f_regression

to_drop = [
    'snt_q_count__sum',
    'par_q_count__sum',
    'par_len__sum',
    'gap_200_word_count__sum',
    'gap_100_word_count__sum',
    'gap_50_word_count__sum',
    'gap_2_word_count__sum',
    'word__count',
    'gap_20_word_count__sum',
    'gap_10_word_count__sum',
    'gap_5_word_count__sum',
    '1000ms_long_run_count',
    '300ms_long_run_count',
    '500ms_run_count',
    '1000ms_run_count',
    '100ms_run_count',
    'portion_ikis_under_300ms',
    'portion_ikis_under_1000ms',
    'portion_ikis_under_100ms',
    'portion_ikis_under_2000ms',
    'portion_ikis_under_10000ms',
    'portion_ikis_under_60000ms',
    'par_len__mean',
    'snt_len__sum',
    'gap_2_down_time__iqr',
    'gap_2_down_time__q1',
    'par_word_count__mean',
    'par_q_count__q1',
    'text_change_._count'

]
tf2 = train_feats.copy()
tf2 = tf2.drop(to_drop, axis=1)
data = pd.DataFrame()
data['features'] = tf2.columns
data['VIF'] = [variance_inflation_factor(tf2, i).round(1) for i in range(tf2.shape[1])]
f_scores, p_scores = f_regression(tf2, train_scores['score'])
data['Fscores'] = f_scores.round(0)
data['Pscores'] = p_scores.round(5)
to_drop = data.sort_values(by='Fscores', ascending=False)['features'][100:]
tf3 = tf2.drop(to_drop, axis=1)


In [472]:
# Establish baseline prediction model

X, y = tf3, train_scores['score']
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=rs)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.25, random_state=rs)

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_predict_train = model.predict(X_train)
y_predict_valid = model.predict(X_valid)
print('MSQE: training set')
print(mean_squared_error(y_train, y_predict_train, squared=False))
print('MSQE: validation set')
print(mean_squared_error(y_valid, y_predict_valid, squared=False))

MSQE: training set
0.42343974696801734
MSQE: validation set
0.6270746487347747


In [492]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

X, y = tf3.rename(columns={x:y for x,y in zip(tf3.columns,range(0,len(tf3.columns)))}), train_scores['score']
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=rs)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.25, random_state=rs)

params = {
    'application': 'binary', # for binary classification
    'boosting': 'gbdt', # traditional gradient boosting decision tree
    'num_iterations': 100, 
    'learning_rate': 0.05,
    'num_leaves': 62,
    'max_depth': -1, # <0 means no limit
    'max_bin': 510, # Small number of bins may reduce training accuracy but can deal with over-fitting
    'lambda_l1': 5, # L1 regularization
    'lambda_l2': 10, # L2 regularization
    'metric' : 'binary_error',
    'subsample_for_bin': 200, # number of samples for constructing bins
    'subsample': 1, # subsample ratio of the training instance
    'colsample_bytree': 0.8, # subsample ratio of columns when constructing the tree
    'min_split_gain': 0.5, # minimum loss reduction required to make further partition on a leaf node of the tree
    'min_child_weight': 1, # minimum sum of instance weight (hessian) needed in a leaf
    'min_child_samples': 5# minimum number of data needed in a leaf
}
model = GradientBoostingRegressor()
grid = {}
grid['n_estimators'] = [10, 50, 100, 500]
grid['learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 1.0]
grid['subsample'] = [0.5, 0.7, 1.0]
grid['max_depth'] = [3, 7, 9]
gs = GridSearchCV(estimator=model, 
                           param_grid=grid,
                           scoring='neg_root_mean_squared_error',
                           verbose=2,
                           cv=5)
gs.fit(X_train, y_train)
results = gs.cv_results_
print('Best Score:', -gs.best_score_, 'Best Params:', gs.best_params_)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.5; total time=   0.3s
[CV] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.5; total time=   0.2s
[CV] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.5; total time=   0.2s
[CV] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.5; total time=   0.3s
[CV] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.5; total time=   0.6s
[CV] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.7; total time=   0.3s
[CV] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.7; total time=   0.7s
[CV] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.7; total time=   0.4s
[CV] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.7; total time=   0.3s
[CV] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.7; total 

In [499]:
# Establish baseline prediction model

X, y = tf3.rename(columns={x:y for x,y in zip(tf3.columns,range(0,len(tf3.columns)))}), train_scores['score']
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=rs)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.25, random_state=rs)
params = {
    'n_estimators': 1000,
    'learning_rate': .01,
    'max_depth' : 3,
    'subsample' : .5}
model = GradientBoostingRegressor(**params)
model.fit(X_train, y_train)
y_predict_train = model.predict(X_train)
y_predict_valid = model.predict(X_valid)
print('MSQE: training set')
print(mean_squared_error(y_train, y_predict_train, squared=False))
print('MSQE: validation set')
print(mean_squared_error(y_valid, y_predict_valid, squared=False))

MSQE: training set
0.4130375106429742
MSQE: validation set
0.6108567957677844
