# The Key to Life

In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Fraction of each typist's key-log rows held out for the test split; passed
# as `test_size` to split_data(), which forwards it to train_test_split().
TEST_SIZE = 0.33

In [2]:
# Load the cleaned key-log exports, one CSV per typist (Brian first, then Eleanor).
df, el = (
    pd.read_csv(csv_path)
    for csv_path in ("brian_keylogs_clean.csv", "eleanor_keylogs_clean.csv")
)

In [74]:
def featurize_that_bish(df, top_pairs=None):
    """Collapse raw key-pair timings into one median-per-pair feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"pair"`` column (the grouping key) and a ``"delta"``
        column (the value that is aggregated).
    top_pairs : sized collection of str, optional
        When given and non-empty, only pairs contained in it are kept and
        rows whose median is NaN are dropped. ``None`` or an empty collection
        keeps every pair. May be a set, list, tuple, NumPy array or pandas
        Series.

    Returns
    -------
    pandas.DataFrame
        A two-row frame: row ``"pair"`` holds the pair names in descending
        order and row ``"delta_avg"`` holds the median ``delta`` of each pair
        (the label says "avg" but the statistic is the median). Column labels
        are positional integers, not pair names, so two results only line up
        column-for-column when they contain the same set of pairs.
    """
    data = df.groupby("pair")["delta"].median().reset_index()
    # A bare ``if top_pairs:`` raises ValueError for NumPy arrays and pandas
    # Series ("truth value is ambiguous"), so test presence and size explicitly.
    if top_pairs is not None and len(top_pairs) > 0:
        # Boolean indexing + dropna keeps exactly the rows that
        # ``where(mask).dropna()`` would keep, without first blanking the
        # unwanted rows to NaN.
        data = data[data["pair"].isin(top_pairs)].dropna()
    data.columns = ["pair", "delta_avg"]
    # Renumber 0..n-1 *before* sorting: after the transpose the column labels
    # are each pair's position in ascending-pair order.
    data = data.reset_index(drop=True)
    data = data.sort_values("pair", ascending=False)
    return data.transpose()

def split_data(data, test_size = 0.33, top_pairs=None, random_state=None):
    """Randomly split raw key-log rows and featurize each part.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw rows; forwarded to ``featurize_that_bish`` after splitting, so it
        needs the ``"pair"`` and ``"delta"`` columns that function reads.
    test_size : float, default 0.33
        Forwarded to ``train_test_split`` as ``test_size``.
    top_pairs : collection of str, optional
        Forwarded to ``featurize_that_bish`` to restrict the kept pairs.
    random_state : int or numpy.random.RandomState, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps
        the previous unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_features, test_features)``, each as returned by
        ``featurize_that_bish``.
    """
    # train_test_split shuffles by default, so a separate shuffle() pass
    # beforehand added nothing and could not be seeded consistently.
    train, test = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    return featurize_that_bish(train, top_pairs), featurize_that_bish(test, top_pairs)

def combine_data(a, b):
    """Return a new frame with the columns of ``a`` followed by those of ``b``.

    The previous implementation called ``DataFrame.append(b, axis=1)``, which
    never worked: ``append`` has no ``axis`` argument, it returns a new object
    instead of modifying in place, and it was removed in pandas 2.0.

    Parameters
    ----------
    a, b : pandas.DataFrame or anything ``pandas.DataFrame`` can wrap
        Inputs to place side by side; rows are aligned on their index.

    Returns
    -------
    pandas.DataFrame
        A new frame; neither input is modified.
    """
    return pd.concat([pd.DataFrame(a), pd.DataFrame(b)], axis=1)

def create_regression_parameters(train_list, test_list, labels):
    """Stack per-class feature frames into train and test matrices.

    Parameters
    ----------
    train_list, test_list : sequence of pandas.DataFrame
        One frame per class, in the same order as ``labels``. Only the first
        ``len(train_list)`` entries of ``test_list`` are used.
    labels : sequence
        Target values; returned unchanged as ``y``.

    Returns
    -------
    tuple
        ``(train, test, y)`` where ``train`` and ``test`` are ``numpy.matrix``
        objects made by stacking the frames row-wise in list order.

    Raises
    ------
    IndexError
        If ``train_list`` is empty or ``test_list`` has fewer entries than
        ``train_list``.
    """
    n_groups = len(train_list)
    if n_groups == 0 or len(test_list) < n_groups:
        raise IndexError(
            "train_list must be non-empty and test_list must provide at "
            "least as many frames as train_list"
        )
    # DataFrame.append was removed in pandas 2.0 and re-copied the growing
    # frame on every iteration; one concat per side stacks the same rows.
    train = pd.concat(list(train_list))
    test = pd.concat(list(test_list[:n_groups]))
    # np.matrix is kept on purpose: indexing one row (e.g. ``test[0]``) stays
    # 2-D, which the notebook relies on when calling predict_proba.
    return np.asmatrix(train), np.asmatrix(test), labels

In [75]:
# Key pairs kept as features; each string is a "'first','second'" key pair
# matched against the "pair" column of the key-log frames.
top_pairs = {
    "'t','h'", "'a','r'", "'h','e'", "'t','e'", "'a','n'", "'s','e'",
    "'i','n'", "'m','e'", "'e','r'", "'s','a'", "'n','d'", "'n','e'",
    "'r','e'", "'w','a'", "'e','d'", "'v','e'", "'e','s'", "'l','e'",
    "'o','u'", "'n','o'", "'t','o'", "'t','a'", "'h','a'", "'a','l'",
    "'e','n'", "'d','e'", "'e','a'", "'o','t'", "'s','t'", "'s','o'",
    "'n','t'", "'d','t'", "'o','n'", "'l','l'", "'a','t'", "'t','t'",
    "'h','i'", "'e','l'", "'a','s'", "'r','o'", "'i','t'", "'a','d'",
    "'n','g'", "'d','i'", "'i','s'", "'e','w'", "'o','r'", "'r','a'",
    "'e','t'", "'r','i'", "'o','f'", "'s','h'", "'t','i'",
}
print(top_pairs)

# Split and featurize each typist separately, Brian first and then Eleanor.
bf_train, bf_test = split_data(df, test_size=TEST_SIZE, top_pairs=top_pairs)
el_train, el_test = split_data(el, test_size=TEST_SIZE, top_pairs=top_pairs)

{"Key.space,'f'", "'e','a'", "'o','f'", "'n','e'", "'h',Key.space", "'l','a'", "'s','e'", "'d','d'", "'l',Key.space", "'e','n'", "'c','t'", "'e','s'", 'Key.space,Key.backspace', "'.',Key.space", "'o','r'", "'l','i'", "Key.space,'s'", "'t','o'", "Key.shift,'('", "'c','h'", "'n','t'", "'p','r'", "Key.space,'h'", "Key.space,'o'", "'v','e'", 'Key.left,Key.left', 'Key.right,Key.right', "'y',Key.space", "'c','a'", "'e','t'", "'i','o'", "'e','r'", "'a','t'", "'n','d'", "'c','o'", "Key.space,'m'", "'g',Key.space", "'h','i'", "Key.space,'r'", "'h','e'", "'r','a'", 'Key.backspace,Key.shift', "Key.space,'d'", "'m','e'", "'e','c'", "'a','l'", "'e','d'", "'t','i'", "Key.space,'t'", "'d','e'", "'o','u'", "'r','e'", "'r','o'", "'m','a'", "'r',Key.space", "'n','g'", "'i','t'", "Key.space,'b'", "'s',Key.space", "'e','e'", "Key.space,'i'", "'f',Key.space", "'d',Key.space", "'t',Key.space", "'o','n'", "'a','s'", "Key.space,'l'", "'l','l'", "Key.space,'w'", "'d','i'", "'l','e'", "Key.space,'c'", "'s',Key.

In [76]:
# One label per typist, in the same order as the frames stacked below.
labels = ["El", "Brian"]

# Each featurized frame has a "pair" row followed by a "delta_avg" row;
# .iloc[1:] drops the pair names and keeps only the timing row.
train_list = [features.iloc[1:] for features in (el_train, bf_train)]
test_list = [features.iloc[1:] for features in (el_test, bf_test)]

train, test, y = create_regression_parameters(train_list, test_list, labels)

print(train_list[0])

                 99        98        97        96        95        94  \
delta_avg  0.682778  0.571518  0.130982  0.127899  0.124174  0.154627   

                 93        92        91        90    ...           9   \
delta_avg  0.201946  0.139849  0.164586  0.171888    ...     0.096216   

                  8         7        6         5         4         3   \
delta_avg  0.0916786  0.114861  0.14311  0.136896  0.134102  0.083998   

                 2         1         0   
delta_avg  0.105743  0.171842  0.270184  

[1 rows x 100 columns]


In [77]:
# Fit a default-configured logistic-regression classifier on the stacked
# training matrix, with `y` as the class labels.
# NOTE(review): `train` is stacked from one aggregated feature row per typist,
# so the model is fitted on two samples only — confirm this is intended.
log = LogisticRegression()
log.fit(train, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [81]:
# Class probabilities for the first test row, which comes from el_test because
# test_list is ordered [el_test, bf_test]. `test` is an np.matrix, so `test[0]`
# stays 2-D (1 x n_features). Output columns follow `log.classes_` order.
log.predict_proba(test[0])

array([[ 0.49408496,  0.50591504]])