# The Key to Life

In [95]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Fraction of each person's keylog rows held out as the test split
# (passed to split_data as its test_size argument).
TEST_SIZE = 0.25

In [87]:
# Load the cleaned keylog CSVs, one file per person. Paths are relative to the
# notebook's working directory.
# NOTE(review): `df` holds Brian's log (it becomes bf_train / bf_test further
# down); a more descriptive name would make the pairing with `el` clearer.
df = pd.read_csv("brian_keylogs_clean.csv")
el = pd.read_csv("eleanor_keylogs_clean.csv")

In [88]:
def featurize_that_bish(df, top_pairs=None):
    """Reduce raw keystroke rows to one median ``delta`` per key pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``pair`` column (grouping key) and a ``delta`` column.
    top_pairs : collection, optional
        When truthy, only pairs contained in it are kept, and any remaining
        row with a missing value is dropped. ``None`` or an empty collection
        disables the filter.

    Returns
    -------
    pandas.DataFrame
        A transposed frame with two rows, ``pair`` and ``delta_avg`` (the
        latter holds the per-pair *median*, despite the name). Column labels
        are integer positions assigned in ascending ``pair`` order, and the
        columns themselves are laid out in descending ``pair`` order.

    NOTE(review): column labels are positional, not pair names, so two frames
    built from data with different sets of surviving pairs will not line up
    column-for-column -- confirm callers always see the same pairs.
    """
    medians = df.groupby("pair")["delta"].median().reset_index()

    if top_pairs:
        wanted = medians["pair"].isin(top_pairs)
        medians = medians[wanted].dropna()

    medians.columns = ["pair", "delta_avg"]
    # Renumber rows 0..n-1 so the labels reflect position after filtering.
    medians = medians.reset_index(drop=True)
    ordered = medians.sort_values("pair", ascending=False)
    return ordered.T

def split_data(data, test_size = 0.33, top_pairs=None, random_state=None):
    """Randomly split raw keylog rows and featurize each part.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw rows accepted by ``featurize_that_bish``.
    test_size : float or int, optional
        Forwarded to ``train_test_split`` as ``test_size``.
    top_pairs : collection, optional
        Forwarded to ``featurize_that_bish`` for both parts.
    random_state : int, RandomState instance or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int to make the split (and so
        every downstream result) reproducible across kernel restarts.

    Returns
    -------
    tuple
        ``(train_features, test_features)``, each the result of
        ``featurize_that_bish`` on the corresponding part.
    """
    # train_test_split shuffles by default, so a separate shuffle() pass
    # beforehand adds nothing and would need its own seed to be reproducible.
    train, test = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    return featurize_that_bish(train, top_pairs), featurize_that_bish(test, top_pairs)

def combine_data(a, b):
    """Return a new frame with the columns of ``b`` placed after those of ``a``.

    Parameters
    ----------
    a : DataFrame-like
        Anything accepted by ``pd.DataFrame``.
    b : pandas.DataFrame, pandas.Series or DataFrame-like
        Non-pandas inputs are wrapped with ``pd.DataFrame`` first.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation aligned on the row index. Neither input is
        modified.
    """
    left = pd.DataFrame(a)
    right = b if isinstance(b, (pd.DataFrame, pd.Series)) else pd.DataFrame(b)
    # DataFrame.append never accepted an `axis` argument and was not in-place,
    # so the previous implementation could not combine anything; concat with
    # axis=1 is the supported way to join frames side by side.
    return pd.concat([left, right], axis=1)

def create_regression_parameters(train_list, test_list, labels):
    """Stack per-class feature frames into matrices for a classifier.

    Parameters
    ----------
    train_list : sequence of pandas.DataFrame
        Training feature frames, stacked row-wise in the given order.
    test_list : sequence of pandas.DataFrame
        Test feature frames, stacked row-wise in the given order.
    labels : sequence
        Target values; returned unchanged as ``y``.

    Returns
    -------
    tuple
        ``(train, test, y)`` where ``train`` and ``test`` are ``numpy.matrix``
        objects (so indexing a single row, e.g. ``test[0]``, stays 2-D) and
        ``y`` is ``labels``.

    Raises
    ------
    ValueError
        If ``train_list`` or ``test_list`` is empty.
    """
    if len(train_list) == 0 or len(test_list) == 0:
        raise ValueError(
            "train_list and test_list must each contain at least one frame"
        )
    y = labels
    # One concat per list replaces repeated DataFrame.append calls, which were
    # removed in pandas 2.0 and re-copied the accumulated frame every step.
    train = pd.concat(list(train_list))
    test = pd.concat(list(test_list))
    return np.asmatrix(train), np.asmatrix(test), y

In [89]:
# The 53 key pairs used as features. Each string is matched against the
# `pair` column inside featurize_that_bish, so it must use the same
# "'x','y'" spelling as the CSV data.
top_pairs = set(["'t','h'", "'a','r'", "'h','e'", "'t','e'", "'a','n'", "'s','e'", "'i','n'", "'m','e'", "'e','r'", "'s','a'", "'n','d'", "'n','e'", "'r','e'", "'w','a'", "'e','d'", "'v','e'", "'e','s'", "'l','e'", "'o','u'", "'n','o'", "'t','o'", "'t','a'", "'h','a'", "'a','l'", "'e','n'", "'d','e'", "'e','a'", "'o','t'", "'s','t'", "'s','o'", "'n','t'", "'d','t'", "'o','n'", "'l','l'", "'a','t'", "'t','t'", "'h','i'", "'e','l'", "'a','s'", "'r','o'", "'i','t'", "'a','d'", "'n','g'", "'d','i'", "'i','s'", "'e','w'", "'o','r'", "'r','a'", "'e','t'", "'r','i'", "'o','f'", "'s','h'", "'t','i'" ])
# A set prints its elements in arbitrary order, so this output is only a
# membership check, not a stable listing.
print(top_pairs)
# Split each person's rows into train/test parts and featurize them. No seed
# is passed, so the split (and everything computed from it) varies per run.
bf_train, bf_test = split_data(df, TEST_SIZE, top_pairs=top_pairs)
el_train, el_test = split_data(el, TEST_SIZE, top_pairs=top_pairs)

{"'s','h'", "'t','e'", "'e','a'", "'o','f'", "'n','e'", "'d','t'", "'e','t'", "'s','e'", "'i','s'", "'w','a'", "'e','n'", "'e','r'", "'e','w'", "'h','a'", "'e','s'", "'t','t'", "'a','t'", "'n','d'", "'s','o'", "'a','d'", "'o','t'", "'h','i'", "'o','r'", "'h','e'", "'i','n'", "'o','n'", "'a','s'", "'r','a'", "'m','e'", "'n','o'", "'a','l'", "'l','l'", "'d','i'", "'s','t'", "'l','e'", "'t','o'", "'e','d'", "'t','a'", "'e','l'", "'t','i'", "'t','h'", "'n','t'", "'d','e'", "'a','r'", "'r','i'", "'a','n'", "'v','e'", "'o','u'", "'r','e'", "'s','a'", "'r','o'", "'n','g'", "'i','t'"}


In [90]:
# Class labels, listed in the same order as the per-person frames below
# (El first, then Brian).
labels = ["El", "Brian"]
# Each featurized frame is transposed with rows "pair" and "delta_avg";
# .iloc[1:] skips the "pair" names row and keeps only the "delta_avg" row,
# giving one feature row per person.
train_list = [el_train.iloc[1:], bf_train.iloc[1:]]
test_list = [el_test.iloc[1:], bf_test.iloc[1:]]
train, test, y = create_regression_parameters(train_list, test_list, labels)
# Inspect El's training feature row.
print(train_list[0])

                 52         51        50         49         48        47  \
delta_avg  0.114231  0.0893581  0.134956  0.0680568  0.0761049  0.076069   

                  46        45        44         43    ...            9   \
delta_avg  0.0997782  0.108133  0.193713  0.0957849    ...     0.0820171   

                8         7         6         5         4         3   \
delta_avg  1.81702  0.102747  0.166895  0.145564  0.126817  0.133751   

                 2         1         0   
delta_avg  0.085066  0.110223  0.156972  

[1 rows x 53 columns]


In [96]:
# Fit a default-configured logistic regression on the stacked training matrix
# (one row per person) and the matching labels.
log = LogisticRegression()
log.fit(train, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [98]:
# test[0] is the first row of the stacked test matrix, i.e. the features built
# from el_test (test_list puts El first). Only this one row is classified.
log.predict(test[0])

array(['Brian'], 
      dtype='<U5')