# The Key to Life

## Imports and Global Variables

In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

# Fraction of each person's key log rows held out for the test split.
TEST_SIZE = 0.33

# One label per person; LABELS[i] is the person whose log is FILES[i].
LABELS = [
    "Brian",
    "Eleanor",
]
FILES = [
    "brian_keylogs_clean.csv",
    "eleanor_keylogs_clean.csv",
]

## Featurization and Train/Test Split (DO NOT TOUCH)

In [4]:
def featurize_that_bish(df, top_pairs=None):
    """Average ``delta`` per key pair, returned as a wide (transposed) frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``pair`` column (grouping key) and a ``delta`` column.
    top_pairs : collection of pair values, optional
        When given (and non-empty), the result has exactly one column per
        requested pair. Pairs that do not occur in ``df`` get a NaN
        ``delta_avg`` instead of being dropped, so the integer column labels
        refer to the same pair for every frame featurized with the same
        ``top_pairs``.

    Returns
    -------
    pandas.DataFrame
        Two rows, ``pair`` and ``delta_avg``. Columns are ordered by pair,
        descending; each column label is the pair's position in ascending
        pair order.
    """
    delta_avg = df.groupby("pair")["delta"].mean()
    if top_pairs:
        # Reindex instead of filtering: filtering silently drops pairs that are
        # absent from this frame, which shifts the positional labels of every
        # later pair and misaligns features when frames are stacked by label.
        delta_avg = delta_avg.reindex(sorted(top_pairs))
    data = delta_avg.reset_index()
    data.columns = ["pair", "delta_avg"]
    data = data.sort_values("pair", ascending=False)
    return data.transpose()

def combine_data(a, b):
    """Return ``a`` and ``b`` joined side by side (column-wise).

    The previous implementation called ``DataFrame.append(b, axis=1)``, which
    is not a valid signature and discarded the result; column-wise
    concatenation is what ``axis=1`` indicates was intended.

    Parameters
    ----------
    a, b : anything accepted by ``pandas.DataFrame(...)``

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``a`` followed by the columns of
        ``b``, rows aligned on the index. Neither input is modified.
    """
    return pd.concat([pd.DataFrame(a), pd.DataFrame(b)], axis=1)

def create_matrix(lst):
    """Stack a sequence of DataFrames row-wise and return them as a matrix.

    Frames are aligned on their column labels; a label missing from a frame
    yields NaN in that frame's rows.

    Parameters
    ----------
    lst : iterable of pandas.DataFrame

    Returns
    -------
    numpy.matrix

    Raises
    ------
    ValueError
        If ``lst`` is empty (raised by ``pandas.concat``).
    """
    # A single concat replaces repeated DataFrame.append calls, which were
    # removed in pandas 2.0 and re-copied the accumulated frame every step.
    stacked = pd.concat(list(lst))
    return np.asmatrix(stacked)

def get_top_pairs(frames=None, n_pairs=100):
    """Return the most frequently observed key pairs as a set.

    Only the first frame is used to rank pairs, by the number of non-null
    ``delta`` values recorded for each pair.

    Parameters
    ----------
    frames : sequence of pandas.DataFrame, optional
        Key log frames with ``pair`` and ``delta`` columns. Defaults to the
        notebook-level ``dataframes`` so existing zero-argument calls keep
        working.
    n_pairs : int, default 100
        How many of the most frequent pairs to keep.

    Returns
    -------
    set
        Up to ``n_pairs`` pair values.
    """
    if frames is None:
        frames = dataframes
    counts = frames[0].groupby("pair").count()
    ranked = counts.sort_values("delta", ascending=False).reset_index()
    return set(ranked[:n_pairs]["pair"])

def get_train_test(dataframes, random_state=None):
    """Split every key log into train/test parts and featurize each part.

    Parameters
    ----------
    dataframes : sequence of pandas.DataFrame
        One key log per person, each with ``pair`` and ``delta`` columns.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``; pass an int for a reproducible
        split. The same value is used for every frame.

    Returns
    -------
    (numpy.matrix, numpy.matrix)
        ``train`` and ``test``, each with one row per input frame (in input
        order) holding that frame's average delta per pair; missing values
        are filled with 0.

    Notes
    -----
    Rows are stacked on the integer column labels produced by
    ``featurize_that_bish``, so those labels must denote the same pair in
    every featurized frame for the columns to be comparable.
    """
    # Rank pairs on the frames actually passed in, not on a notebook global.
    top_pairs = get_top_pairs(dataframes)
    splits = [
        train_test_split(df, test_size=TEST_SIZE, random_state=random_state)
        for df in dataframes
    ]
    featurized = [
        (featurize_that_bish(train_part, top_pairs), featurize_that_bish(test_part, top_pairs))
        for train_part, test_part in splits
    ]
    # Row 0 of a featurized frame holds the pair names; keep only the values.
    training_lst = [train_feats.iloc[1:] for train_feats, _ in featurized]
    test_lst = [test_feats.iloc[1:] for _, test_feats in featurized]
    train = np.asmatrix(pd.DataFrame(create_matrix(training_lst)).fillna(0))
    test = np.asmatrix(pd.DataFrame(create_matrix(test_lst)).fillna(0))
    return train, test

def print_probabilities(model, test):
    """Print the class probabilities ``model`` predicts for each row of ``test``.

    The header line lists the classes in the order of the probability
    columns. That order is taken from ``model.classes_`` when the model
    exposes it, and falls back to the notebook-level ``LABELS`` otherwise.

    Parameters
    ----------
    model : fitted classifier with a ``predict_proba`` method
    test : array-like of feature rows

    Returns
    -------
    None
        Output is printed, so call this directly rather than inside
        ``print(...)``.
    """
    if isinstance(test, np.matrix):
        # Recent scikit-learn releases reject np.matrix input.
        test = np.asarray(test)
    classes = getattr(model, "classes_", None)
    header = LABELS if classes is None else np.asarray(classes).tolist()
    print(header)
    for row in model.predict_proba(test):
        print(row)

In [5]:
# Seed NumPy's global RNG: sklearn's shuffle and train_test_split draw from it
# when no random_state is given, so a fresh "Restart & Run All" reproduces the
# same shuffle and the same train/test split.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

dataframes = [shuffle(pd.read_csv(file)) for file in FILES]
train, test = get_train_test(dataframes)

## Logistic Regression

In [7]:
log = LogisticRegression(penalty='l2')
# Hand scikit-learn plain float arrays: recent releases reject np.matrix input.
log.fit(np.asarray(train, dtype=float), LABELS)
# print_probabilities prints its own output and returns None, so it is not
# wrapped in print(), which would emit a stray "None".
print_probabilities(log, np.asarray(test, dtype=float))

['Brian', 'Eleanor']
[ 0.50659219  0.49340781]
[ 0.47710972  0.52289028]
None


## SVM (work in progress: feature inspection only, no model fitted yet)

In [11]:
# Rebuild the intermediate objects of get_train_test step by step so the
# featurized frames can be inspected directly.
top_pairs = get_top_pairs()
train_test = [
    train_test_split(frame, test_size=TEST_SIZE)
    for frame in dataframes
]
featurized = [
    (featurize_that_bish(train_part, top_pairs), featurize_that_bish(test_part, top_pairs))
    for train_part, test_part in train_test
]
# (train, test) feature frames for the first person.
featurized[0]

(                            99                       98             97  \
 pair       Key.space,Key.shift  Key.space,Key.backspace  Key.space,'w'   
 delta_avg             0.328646                 0.613305       0.225602   
 
                       96             95             94             93  \
 pair       Key.space,'t'  Key.space,'s'  Key.space,'r'  Key.space,'p'   
 delta_avg       0.216671       0.235099       0.244429       0.252324   
 
                       92             91             90      ...        \
 pair       Key.space,'o'  Key.space,'m'  Key.space,'l'      ...         
 delta_avg       0.261095       0.307238       0.271035      ...         
 
                  9         8              7         6         5         4   \
 pair        'c','a'   'b','e'  'a',Key.space   'a','t'   'a','s'   'a','r'   
 delta_avg  0.135022  0.172133       0.128899  0.140291  0.119371  0.138392   
 
                  3         2        1              0   
 pair        'a','n'   'a','l

## AdaBoost

In [189]:
boost = AdaBoostClassifier()
# Use the LABELS constant from the config cell; lowercase `labels` is never
# defined in this notebook and fails on a fresh kernel.
# Plain float arrays are passed because recent scikit-learn rejects np.matrix.
boost.fit(np.asarray(train, dtype=float), LABELS)
# print_probabilities prints its own output and returns None.
print_probabilities(boost, np.asarray(test, dtype=float))

['Brian', 'Eleanor']
[  1.00000000e+00   2.22044605e-16]
[  1.00000000e+00   2.22044605e-16]
None
