# The Key to Life

## Imports and Global Variables

In [8]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

# Fraction of each input frame held out for testing (passed as `test_size`
# to train_test_split).
TEST_SIZE = 0.33
# Class labels used as the fit target. The training matrix is stacked in FILES
# order, so LABELS[i] must describe FILES[i].
LABELS = ["Brian", "Eleanor", "Walt"]
# One CSV per label, read with pd.read_csv.
FILES = ["brian_keylogs_clean.csv", "eleanor_keylogs_clean.csv", "walt_combined_clean.csv"]

# Seed NumPy's global RNG once, up front. sklearn's shuffle() and
# train_test_split() draw from it when no random_state is passed, so a
# Restart & Run All now reproduces the same splits and outputs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Featurization and Train/Test Split (DO NOT TOUCH)

In [9]:
def featurize_that_bish(df, top_pairs=None):
    """Turn a keystroke log into a single wide row of per-pair mean deltas.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``pair`` column (grouping key) and a ``delta`` column.
    top_pairs : collection of pair values, optional
        When given (and non-empty), the result has exactly one column for
        every requested pair, whether or not it occurs in ``df``; pairs that
        do not occur get ``NaN`` as their ``delta_avg``. When omitted or
        empty, every pair found in ``df`` is kept.

    Returns
    -------
    pandas.DataFrame
        Two rows, ``pair`` and ``delta_avg``. Columns are ordered by pair
        descending and labelled with each pair's rank in ascending order
        (so the labels run k-1 .. 0).
    """
    pair_means = df.groupby("pair")["delta"].mean()
    if top_pairs:
        # Reindex onto the full, sorted vocabulary instead of merely filtering.
        # Column labels are positional, so if a frame silently dropped the
        # pairs it never saw, label i would mean a different pair in every
        # frame and stacking frames by label would mix unrelated features.
        pair_means = pair_means.reindex(sorted(set(top_pairs)))
    data = pair_means.reset_index()
    data.columns = ["pair", "delta_avg"]
    data = data.sort_values("pair", ascending=False)
    return data.transpose()

def combine_data(a, b):
    """Return a new DataFrame with the columns of ``b`` placed beside those of ``a``.

    Rows are aligned on the index. Neither input is modified.

    The previous implementation called ``DataFrame.append(b, axis=1)``:
    ``append`` never accepted ``axis``, was not in-place (its result was
    thrown away), and no longer exists in pandas 2.x.
    """
    return pd.concat([pd.DataFrame(a), b], axis=1)

def create_matrix(lst):
    """Stack a sequence of DataFrames row-wise and return the result as ``np.matrix``.

    Frames are aligned on their column labels; a column missing from a frame
    is left as NaN for that frame's rows.

    Raises
    ------
    IndexError
        If ``lst`` is empty (same exception the original ``lst[0]`` raised).
    """
    frames = list(lst)
    if not frames:
        raise IndexError("create_matrix() needs at least one DataFrame")
    # One concat instead of repeated DataFrame.append: append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    return np.asmatrix(pd.concat(frames))

def get_top_pairs(frames=None, n=100):
    """Return the set of the ``n`` most frequent ``pair`` values in the first frame.

    Parameters
    ----------
    frames : sequence of pandas.DataFrame, optional
        Only ``frames[0]`` is inspected. Defaults to the notebook-level
        ``dataframes`` list, which keeps the original zero-argument call
        working; pass it explicitly to avoid depending on global state.
    n : int, default 100
        How many pairs to keep.

    Frequency is the number of non-null ``delta`` values per pair.
    """
    if frames is None:
        frames = dataframes  # notebook-level global; must already be defined
    counts = frames[0].groupby("pair").count()
    ranked = counts.sort_values("delta", ascending=False)
    return set(ranked.index[:n])

def get_train_test(dataframes):
    """Build the train and test feature matrices, one row per input frame.

    Each frame is split with ``train_test_split`` (``TEST_SIZE`` held out),
    both halves are featurized, and only the ``delta_avg`` row of each
    featurized frame is kept. The rows are stacked, missing values are
    replaced with 0, and the result is returned as ``(train, test)``, both
    ``np.matrix``.

    Note: the pair vocabulary comes from ``get_top_pairs()``, which reads the
    notebook-level ``dataframes`` rather than this function's argument.
    """
    top_pairs = get_top_pairs()
    splits = [train_test_split(frame, test_size=TEST_SIZE) for frame in dataframes]

    train_rows, test_rows = [], []
    for train_part, test_part in splits:
        # .iloc[1:] drops the leading "pair" row and keeps the delta_avg row.
        train_rows.append(featurize_that_bish(train_part, top_pairs).iloc[1:])
        test_rows.append(featurize_that_bish(test_part, top_pairs).iloc[1:])

    def _stack(rows):
        # Stack the per-frame rows and zero-fill the gaps left by alignment.
        return np.asmatrix(pd.DataFrame(create_matrix(rows)).fillna(0))

    train = _stack(train_rows)
    test = _stack(test_rows)
    return train, test

def print_probabilities(model, test):
    """Print the LABELS header, then one ``predict_proba`` row per line.

    ``model`` only needs a ``predict_proba(test)`` method. Nothing is
    returned; the output goes to stdout.
    """
    # Evaluate the model before printing anything, as the header should not
    # appear if prediction fails.
    rows = [*model.predict_proba(test)]
    print(LABELS)
    for row in rows:
        print(row)

In [10]:
# Read each CSV in FILES and shuffle its rows. Order follows FILES, so
# dataframes[i] belongs to LABELS[i].
dataframes = [shuffle(frame) for frame in map(pd.read_csv, FILES)]
# Feature matrices used by the model cells below.
train, test = get_train_test(dataframes)

## Logistic Regression

In [11]:
# L2-penalised logistic regression; LABELS supplies the target for each row
# of `train`.
log = LogisticRegression(penalty='l2')
log.fit(train, LABELS)
# print_probabilities() writes to stdout itself and returns None, so wrapping
# it in print() only added a stray "None" line to the output.
print_probabilities(log, test)

['Brian', 'Eleanor', 'Walt']
[ 0.38289894  0.34916268  0.26793838]
[ 0.37036416  0.38082694  0.2488089 ]
[ 0.31473201  0.31423442  0.37103357]
None


## SVM

In [12]:
# Repeats the first three steps of get_train_test() at notebook scope so the
# intermediate featurized frames can be inspected. train_test_split is called
# again here, so this is a new random draw, not the split behind `train`/`test`.
top_pairs = get_top_pairs()
train_test = [train_test_split(df, test_size = TEST_SIZE) for df in dataframes]
featurized = [(featurize_that_bish(pair[0], top_pairs), featurize_that_bish(pair[1], top_pairs)) for pair in train_test]
# Show the (train, test) featurized pair for the first entry of `dataframes`.
featurized[0]

(                            99                       98             97  \
 pair       Key.space,Key.shift  Key.space,Key.backspace  Key.space,'w'   
 delta_avg             0.330394                 0.623971        0.24462   
 
                       96             95             94             93  \
 pair       Key.space,'t'  Key.space,'s'  Key.space,'r'  Key.space,'p'   
 delta_avg        0.21059       0.253325       0.260316        0.27709   
 
                       92             91             90      ...        \
 pair       Key.space,'o'  Key.space,'m'  Key.space,'l'      ...         
 delta_avg       0.237001       0.275468       0.282835      ...         
 
                  9         8              7         6         5         4   \
 pair        'c','a'   'b','e'  'a',Key.space   'a','t'   'a','s'   'a','r'   
 delta_avg  0.133903  0.168773       0.140696  0.133799  0.113459  0.137642   
 
                  3         2         1              0   
 pair        'a','n'   'a','

## AdaBoost

In [13]:
# Print small probabilities as plain decimals instead of scientific notation.
np.set_printoptions(suppress=True)

boost = AdaBoostClassifier()
boost.fit(train, LABELS)
# The earlier bare `boost.predict(test)` was dropped: as a mid-cell expression
# its value was neither displayed nor stored.
boost.predict_proba(test)

array([[ 0.99997529,  0.00000002,  0.00002469],
       [ 0.01972599,  0.98023097,  0.00004304],
       [ 0.0000345 ,  0.00001185,  0.99995365]])