# The Key to Life

## Imports and Global Variables

In [11]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

TEST_SIZE = 0.33
LABELS = ["Brian", "Eleanor", "Walt"]
FILES = ["brian_keylogs_clean.csv", "eleanor_keylogs_clean.csv", "walt_combined_clean.csv"]

## Featurization and Train/Test Split (DO NOT TOUCH)

In [176]:
def featurize_that_bish(df, n, top_pairs=[], frac = 0.01):
    final = [pd.DataFrame(columns=top_pairs)]
    for i in range(n):
        rand = df.sample(frac=frac)
        data = pd.DataFrame(rand.groupby("pair")['delta'].mean().reset_index()) # random sample here
        if top_pairs:
            data = data.where(data["pair"].isin(top_pairs)).dropna()
        data.columns = ["pair", "delta_avg"]
        data = data.reset_index()
        del data["index"]
        data = data.sort_values("pair", ascending=False)
        data = data.transpose()
        data = pd.DataFrame(data)
        data.columns = data.iloc[0]
        data = data.drop(data.index[0])
        final.append(data)
    ret = pd.DataFrame(create_matrix(final)).fillna(0)
    return ret

def combine_data(a, b):
    new = pd.DataFrame(a)
    new.append(b, axis=1)
    return new

def create_matrix(lst):
    mat = lst[0]
    for mat1 in lst[1:]:
        mat = mat.append(mat1)
    return np.asmatrix(mat)

def get_top_pairs():
    return set(dataframes[0].groupby("pair").count().sort_values("delta", ascending=False).reset_index()[:100]['pair'])

def get_train_test_labels(dataframes, n = 2):
    top_pairs = get_top_pairs()
    train_test = [train_test_split(df, test_size = TEST_SIZE) for df in dataframes]
    featurized = [(featurize_that_bish(pair[0], n, top_pairs), featurize_that_bish(pair[1], n, top_pairs)) for pair in train_test]
    training_lst = [pair[0] for pair in featurized]
    test_lst = [pair[1] for pair in featurized]
    train = np.asmatrix(pd.DataFrame(create_matrix(training_lst)).fillna(0))
    test = np.asmatrix(pd.DataFrame(create_matrix(test_lst)).fillna(0))
    labels = get_labels(n)
    return train, test, labels

def get_labels(n):
    labels = []
    for label in LABELS:
        labels.extend([label for _ in range(n)])
    return labels

def print_probabilities(model, test):
    result = list(model.predict_proba(test))
    print(LABELS)
    for arr in result:
        print(arr)

In [177]:
dataframes = [shuffle(pd.read_csv(file)) for file in FILES]
train, test, labels = get_train_test_labels(dataframes)

## Logistic Regression

In [178]:
labels

['Brian', 'Brian', 'Eleanor', 'Eleanor', 'Walt', 'Walt']

In [179]:
log = LogisticRegression(penalty='l2')
log.fit(train, labels)
print(print_probabilities(log, test))

['Brian', 'Eleanor', 'Walt']
[ 0.36961281  0.33582337  0.29456382]
[ 0.361234    0.27640089  0.36236511]
[ 0.37833692  0.31620379  0.3054593 ]
[ 0.30798607  0.32860233  0.3634116 ]
[ 0.32214953  0.33165515  0.34619532]
[ 0.33921226  0.33836904  0.3224187 ]
None


## SVM

In [180]:
top_pairs = get_top_pairs()
train_test = [train_test_split(df, test_size = TEST_SIZE) for df in dataframes]
featurized = [(featurize_that_bish(pair[0], top_pairs), featurize_that_bish(pair[1], top_pairs)) for pair in train_test]
featurized[0]

TypeError: 'set' object cannot be interpreted as an integer

## Random Forests

In [6]:
boost = AdaBoostClassifier()
boost.fit(train, labels)
boost.predict(test)
np.set_printoptions(suppress=True)
boost.predict_proba(test)

array([[ 0.99926188,  0.00055403,  0.00018409],
       [ 0.00141356,  0.99772649,  0.00085995],
       [ 0.00025498,  0.00000019,  0.99974484]])

In [None]:
from sklearn.metrics import confusion_matrix
y_actu = 
y_pred = 
confusion_matrix(y_actu, y_pred)
