# The Key to Life

## Imports and Global Variables

In [14]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

TEST_SIZE = 0.33
LABELS = ["Brian", "Eleanor", "Walt"]
FILES = ["brian_keylogs_clean.csv", "eleanor_keylogs_clean.csv", "walt_combined_clean.csv"]

## Featurization and Train/Test Split (DO NOT TOUCH)

In [17]:
def featurize_that_bish(df, n, top_pairs=[], frac = 0.01):
    final = [pd.DataFrame(columns=top_pairs)]
    for i in range(n):
        rand = df.sample(frac=frac)
        data = pd.DataFrame(rand.groupby("pair")['delta'].mean().reset_index()) # random sample here
        if top_pairs:
            data = data.where(data["pair"].isin(top_pairs)).dropna()
        data.columns = ["pair", "delta_avg"]
        data = data.reset_index()
        del data["index"]
        data = data.sort_values("pair", ascending=False)
        data = data.transpose()
        data = pd.DataFrame(data)
        data.columns = data.iloc[0]
        data = data.drop(data.index[0])
        final.append(data)
    ret = pd.DataFrame(create_matrix(final)).fillna(0)
    return ret

def combine_data(a, b):
    new = pd.DataFrame(a)
    new.append(b, axis=1)
    return new

def create_matrix(lst):
    mat = lst[0]
    for mat1 in lst[1:]:
        mat = mat.append(mat1)
    return np.asmatrix(mat)

def get_top_pairs():
    return set(dataframes[0].groupby("pair").count().sort_values("delta", ascending=False).reset_index()[:100]['pair'])

def get_train_test_labels(dataframes, n = 1000):
    top_pairs = get_top_pairs()
    train_test = [train_test_split(df, test_size = TEST_SIZE) for df in dataframes]
    featurized = [(featurize_that_bish(pair[0], n, top_pairs), featurize_that_bish(pair[1], n, top_pairs)) for pair in train_test]
    training_lst = [pair[0] for pair in featurized]
    test_lst = [pair[1] for pair in featurized]
    train = np.asmatrix(pd.DataFrame(create_matrix(training_lst)).fillna(0))
    test = np.asmatrix(pd.DataFrame(create_matrix(test_lst)).fillna(0))
    labels = get_labels(n)
    return train, test, labels

def get_labels(n):
    labels = []
    for label in LABELS:
        labels.extend([label for _ in range(n)])
    return labels

def print_probabilities(model, test):
    result = list(model.predict_proba(test))
    print(LABELS)
    for arr in result:
        print(arr)
        
def create_metrics(model, n=100):
    confusion_matrix = np.zeros((len(LABELS), len(LABELS)))
    def create_confusion_matrix(predictions):
        for i in range(len(predictions)):
            j = LABELS.index(predictions[i])
            confusion_matrix[i,j] += 1

    for i in range(n):
        dataframes = [shuffle(pd.read_csv(file)) for file in FILES]
        train, test, labels = get_train_test_labels(dataframes)
        clf = model()
        clf.fit(train, labels)
        predictions = clf.predict(test)
        create_confusion_matrix(predictions)
    return confusion_matrix

In [18]:
dataframes = [shuffle(pd.read_csv(file)) for file in FILES]
train, test, labels = get_train_test_labels(dataframes)

## Logistic Regression

In [11]:
# log = LogisticRegression(penalty='l2')
# log.fit(train, labels)
# print(print_probabilities(log, test))

## SVM

In [10]:
# top_pairs = get_top_pairs()
# train_test = [train_test_split(df, test_size = TEST_SIZE) for df in dataframes]
# featurized = [(featurize_that_bish(pair[0], top_pairs), featurize_that_bish(pair[1], top_pairs)) for pair in train_test]
# featurized[0]

## Random Forests

In [20]:
clf = LogisticRegression() #AdaBoostClassifier()
clf.fit(train, labels)
pred = clf.predict(test)
print(classification_report(labels, pred))

             precision    recall  f1-score   support

      Brian       0.97      0.59      0.74      1000
    Eleanor       0.75      0.63      0.68      1000
       Walt       0.59      0.92      0.72      1000

avg / total       0.77      0.71      0.71      3000



# Metrics

In [9]:
create_metrics(LogisticRegression, n = 1)

IndexError: index 3 is out of bounds for axis 0 with size 3