# User Authentication Based on Mouse Characteristics #

## Load Packages ##

In [None]:
import pandas as pd
import numpy as np

import os
import pickle
import copy

# preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# algorithms
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
# from xgboost import XGBClassifier

# optimization
from sklearn.model_selection import RandomizedSearchCV

# performance
from sklearn.metrics import roc_auc_score

from joblib import dump
from joblib import load

# random seed 
np.random.seed(0)

## Load Data ##

In [None]:
# Root folder holding the aggregated pickle and the training_files/ tree.
# It can be overridden through the MOUSE_AUTH_DATA_DIR environment variable so
# the notebook is not tied to one machine; the default is the original path.
# os.path.join(..., '') guarantees a trailing separator, because later cells
# build paths by plain string concatenation (data_dir + '...').
data_dir = os.path.join(
    os.environ.get('MOUSE_AUTH_DATA_DIR', '/Users/yangyulong/Documents/yzkj/'),
    '',
)

In [None]:
# Load the pre-aggregated training features into a DataFrame. The cells below
# read at least the 'user', 'session' and 'categ_agg' columns from it.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file that
# comes from a trusted source.
all_train = pd.read_pickle(data_dir + 'all_training_aggregation.pickle')

In [None]:
# Collect every file under training_files/. The paths are sorted because
# os.walk yields entries in an arbitrary, filesystem-dependent order, which
# would otherwise make the seeded split below differ between machines.
file_paths = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(data_dir + "training_files/")
    for file in files
)


def _user_session(path):
    """Return the (user, session) pair given by the last two components of *path*."""
    user, session = path.split(os.path.sep)[-2:]
    return user, session


# randomly pick 66% of all training sessions, use these sessions to train classification models
# A permutation draws WITHOUT replacement, so exactly floor(0.66 * n) distinct
# files are selected; np.random.randint may return the same index repeatedly,
# which silently shrinks the training set.
n_train = int(np.floor(len(file_paths) * 0.66))
shuffled = np.random.permutation(len(file_paths))
draw_train = shuffled[:n_train]
train_pairs = [_user_session(file_paths[i]) for i in draw_train]
train_users = [user for user, _ in train_pairs]
train_sessions = [session for _, session in train_pairs]

# the rest of the sessions are validation data
draw_val = sorted(shuffled[n_train:].tolist())
val_pairs = [_user_session(file_paths[i]) for i in draw_val]
val_users = [user for user, _ in val_pairs]
val_sessions = [session for _, session in val_pairs]

# Select rows by the (user, session) PAIR. Testing 'user' and 'session'
# independently would also admit rows whose user and session were each drawn,
# but from different files, letting training and validation rows overlap.
row_pairs = list(zip(all_train['user'], all_train['session']))
train_lookup = set(train_pairs)
val_lookup = set(val_pairs)

# .copy() gives independent frames: later cells add columns to df_train and
# df_val, which must neither warn about nor write through to all_train.
df_train = all_train[np.array([pair in train_lookup for pair in row_pairs], dtype=bool)].copy()
df_val = all_train[np.array([pair in val_lookup for pair in row_pairs], dtype=bool)].copy()

## Process Data ##

In [None]:
# Label encoders: one for the user labels, one for the categorical feature.
le_user, le_categ = LabelEncoder(), LabelEncoder()

# One-hot encoders. Only oh_categ is fitted in the cells visible below;
# oh_user is created alongside it but not used there.
oh_user, oh_categ = OneHotEncoder(), OneHotEncoder()

In [None]:
# df_train was obtained by boolean indexing of all_train; work on an explicit
# copy so the column assignments below do not trigger SettingWithCopyWarning.
df_train = df_train.copy()

# target for the multi-class view: integer id of the user behind each row
y_train = le_user.fit_transform(df_train['user'])

# label encode
df_train['categ_le'] = le_categ.fit_transform(df_train['categ_agg'])

# one-hot encode
vec_size = df_train['categ_agg'].nunique()
oh_columns = ['oh_categ{}'.format(i) for i in range(vec_size)]
# .toarray() returns a plain ndarray; .todense() would return a numpy.matrix,
# a class NumPy no longer recommends.
oh_train = oh_categ.fit_transform(
    df_train['categ_le'].values.reshape(-1, 1)
).toarray()
# index=df_train.index keeps the new columns aligned with the original rows
df_train[oh_columns] = pd.DataFrame(oh_train, index=df_train.index, columns=oh_columns)

# feature matrix: everything except identifiers and the raw/label-encoded category
X_train = df_train.drop(['categ_agg', 'session', 'categ_le', 'user'], axis=1)

In [None]:
# df_val was obtained by boolean indexing of all_train; work on an explicit
# copy so the column assignments below do not trigger SettingWithCopyWarning.
df_val = df_val.copy()

# Reuse the encoders fitted on the training split (transform, not fit), so
# validation rows get the same integer ids and one-hot columns.
# NOTE: LabelEncoder.transform raises ValueError for a label it did not see
# when it was fitted.
y_val = le_user.transform(df_val['user'])

# label encode
df_val['categ_le'] = le_categ.transform(df_val['categ_agg'])

# one-hot encode
oh_val_columns = ['oh_categ{}'.format(i) for i in range(vec_size)]
# .toarray() returns a plain ndarray; .todense() would return a numpy.matrix,
# a class NumPy no longer recommends.
oh_val = oh_categ.transform(
    df_val['categ_le'].values.reshape(-1, 1)
).toarray()
# index=df_val.index keeps the new columns aligned with the original rows
df_val[oh_val_columns] = pd.DataFrame(oh_val, index=df_val.index, columns=oh_val_columns)

# feature matrix: everything except identifiers and the raw/label-encoded category
X_val = df_val.drop(['categ_agg', 'session', 'categ_le', 'user'], axis=1)

## Fit Models ##

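Define the classification model. LightGBM is the one used below; the XGBoost, random-forest and logistic-regression alternatives are left commented out.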

In [None]:
# LightGBM classifier with a fixed seed for reproducibility.
# NOTE: the per-user loops further down construct their own LGBMClassifier
# instances rather than fitting this object.
clf_lgb = LGBMClassifier(random_state=0)
# Alternative models, currently disabled (their imports are commented out too):
# clf_xgb = XGBClassifier(random_state=0)
# clf_rf = RandomForestClassifier(random_state=0)
# clf_lr = LogisticRegression(random_state=0)

For a given user in the training sessions, label their mouse actions as legal (`is_illegal`=0). All the other users' mouse actions are labeled illegal (`is_illegal`=1). Loop over all users, fitting one binary classifier per user and saving it to disk.

In [None]:
# Fit one binary "is this somebody else?" classifier per user on the training
# split, report its in-sample ROC AUC, and persist it with joblib.

# The feature matrix is identical for every user (only the target changes),
# so it is built once instead of copying df_train on every iteration.
X = df_train.drop(['categ_agg', 'session', 'categ_le', 'user'], axis=1)

for user in le_user.classes_:
    # 1 = illegal session, 0 = legal session
    y = (df_train['user'] != user).astype(int)

    model = LGBMClassifier(random_state=0)
    model.fit(X, y)

    # Publish the model under the name clf_lgb_<user>, which later cells look
    # up. Writing to globals() replaces exec/eval on strings built from the
    # user name: a name that is not a valid identifier fragment would break
    # the generated code, or be executed as code.
    globals()[f'clf_lgb_{user}'] = model

    auc = roc_auc_score(y, model.predict_proba(X)[:, 1])
    dump(model, f'clf_lgb_{user}.joblib')

    print("ROC AUC in training data for {0}: {1:0.4}".format(user, auc))

    del y, model

del X

In [None]:
# Score every per-user classifier on the held-out validation split.

# The feature matrix is identical for every user (only the target changes),
# so it is built once instead of copying df_val on every iteration.
X = df_val.drop(['categ_agg', 'session', 'categ_le', 'user'], axis=1)

for user in le_user.classes_:
    # 1 = illegal session, 0 = legal session
    y = (df_val['user'] != user).astype(int)

    # Look the fitted model up by its clf_lgb_<user> name instead of eval-ing
    # a string built from the user name.
    model = globals()[f'clf_lgb_{user}']
    auc = roc_auc_score(y, model.predict_proba(X)[:, 1])

    print("ROC AUC in validation data for {0}: {1:0.4}".format(user, auc))

    del y, model

del X

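For users other than 7 and 9, the models appear to have overfit the training data, judging by the gap between the training and validation AUC printed above. Now use a cross-validated randomized hyperparameter search to reduce the overfitting.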

In [None]:
# Search space for the randomized hyperparameter search: each keyword is a
# classifier parameter name, each list holds the candidate values to sample.
gridParams = dict(
    num_leaves=[6, 8, 12, 16, 24],
    min_data_in_leaf=[24, 32, 40],
    max_bin=[32, 64, 128],
    max_depth=[8, 16, 32],
)

In [None]:
# Tune one LightGBM classifier per user with a cross-validated randomized
# search over gridParams, using ALL aggregated training data.

# The encoded feature matrix does not depend on the user, so it is built once
# rather than re-copying and re-encoding all_train on every iteration.
df = all_train.copy()

# encode (with the encoders fitted on the training split)
df['categ_le'] = le_categ.transform(df['categ_agg'])
# .toarray() returns a plain ndarray instead of the numpy.matrix from .todense()
oh_all = oh_categ.transform(df['categ_le'].values.reshape(-1, 1)).toarray()
oh_all_columns = ['oh_categ{}'.format(i) for i in range(vec_size)]
df[oh_all_columns] = pd.DataFrame(oh_all, index=df.index, columns=oh_all_columns)

X = df.drop(['categ_agg', 'session', 'categ_le', 'user'], axis=1)

for user in le_user.classes_:
    # define target label: 1 = illegal session, 0 = legal session
    y = (df['user'] != user).astype(int)

    # randomized grid search
    # random_state pins the sampled candidates; without it the search draws
    # from the global NumPy RNG, whose state depends on which cells ran before.
    random_search = RandomizedSearchCV(
        LGBMClassifier(random_state=0),
        scoring='roc_auc',
        param_distributions=gridParams,
        random_state=0,
    )
    random_search.fit(X, y)

    # With the default refit=True the search has already re-fitted a model
    # with the best hyperparameters on the full (X, y); reuse it instead of
    # fitting an identical model a second time.
    tuned_model = random_search.best_estimator_

    # AUC score (in-sample: measured on the same rows the model was fitted on)
    auc = roc_auc_score(y, tuned_model.predict_proba(X)[:, 1])
    print("ROC AUC for {0}: {1:0.4}".format(user, auc))
    # The mean cross-validated AUC is the number that reflects generalisation.
    print("Cross-validated ROC AUC for {0}: {1:0.4}".format(user, random_search.best_score_))

    # save models for each user
    # Published as clf_lgb_<user> through globals() rather than exec on a
    # string built from the user name.
    globals()[f'clf_lgb_{user}'] = tuned_model

    del y, random_search, tuned_model, auc

del df, X