In [None]:
import pandas as pd
import numpy as np

import os
import pickle
import copy

# preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# algorithms
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
# from xgboost import XGBClassifier

# optimization
from sklearn.model_selection import RandomizedSearchCV

# performance
from sklearn.metrics import roc_auc_score

from joblib import dump
from joblib import load

# Fix NumPy's legacy global random state so any np.random-based draws repeat across runs.
np.random.seed(seed=0)

In [None]:
# Directory that holds the challenge data files.
# Set the YZKJ_DATA_DIR environment variable to point at a different location;
# when it is unset, the original local path is used as the default.
# os.path.join(..., '') guarantees a trailing path separator, because file paths
# in this notebook are built by plain string concatenation (data_dir + filename).
data_dir = os.path.join(
    os.environ.get('YZKJ_DATA_DIR', '/Users/yangyulong/Documents/yzkj/'),
    '',
)

In [None]:
# Fresh, unfitted encoders, created in pairs:
#   le_* : LabelEncoder  -- maps raw values to integer codes
#   oh_* : OneHotEncoder -- expands integer codes into indicator columns
# The *_categ pair is applied to the 'categ_agg' column further down; the
# *_user pair is not used in the cells visible here.
le_user, le_categ = LabelEncoder(), LabelEncoder()
oh_user, oh_categ = OneHotEncoder(), OneHotEncoder()

## Apply to Test Data ##

The owner of the challenge dataset did not release the true labels for the entire test set, only for the subset that was used to calculate the public leaderboard score. I use this subset to calculate my own public leaderboard score.

In [None]:
# Load the aggregated features for all test sessions.
all_test = pd.read_pickle(data_dir + 'all_testing_aggregation.pickle')

# Public leaderboard subset: only sessions whose true labels were released.
public_test_labels = pd.read_csv(data_dir + 'public_labels.csv')
# .copy() makes public_test an independent frame, so the column assignments
# below neither raise SettingWithCopyWarning nor depend on whether the
# boolean-mask selection returned a view of all_test.
public_test = all_test[all_test['session'].isin(public_test_labels['filename'])].copy()

# Integer-encode the aggregated action category.
# NOTE(review): both encoders are fit on the test subset here; presumably the
# resulting codes/columns must match those the per-user models were trained
# on -- confirm against the training notebook.
public_test['categ_le'] = le_categ.fit_transform(public_test['categ_agg'])

# One-hot encode the integer codes. OneHotEncoder returns a sparse matrix by
# default; .toarray() yields a plain ndarray (instead of the legacy np.matrix
# produced by .todense()).
categ_onehot = oh_categ.fit_transform(
    public_test['categ_le'].values.reshape(-1, 1)).toarray()

# Take the number of one-hot columns from the encoder output itself so the
# column names always match the encoded width.
vec_size = categ_onehot.shape[1]
onehot_columns = ['oh_categ{}'.format(i) for i in range(vec_size)]

# Build the one-hot frame on public_test's index so rows align on assignment.
public_test[onehot_columns] = pd.DataFrame(
    categ_onehot, index=public_test.index, columns=onehot_columns)

The challenge required every test mouse session to be assigned an anomaly score between 0 and 1 that indicates how unlikely it is that the remote session was carried out by the respective user account, i.e., a measure of `is_illegal`=1. My classification model gives a predicted probability of `is_illegal`=1 for each mouse action in a given session; the anomaly score of the session is then the mean of the predicted probabilities over all of its actions.

In [None]:
# Maps each session id to its anomaly score: the mean, over all actions in the
# session, of the model's predicted probability for the class at index 1
# (is_illegal=1 according to the notebook text).
session_proba = dict()

# Sessions whose mean probability exceeds this value are reported via print().
threshold = 0.82

# Identifier / raw-category columns that are dropped before prediction.
non_feature_cols = ['categ_agg', 'session', 'categ_le', 'user']

# One saved classifier per user; cache loaded models so a user with several
# sessions does not trigger a reload from disk for every session.
user_models = dict()

# groupby(sort=False) visits sessions in order of first appearance (the same
# order as .unique()) and hands over each session's rows without re-masking the
# whole frame; observed=True skips empty groups if 'session' is categorical.
for session, session_rows in public_test.groupby('session', sort=False, observed=True):
    # Take the user from the session's first row, as the original code did.
    user_test = session_rows['user'].iloc[0]
    data_test = session_rows.drop(non_feature_cols, axis=1)

    # load model (file name is resolved relative to the current working directory)
    # NOTE: joblib.load unpickles the file -- only load model files you trust.
    if user_test not in user_models:
        model_filename = f'clf_lgb_{user_test}.joblib'
        user_models[user_test] = load(model_filename)
    model = user_models[user_test]

    # apply model: per-action probability of the class at index 1
    proba = model.predict_proba(data_test)[:, 1]

    # Compute the session score once and reuse it for storage and the check.
    session_mean_proba = np.mean(proba)
    session_proba[session] = session_mean_proba

    if session_mean_proba > threshold:
        print(f"Warning: session {session}，prob {session_mean_proba}")

Now calculate the final public score.

In [None]:
# One row per scored session, indexed by session id.
results = pd.DataFrame.from_dict(session_proba, orient='index', columns=['pred_proba'])

# Index the labels by session file name so they align with `results`.
# Guarded so that re-running this cell does not raise KeyError once
# 'filename' has already been moved into the index.
if 'filename' in public_test_labels.columns:
    public_test_labels = public_test_labels.set_index('filename')

# Left join: every labelled session is kept; its prediction is attached by index.
compare_to_label = public_test_labels.join(results, sort=False)

# A labelled session without a usable prediction would otherwise surface as an
# opaque NaN error inside roc_auc_score; fail early and name the sessions.
missing_pred = compare_to_label['pred_proba'].isna()
if missing_pred.any():
    raise ValueError(
        'Missing or NaN prediction for labelled session(s): {}'.format(
            list(compare_to_label.index[missing_pred])))

public_auc = roc_auc_score(
    compare_to_label['is_illegal'], compare_to_label['pred_proba'], average='macro')
print('Final ROC AUC (public score): {0:0.4}'.format(public_auc))