In [2]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from enum import Enum
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

from sklearn.ensemble import IsolationForest

import random
import datetime
import warnings
warnings.filterwarnings('ignore')
     

# Prepare Data functions
# get mean features per second
def get_mean_features_sec(df, mean_features, groupby_features):
  for column_name in mean_features:
    new_column_name = 'mean_second_' + column_name
    df[new_column_name] = df.groupby(groupby_features)[column_name].transform('mean')
  return df

# get mean features per mintue
def get_mean_features_minute(df, mean_features, groupby_features):
  for column_name in mean_features:
    new_column_name = 'mean_minute_' + column_name
    df[new_column_name] = df.groupby(groupby_features)[column_name].transform('mean')
  return df


# add shift features 
def get_shift(n, columns, df, session_column_name):
  features_ = []
  df_data = df
  for i in range(n):
    for c in columns:
      new_feature_name = c + '_shifted_' + str(i+1)
      df_data[new_feature_name] = df_data.groupby(session_column_name)[c].shift(i+1)
      features_ = features_ + [new_feature_name]
  return df_data, features_

df_data = pd.read_csv('./Train_Mouse.csv')
df_data.shape

# sorted buy timestamp
df_data = df_data.sort_values('timestamp').reset_index().drop(labels='index', axis=1)

# labeling user_id and session_id
user_id_array = df_data['user_id'].unique()
user_id_map = {val:idx for idx,val in enumerate(user_id_array)}
df_data['user_id_new'] = df_data['user_id'].map(lambda x: user_id_map[x])
session_id_array = df_data['session_id'].unique()
session_id_map = {val:idx for idx,val in enumerate(session_id_array)}
df_data['session_id_new'] = df_data['session_id'].map(lambda x: session_id_map[x])

# labeling + one-hot for event_type
event_list = ['', 'RELEASE', 'MOVE', 'WHEEL', 'DRAG', 'CLICK']
df_data['e'] = df_data['event_type'].map(lambda x: event_list[x])
one_hot_df = pd.get_dummies(df_data['e'], prefix='event')
df_data = pd.concat([df_data, one_hot_df], axis=1)
categorial_cols = ['event_MOVE', 'event_WHEEL', 'event_DRAG', 'event_CLICK']

# time features
df_data['datetime'] = pd.to_datetime(df_data['timestamp'], unit='ms')
df_data['day'] = df_data['datetime'].dt.date
df_data['hour'] = df_data['datetime'].dt.hour
df_data['minute'] = df_data['datetime'].dt.minute
df_data['second'] = df_data['datetime'].dt.second
time_features = ['hour', 'minute', 'second']

# timestamp features
df_data['time_stamp_min'] = df_data.groupby('session_id_new')['timestamp'].transform('min')
df_data['time_stamp'] = df_data['timestamp'] - df_data['time_stamp_min']
df_data['time_diff'] = df_data.groupby('session_id_new')['timestamp'].diff()

# calculate the x_diff y_diff and distance btw (x,y)
df_data['x_diff'] = df_data.groupby('session_id_new')['screen_x'].diff()
df_data['y_diff'] = df_data.groupby('session_id_new')['screen_y'].diff()
df_data['xy_diff'] = np.sqrt(df_data['x_diff']**2 + df_data['y_diff']**2)

# calculate the speed
df_data['x_speed'] = np.abs(df_data['x_diff'] / df_data['time_diff'])
df_data['y_speed'] = np.abs(df_data['y_diff'] / df_data['time_diff'])
df_data['xy_speed'] = df_data['xy_diff'] / df_data['time_diff']

df_data = df_data.dropna()



# Anormaly Detection
model = IsolationForest(n_estimators=100, max_samples='auto', contamination=float(0.05), random_state=42)
anomaly_features = ['screen_x', 'screen_y', 'x_speed', 'y_speed', 'xy_speed']
anomaly_mask_pd = pd.DataFrame()

for column_name in anomaly_features:
  model.fit(df_data[column_name].values.reshape(-1, 1))
  # Make predictions on the data
  anomaly_mask = model.predict(df_data[column_name].values.reshape(-1, 1)) == -1
  index_new_name = 'ab_'+column_name
  anomaly_mask_pd[index_new_name] = anomaly_mask

anomaly_mask_pd['pick_index'] = ~(anomaly_mask_pd.iloc[:, 0:5].any(axis=1))

df_data = df_data[anomaly_mask_pd['pick_index'].values]
df_data = df_data.reset_index().drop(labels='index', axis=1)

# Feature Engineering
mean_features = ['x_speed', 'y_speed', 'xy_speed', 'screen_x', 'screen_y', 'event_type']
df_data = get_mean_features_sec(df_data.dropna(), mean_features, ['session_id_new', 'day', 'hour', 'minute', 'second'])

# we take 30 step
aggragation_cols = [ 'mean_second_x_speed', 'mean_second_y_speed', 'mean_second_xy_speed', 
                     'mean_second_screen_x', 'mean_second_screen_y', 'mean_second_event_type'
                  ]
n_step = 30
df_data, features_add = get_shift(n_step, aggragation_cols, df_data, 'session_id_new')

df_data.columns
     

CPU times: user 6.02 s, sys: 29.9 ms, total: 6.05 s
Wall time: 6.18 s


Index(['uid', 'session_id', 'user_id', 'timestamp', 'event_type', 'screen_x',
       'screen_y', 'user_id_new', 'session_id_new', 'e',
       ...
       'mean_second_xy_speed_shifted_29', 'mean_second_screen_x_shifted_29',
       'mean_second_screen_y_shifted_29', 'mean_second_event_type_shifted_29',
       'mean_second_x_speed_shifted_30', 'mean_second_y_speed_shifted_30',
       'mean_second_xy_speed_shifted_30', 'mean_second_screen_x_shifted_30',
       'mean_second_screen_y_shifted_30', 'mean_second_event_type_shifted_30'],
      dtype='object', length=215)

# Data Split

In [3]:
# get test dataset: each user one session id
def get_valid_idx(df_data_, n, random_seed = False):
  if random_seed:
    random.seed()
  idx_valid = []
  for i in df_data_.groupby('user_id_new')['session_id_new'].unique():
    idx_valid.append(random.sample(list(i), 2)[0])
    idx_valid.append(random.sample(list(i), 2)[1])
  return np.sort(idx_valid)

idx_valid = get_valid_idx(df_data, 5)
print(f"The session_id picked as test dataset: \n {idx_valid}")

df_valid = df_data[df_data['session_id_new'].isin(idx_valid)]; 
df_train = df_data[~df_data.index.isin(df_valid.index)]; 
df_valid = df_valid.dropna().reset_index().drop(labels='index', axis=1)
df_train = df_train.dropna().reset_index().drop(labels='index', axis=1)

features_for_train =  features_add + aggragation_cols # numerical_cols + categorial_cols

# get the label as well, check the data shape 
df_train_x = df_train[features_for_train]; df_train_y = df_train['user_id_new'] 
df_valid_x = df_valid[features_for_train];  df_valid_y = df_valid['user_id_new'] 
# when need test dataset: 
# df_test_x = df_test[features_for_train];  df_test_y = df_test['user_id_new'] 

df_train_x.shape, df_valid_x.shape


The session_id picked as test dataset: 
 [  1   4   9  11  11  12  27  30  33  34  35  36  41  50  53  55  56  61
  66  66  76  79  80  83  86  89  91  95  96  99 103 105 107 108 111 112
 113 116 117 119]


((4226, 186), (1972, 186))

In [18]:
# (0,1) Normalization 
from sklearn.preprocessing import StandardScaler

# create a StandardScaler object
scaler = StandardScaler()
# fit the scaler to the training data and transform the training data
X_train_scaled = scaler.fit_transform(df_train_x)

# transform the testing data using the scaler fitted on the training data
X_valid_scaled = scaler.transform(df_valid_x)


# LR

In [38]:
%%time
from sklearn.linear_model import LogisticRegression

# create a Logistic Regression model
model = LogisticRegression(multi_class='ovr', C=10)

# train the model on the training set
model.fit(X_train_scaled, df_train_y)

CPU times: user 4.81 s, sys: 2.09 s, total: 6.91 s
Wall time: 5.6 s


LogisticRegression(C=10, multi_class='ovr')

# Evaluation

In [39]:
from sklearn.metrics import classification_report

y_pred_train = model.predict(X_train_scaled)
print(f"confusion matrix on train dataset: \n {classification_report(df_train_y.values, y_pred_train)}")

y_pred_valid = model.predict(X_valid_scaled)
print(f"confusion matrix on test dataset: \n {classification_report(df_valid_y.values, y_pred_valid)}")
     

confusion matrix on train dataset: 
               precision    recall  f1-score   support

           0       0.70      0.58      0.63       126
           1       0.39      0.35      0.37       158
           2       0.53      0.65      0.59       268
           3       0.43      0.34      0.38       255
           4       0.79      0.91      0.85       225
           5       0.72      0.75      0.74       247
           6       0.36      0.38      0.36       349
           7       0.41      0.21      0.28       291
           8       0.58      0.83      0.68       356
           9       0.14      0.06      0.09       277
          10       0.53      0.57      0.55       118
          11       0.61      0.28      0.39       128
          12       0.96      0.98      0.97       116
          13       0.65      0.86      0.74       187
          14       0.59      0.57      0.58       106
          15       0.62      0.70      0.66       376
          16       1.00      1.00      1.00 