In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from enum import Enum
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

import random
import datetime
import warnings
warnings.filterwarnings('ignore')
     

# Data Processing & Feature Engineering

In [13]:
# Prepare Data functions
# get mean features per second
def get_mean_features_sec(df, mean_features, groupby_features):
  for column_name in mean_features:
    new_column_name = 'mean_second_' + column_name
    df[new_column_name] = df.groupby(groupby_features)[column_name].transform('mean')
  return df

# get mean features per mintue
def get_mean_features_minute(df, mean_features, groupby_features):
  for column_name in mean_features:
    new_column_name = 'mean_minute_' + column_name
    df[new_column_name] = df.groupby(groupby_features)[column_name].transform('mean')
  return df


# add shift features 
def get_shift(n, columns, df, session_column_name):
  features_ = []
  df_data = df
  for i in range(n):
    for c in columns:
      new_feature_name = c + '_shifted_' + str(i+1)
      df_data[new_feature_name] = df_data.groupby(session_column_name)[c].shift(i+1)
      features_ = features_ + [new_feature_name]
  return df_data, features_

In [15]:
%%time
df_data = pd.read_csv('./Train_Mouse.csv')
df_data.shape

# sorted buy timestamp
df_data = df_data.sort_values('timestamp').reset_index()
df_data['index'] = df_data.index

# labeling user_id and session_id
user_id_array = df_data['user_id'].unique()
user_id_map = {val:idx for idx,val in enumerate(user_id_array)}
df_data['user_id_new'] = df_data['user_id'].map(lambda x: user_id_map[x])
session_id_array = df_data['session_id'].unique()
session_id_map = {val:idx for idx,val in enumerate(session_id_array)}
df_data['session_id_new'] = df_data['session_id'].map(lambda x: session_id_map[x])

# labeling + one-hot for event_type
event_list = ['', 'RELEASE', 'MOVE', 'WHEEL', 'DRAG', 'CLICK']
df_data['e'] = df_data['event_type'].map(lambda x: event_list[x])
one_hot_df = pd.get_dummies(df_data['e'], prefix='event')
df_data = pd.concat([df_data, one_hot_df], axis=1)
categorial_cols = ['event_MOVE', 'event_WHEEL', 'event_DRAG', 'event_CLICK']

# time features
df_data['datetime'] = pd.to_datetime(df_data['timestamp'], unit='ms')
df_data['day'] = df_data['datetime'].dt.date
df_data['hour'] = df_data['datetime'].dt.hour
df_data['minute'] = df_data['datetime'].dt.minute
df_data['second'] = df_data['datetime'].dt.second
time_features = ['hour', 'minute', 'second']

# timestamp features
df_data['time_stamp_min'] = df_data.groupby('session_id_new')['timestamp'].transform('min')
df_data['time_stamp'] = df_data['timestamp'] - df_data['time_stamp_min']
df_data['time_diff'] = df_data.groupby('session_id_new')['timestamp'].diff()

# calculate the x_diff y_diff and distance btw (x,y)
df_data['x_diff'] = df_data.groupby('session_id_new')['screen_x'].diff()
df_data['y_diff'] = df_data.groupby('session_id_new')['screen_y'].diff()
df_data['xy_diff'] = np.sqrt(df_data['x_diff']**2 + df_data['y_diff']**2)

# calculate the speed
df_data['x_speed'] = np.abs(df_data['x_diff'] / df_data['time_diff'])
df_data['y_speed'] = np.abs(df_data['y_diff'] / df_data['time_diff'])
df_data['xy_speed'] = df_data['xy_diff'] / df_data['time_diff']

mean_features = ['x_speed', 'y_speed', 'xy_speed', 'screen_x', 'screen_y', 'event_type']
df_data = get_mean_features_sec(df_data.dropna(), mean_features, ['session_id_new', 'day', 'hour', 'minute', 'second'])

# we take 30 step
aggragation_cols = [ 'mean_second_x_speed', 'mean_second_y_speed', 'mean_second_xy_speed', 
                     'mean_second_screen_x', 'mean_second_screen_y', 'mean_second_event_type'
                  ]
n_step = 25
df_data, features_add = get_shift(n_step, aggragation_cols, df_data, 'session_id_new')

CPU times: user 323 ms, sys: 4.96 ms, total: 328 ms
Wall time: 329 ms


In [16]:
df_data.columns

Index(['index', 'uid', 'session_id', 'user_id', 'timestamp', 'event_type',
       'screen_x', 'screen_y', 'user_id_new', 'session_id_new',
       ...
       'mean_second_xy_speed_shifted_24', 'mean_second_screen_x_shifted_24',
       'mean_second_screen_y_shifted_24', 'mean_second_event_type_shifted_24',
       'mean_second_x_speed_shifted_25', 'mean_second_y_speed_shifted_25',
       'mean_second_xy_speed_shifted_25', 'mean_second_screen_x_shifted_25',
       'mean_second_screen_y_shifted_25', 'mean_second_event_type_shifted_25'],
      dtype='object', length=186)

# Data Split

In [17]:
# get test dataset: each user one session id
def get_valid_idx(df_data_, n, random_seed = False):
  if random_seed:
    random.seed()
  idx_valid = []
  for i in df_data_.groupby('user_id_new')['session_id_new'].unique():
    idx_valid.append(random.sample(list(i), 2)[0])
    idx_valid.append(random.sample(list(i), 2)[1])
  return np.sort(idx_valid)

In [25]:
idx_valid = get_valid_idx(df_data, 5)
print(f"The session_id picked as test dataset: \n {idx_valid}")

df_valid = df_data[df_data['session_id_new'].isin(idx_valid)]; 
df_train = df_data[~df_data.index.isin(df_valid.index)]; 
df_valid = df_valid.dropna().reset_index()
df_train = df_train.dropna().reset_index()

df_train.shape, df_valid.shape

The session_id picked as test dataset: 
 [  0   3   5   6  10  11  13  15  17  18  21  22  24  27  28  29  30  34
  40  41  42  44  48  52  55  66  68  71  72  73  73  76  85  89  94  95
 101 108 112 115]


((5606, 187), (2907, 187))

In [26]:
features_for_train =  features_add + aggragation_cols # numerical_cols + categorial_cols

# get the label as well, check the data shape 
df_train_x = df_train[features_for_train]; df_train_y = df_train['user_id_new'] 
df_valid_x = df_valid[features_for_train];  df_valid_y = df_valid['user_id_new'] 
# when need test dataset: 
# df_test_x = df_test[features_for_train];  df_test_y = df_test['user_id_new'] 

df_train_x.shape, df_valid_x.shape

((5606, 156), (2907, 156))

# RF

In [35]:
%%time
rf = RandomForestClassifier(n_estimators=1500, random_state=42)
rf.fit(df_train_x.values, df_train_y.values)

CPU times: user 1min 13s, sys: 588 ms, total: 1min 14s
Wall time: 1min 15s


RandomForestClassifier(n_estimators=1500, random_state=42)

In [36]:
from sklearn.metrics import classification_report

print(f"confusion matrix on train dataset: \n {classification_report(df_train_y.values, rf.predict(df_train_x))}")
print(f"confusion matrix on train dataset: \n {classification_report(df_valid_y.values, rf.predict(df_valid_x))}")

confusion matrix on train dataset: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       164
           1       1.00      1.00      1.00       242
           2       1.00      1.00      1.00       319
           3       1.00      1.00      1.00       318
           4       1.00      1.00      1.00       223
           5       1.00      1.00      1.00       262
           6       1.00      1.00      1.00       367
           7       1.00      1.00      1.00       357
           8       1.00      1.00      1.00       369
           9       1.00      1.00      1.00       304
          10       1.00      1.00      1.00       202
          11       1.00      1.00      1.00       239
          12       1.00      1.00      1.00       318
          13       1.00      1.00      1.00       306
          14       1.00      1.00      1.00       187
          15       1.00      1.00      1.00       412
          16       1.00      1.00      1.00 