# <a href='https://www.kaggle.com/c/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2'>Alice Kaggle competition</a>

## (work in progress)

In [19]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from pathlib import Path

In [2]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [3]:
def validate(X_train, y_train, cv, ):
    logit = LogisticRegression(C=1, random_state=17)
    cv_scores = cross_val_score(logit, X_train, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
    print(cv_scores, cv_scores.mean())

In [4]:
PATH_TO_DATA = Path('.')

In [5]:
train_df = pd.read_csv(PATH_TO_DATA/'train_sessions.csv',
                       index_col='session_id')
test_df = pd.read_csv(PATH_TO_DATA/'test_sessions.csv',
                      index_col='session_id')

# Convert time1, ..., time10 columns to datetime type
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Sort the data by time
train_df = train_df.sort_values(by='time1')

train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [6]:
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites].fillna(0).astype('int').to_csv('train_sessions_text.txt', 
                                               sep=' ', index=None, header=None)
test_df[sites].fillna(0).astype('int').to_csv('test_sessions_text.txt', 
                                              sep=' ', index=None, header=None)

In [7]:
!head -5 train_sessions_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947


In [8]:
%%time
cv = TfidfVectorizer(ngram_range=(1, 3), max_features=60000)
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
X_train.shape, X_test.shape

CPU times: user 12.5 s, sys: 212 ms, total: 12.7 s
Wall time: 12.7 s


In [9]:
y_train = train_df['target'].astype('int')

In [10]:
(train_df.iloc[2,:]['time10'] - train_df.iloc[2,:]['time1']).total_seconds()

4.0

In [11]:
def add_time_features(df, X_sparse, scalers=None, column='time1'):
    hour = df[column].dt.hour
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    dow = df[column].dt.dayofweek
    is_weekday = (dow < 5).astype('int')
    session_length = (df[times].max(axis=1) - df['time1']).apply(lambda x: x.total_seconds())
    
    
    if not scalers:
        scalers = [StandardScaler() for _ in range(2)]
        scalers[0].fit(dow.values.reshape(-1, 1))
        scalers[1].fit(session_length.values.reshape(-1, 1))
    
    dow = scalers[0].transform(dow.values.reshape(-1, 1))
    session_length = scalers[1].transform(session_length.values.reshape(-1, 1))
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1), dow, is_weekday.values.reshape(-1, 1),
                session_length]).tocsr()
    return X, scalers

In [12]:
%%time
X_train_new, sc = add_time_features(train_df.fillna(0), X_train)
X_test_new, _ = add_time_features(test_df.fillna(0), X_test, scalers=sc)



CPU times: user 3.5 s, sys: 100 ms, total: 3.6 s
Wall time: 3.6 s


In [13]:
X_train_new.shape, X_test_new.shape

((253561, 60007), (82797, 60007))

In [16]:
%%time
validate(X_train_new, y_train, time_split)

[ 0.8725817   0.82403298  0.82745999  0.96486041  0.9114428   0.95009921
  0.94533402  0.94102407  0.96086367  0.96512831] 0.916282714429
CPU times: user 1.45 s, sys: 81.2 ms, total: 1.54 s
Wall time: 13.4 s


## LightGBM

In [17]:
import lightgbm as lgb

In [21]:
train_part_size = int(0.7 * y_train.shape[0])
X_train_part = X_train_new[:train_part_size, :]
y_train_part = y_train[:train_part_size]
X_valid =  X_train_new[train_part_size:, :]
y_valid = y_train[train_part_size:]

In [22]:
lgb_x_train_part = lgb.Dataset(X_train_part.astype(np.float32), label=y_train_part)
lgb_x_valid = lgb.Dataset(X_valid.astype(np.float32), label=y_valid)

In [27]:
param = {'num_leaves': 31, 'num_trees': 200, 'objective': 'binary', 'metric': 'auc'}

In [28]:
lgb_best = lgb.train(param, lgb_x_train_part, valid_sets=[lgb_x_valid], early_stopping_rounds=20)



[1]	valid_0's auc: 0.690417
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's auc: 0.865808
[3]	valid_0's auc: 0.873214
[4]	valid_0's auc: 0.912131
[5]	valid_0's auc: 0.917753
[6]	valid_0's auc: 0.909673
[7]	valid_0's auc: 0.908866
[8]	valid_0's auc: 0.910445
[9]	valid_0's auc: 0.926096
[10]	valid_0's auc: 0.914799
[11]	valid_0's auc: 0.924381
[12]	valid_0's auc: 0.928376
[13]	valid_0's auc: 0.929766
[14]	valid_0's auc: 0.93146
[15]	valid_0's auc: 0.932404
[16]	valid_0's auc: 0.933404
[17]	valid_0's auc: 0.934239
[18]	valid_0's auc: 0.935311
[19]	valid_0's auc: 0.935598
[20]	valid_0's auc: 0.936622
[21]	valid_0's auc: 0.936671
[22]	valid_0's auc: 0.937569
[23]	valid_0's auc: 0.937261
[24]	valid_0's auc: 0.937099
[25]	valid_0's auc: 0.937656
[26]	valid_0's auc: 0.938203
[27]	valid_0's auc: 0.938771
[28]	valid_0's auc: 0.938977
[29]	valid_0's auc: 0.939996
[30]	valid_0's auc: 0.940371
[31]	valid_0's auc: 0.940737
[32]	valid_0's auc: 0.940751
[33]	valid_0's auc: 

In [29]:
lgb_x_train = lgb.Dataset(X_train_new.astype(np.float32), label=y_train)

In [30]:
%%time
lgb_cv = lgb.cv(param, lgb_x_train, early_stopping_rounds=20, folds=time_split)



CPU times: user 22min 57s, sys: 46.7 s, total: 23min 44s
Wall time: 7min 20s


In [31]:
np.mean(lgb_cv['auc-mean'])

0.89217608678285298

## Train Logistic regression on the same train set

In [190]:
%%time
logit_train = LogisticRegression(C=1.6681005372000592, random_state=17)
logit_train.fit(X_train_part, y_train_part)

CPU times: user 4.42 s, sys: 148 ms, total: 4.57 s
Wall time: 2.34 s


In [191]:
logit_predictions = logit_train.predict_proba(X_valid)[:, 1]
lgb_predictions = lgb_best.predict(X_valid.astype(np.float32), num_iteration=lgb_best.best_iteration)

In [192]:
roc_auc_score(y_valid, logit_predictions)

0.94188422274516181

In [193]:
roc_auc_score(y_valid, lgb_predictions)

0.94530180473676972

In [194]:
betas = np.linspace(0, 1, 11, endpoint=True)

In [195]:
for beta in betas:
    overall_predictions = beta * logit_predictions + (1 - beta) * lgb_predictions
    print(beta, roc_auc_score(y_valid, overall_predictions))

0.0 0.945301804737
0.1 0.949829470114
0.2 0.950715451004
0.3 0.950863521429
0.4 0.950636258027
0.5 0.950025162319
0.6 0.949077061126
0.7 0.94785876879
0.8 0.946300896819
0.9 0.944434429654
1.0 0.941884222745


In [202]:
best_beta=0.3

## Tune hyperparameters

In [34]:
logit = LogisticRegression(random_state=17, C=1)

In [35]:
time_split = TimeSeriesSplit(n_splits=10)

In [42]:
c_values = np.logspace(-2, 2, 10)
logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [46]:
%%time
logit_grid_searcher.fit(X_train_new, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


ImportError: [joblib] Attempting to do parallel computing without protecting your import on a system that does not support forking. To use parallel-computing in a script, you must protect your main loop using "if __name__ == '__main__'". Please see the joblib documentation on Parallel for more information

In [40]:
logit_grid_searcher.best_score_

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

In [41]:
logit_grid_searcher.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

## Train on full dataset

In [197]:
%%time
c = logit_grid_searcher.best_params_['C']
logit = LogisticRegression(C=1, random_state=17)
logit.fit(X_train_new, y_train)

CPU times: user 5.3 s, sys: 168 ms, total: 5.47 s
Wall time: 2.79 s


In [198]:
logit_preds = logit.predict_proba(X_test_new)[:, 1]

In [199]:
lgb_preds = lgb_best.predict(X_test_new.astype(np.float32), num_iteration=lgb_best.best_iteration)

In [220]:
best_beta=0.4

In [221]:
combined_predictions = best_beta * logit_preds + (1 - best_beta) * lgb_preds

In [222]:
write_to_submission_file(combined_predictions, 'combined.csv', )

In [77]:
!pwd

/home/vitaly/Downloads/dev/kaggle/alice
