In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [2]:
def write_to_submission_file(predicted_probs, out_file: str = 'to_submission.csv',
                             target='target', index_label='session_id'):
    """Save predictions as a submission CSV with a 1-based row-number index.

    Rows are numbered 1..len(predicted_probs); that numbering is written as
    the index column under ``index_label`` and the predictions go into a
    single column named ``target``.

    Args:
        predicted_probs: Sequence or array of predictions, one per row.
        out_file: Destination path handed to ``DataFrame.to_csv``.
        target: Header of the prediction column.
        index_label: Header of the row-number column.
    """
    row_numbers = np.arange(1, len(predicted_probs) + 1)
    submission = pd.DataFrame(predicted_probs, index=row_numbers, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [3]:
"""Define all type transformations in a single function"""
def convert_types(df: pd.DataFrame) -> pd.DataFrame:
    """Apply every column type conversion for a sessions frame in one place.

    Conversions performed:
      * columns whose name contains "site": missing values become 0, then
        the columns are cast to ``uint16``;
      * columns whose name contains "time": parsed with ``pd.to_datetime``
        (missing values become ``NaT``);
      * ``target`` (only when present): cast to ``uint8``.

    The frame is modified in place and the same object is returned, so
    callers may use either the return value or their own reference.

    Args:
        df: Sessions frame; column names are expected to be strings.

    Returns:
        The same ``df`` object with the converted columns.
    """
    sites = [s for s in df.columns if "site" in s]
    if sites:  # skip the assignment entirely when there are no site columns
        df[sites] = df[sites].fillna(0).astype('uint16')
    times = [t for t in df.columns if "time" in t]
    if times:  # likewise for frames without any timestamp columns
        df[times] = df[times].apply(pd.to_datetime)
    if 'target' in df.columns:
        df['target'] = df.target.astype('uint8')
    return df

In [4]:
import os

# Single place to change the dataset location; both archives live in this directory.
DATA_DIR = '../../../../data/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2'

# Training sessions: convert dtypes, then order rows chronologically by the
# first visit time so that time-ordered cross-validation sees them in sequence.
train_df = pd.read_csv(f'{DATA_DIR}/train_sessions.csv.zip')
train_df = convert_types(train_df)
# Reassign rather than sort with inplace=True, so the cell reads as a plain
# "input -> output" step and is safe to re-run.
train_df = train_df.sort_values(by='time1')

# Test sessions: same dtype conversion; rows are left in file order.
test_df = pd.read_csv(f'{DATA_DIR}/test_sessions.csv.zip')
test_df = convert_types(test_df)

In [5]:
# Peek at the first rows after type conversion and chronological sorting.
train_df.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
21668,21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,NaT,0,NaT,0,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
54842,54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
77291,77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,...,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17,0
114020,114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,...,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20,0
146669,146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,...,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22,0


In [5]:
# Render every session's site-id columns as one line of text, so that each
# session can be fed to a text vectorizer as a single "document".
sites = [column for column in train_df.columns if 'site' in column]
train_sites_str, test_sites_str = (
    frame[sites].to_string(header=False, index=False).split('\n')
    for frame in (train_df, test_df)
)

In [6]:
%%time
# Bag of site-id n-grams: counts 1-, 2- and 3-grams of consecutive tokens in
# each session string, with the vocabulary capped at 50,000 features.
# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of
# 2+ characters, so single-digit site ids (including the 0 used as padding)
# appear to be dropped — confirm this is intended.
cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
# Learn the vocabulary on the training sessions only, then reuse it for test.
X_train = cv.fit_transform(train_sites_str)
X_test = cv.transform(test_sites_str)
# Displayed as the cell result: the shape of each matrix.
X_train.shape, X_test.shape

Wall time: 17.8 s


((253561, 50000), (82797, 50000))

In [7]:
# Labels come from the same frame the session strings were built from, so
# rows of y_train line up with rows of X_train.
y_train = train_df['target']

In [8]:
# Time-ordered cross-validation with 10 splits; meaningful only because
# train_df was sorted by time1 before the feature matrix was built.
time_split = TimeSeriesSplit(n_splits=10)

In [10]:
# Sanity check: (train size, validation size) for every time-series fold.
[(train_idx.shape[0], valid_idx.shape[0])
 for train_idx, valid_idx in time_split.split(X_train)]

[(23051, 23051),
 (46102, 23051),
 (69153, 23051),
 (92204, 23051),
 (115255, 23051),
 (138306, 23051),
 (161357, 23051),
 (184408, 23051),
 (207459, 23051),
 (230510, 23051)]

In [9]:
# Baseline classifier: logistic regression with C=1, the liblinear solver and
# a fixed random_state.
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [12]:
%%time
# ROC AUC of the baseline on each time-series fold (site n-gram features only).
cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=1)

Wall time: 2min 9s


In [13]:
# Per-fold scores, their mean and their standard deviation.
cv_scores, cv_scores.mean(), cv_scores.std()

(array([0.83141992, 0.6466962 , 0.87991757, 0.9631551 , 0.84221701,
        0.87840646, 0.94475893, 0.85321739, 0.92987546, 0.90752885]),
 0.8677192872976857,
 0.08472533633318961)

In [14]:
# Refit the baseline on the full training set before predicting on test.
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=17, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# Local score 0.868 (mean ROC AUC over the time-series folds)
# Public score 0.913 (presumably the competition's public leaderboard — confirm)
# Column 1 of predict_proba holds the probability of the second class,
# i.e. target == 1 for a 0/1 target.
logit_test_probs = logit.predict_proba(X_test)[:, 1]
write_to_submission_file(logit_test_probs, 'logit_subm3.csv')

In [16]:
# Quick look at the predicted probabilities that were just written out.
logit_test_probs

array([1.46569226e-03, 2.05174266e-08, 4.69573944e-08, ...,
       2.88023398e-03, 5.58184901e-04, 1.41342417e-05])

In [10]:
def add_time_features(df, X_sparse):
    """Append four time-of-day indicator columns to a sparse feature matrix.

    The hour of ``df['time1']`` is bucketed into inclusive ranges and each
    bucket becomes one 0/1 column, appended in this order:
    morning (7-11), day (12-18), evening (19-23), night (0-6).

    Args:
        df: Frame with a datetime ``time1`` column, one row per row of
            ``X_sparse``.
        X_sparse: Existing feature matrix to extend.

    Returns:
        The horizontally stacked matrix: ``X_sparse`` followed by the four
        indicator columns.
    """
    start_hour = df['time1'].dt.hour
    # (first hour, last hour) of each bucket, both ends inclusive.
    hour_ranges = ((7, 11), (12, 18), (19, 23), (0, 6))
    indicator_columns = [
        start_hour.between(first, last).astype(int).values[:, np.newaxis]
        for first, last in hour_ranges
    ]
    return hstack([X_sparse, *indicator_columns])

In [11]:
# Site n-gram matrix plus the four time-of-day indicator columns.
X_train_time = add_time_features(train_df, X_train)

In [19]:
%%time
# Same cross-validation as the baseline, now on the matrix with time features.
cv_scores = cross_val_score(logit, X_train_time, y_train, cv=time_split, scoring='roc_auc', n_jobs=1)

Wall time: 2min 23s


In [20]:
# Per-fold scores, their mean and their standard deviation (with time features).
cv_scores, cv_scores.mean(), cv_scores.std()

(array([0.87652191, 0.75122963, 0.93062062, 0.978644  , 0.90399626,
        0.93831429, 0.96248922, 0.92731291, 0.94885535, 0.94043836]),
 0.9158422561223576,
 0.061199291822115025)

In [12]:
%%time
# Refit on the full training set, this time including the time-of-day features.
logit.fit(X_train_time, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=17, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
# Local score 0.916 (mean ROC AUC over the time-series folds)
# Public score 0.938 (presumably the competition's public leaderboard — confirm)
# The test matrix needs the same extra columns the model was fitted with.
X_test_time = add_time_features(test_df, X_test)
logit_test_probs = logit.predict_proba(X_test_time)[:, 1]
write_to_submission_file(logit_test_probs, 'logit_subm4.csv')

In [13]:
# Candidate values for C: 10 log-spaced points from 1e-2 to 1e2.
c_values = np.logspace(start=-2, stop=2, num=10)
# Search over C using the same scoring and time-ordered folds as before.
logit_grid_searcher = GridSearchCV(
    estimator=logit,
    param_grid={'C': c_values},
    scoring='roc_auc',
    cv=time_split,
    n_jobs=1,
    verbose=1,
)

In [14]:
%%time
# Runs the full search: 10 candidate C values x 10 folds = 100 fits.
logit_grid_searcher.fit(X_train_time, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 31.9min finished


Wall time: 32min 16s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=17, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=1,
             param_grid={'C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=F

In [16]:
# Best mean cross-validated score and the C value that achieved it.
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9173778863865611, {'C': 0.21544346900318834})

In [20]:
# Local score 0.917 (best mean CV ROC AUC from the grid search)
# Public score: not recorded
# predict_proba on the searcher uses the best estimator found, refit on the
# full training data; X_test_time is the test matrix with time features.
logit_test_probs = logit_grid_searcher.predict_proba(X_test_time)[:, 1]
write_to_submission_file(logit_test_probs, 'logit_subm5.csv')