In [1]:
import numpy as np
import pandas as pd
from math import isclose
from scipy.sparse import hstack, vstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from typing import Union, List

In [62]:
def write_to_submission_file(predicted_probs, out_file: str = 'to_submission.csv',
                             target='target', index_label='session_id'):
    df = pd.DataFrame(predicted_probs,
                      index = np.arange(1, len(predicted_probs) + 1),
                      columns=[target])
    df.to_csv(out_file, index_label=index_label, compression="zip")

In [3]:
"""Define all type transformations in a single function"""
def convert_types(df: pd.DataFrame) -> pd.DataFrame:
    sites = [s for s in df.columns if "site" in s]
    df[sites] = df[sites].fillna(0).astype('uint16')
    times = [t for t in df.columns if "time" in t]
    df[times] = df[times].apply(pd.to_datetime)
    if 'target' in df.columns:
        df['target'] = df.target.astype('uint8')
    return df

In [4]:
import os
# os.getcwd()
# os.listdir("../../../../")

train_df = pd.read_csv('../../../../data/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/train_sessions.csv.zip')
train_df = convert_types(train_df)
train_df.sort_values(by='time1', inplace=True)

test_df = pd.read_csv('../../../../data/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/test_sessions.csv.zip')
test_df = convert_types(test_df)

In [5]:
sites = [s for s in train_df.columns if 'site' in s]
times = [t for t in train_df.columns if 'time' in t]

sites, times

(['site1',
  'site2',
  'site3',
  'site4',
  'site5',
  'site6',
  'site7',
  'site8',
  'site9',
  'site10'],
 ['time1',
  'time2',
  'time3',
  'time4',
  'time5',
  'time6',
  'time7',
  'time8',
  'time9',
  'time10'])

In [6]:
# Our target variable
y_train = train_df["target"]

# United dataframe of the initial data
full_df = pd.concat([train_df.drop("target", axis=1), test_df])

# Index to split the training and test data sets
idx_split = train_df.shape[0]

In [7]:
%%time
sites_corpus = full_df[sites].to_string(header=False, index=False).split('\n')

sites_corpus[:10]

Wall time: 8.01 s


['   56    55     0     0     0     0     0     0     0     0',
 '   56    55    56    55     0     0     0     0     0     0',
 '  946   946   951   946   946   945   948   784   949   946',
 '  945   948   949   948   945   946   947   945   946   946',
 '  947   950   948   947   950   952   946   951   946   947',
 '  952   947   953   946   947   946   953   955   946   947',
 '  953   947   946   953   955   947   953   946   953  1033',
 '  946   947   954   953   946   954   946   956   957   956',
 '  946   956   946   946   955   954   946   946   946   948',
 '  948   946   948   784    49    53   812   982    52    52']

In [8]:
%%time
tfv = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
X_train = tfv.fit_transform(sites_corpus[:idx_split])
X_test = tfv.transform(sites_corpus[idx_split:])

X_full = vstack([X_train, X_test]).tocsr()
X_full

Wall time: 7.93 s


<336358x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 3251875 stored elements in Compressed Sparse Row format>

In [67]:
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')
time_split = TimeSeriesSplit(n_splits=10)

def get_auc_logit_score(X, y, C=1.0, seed=17, n_splits=10):
    # Split the data into the training and validation sets
    time_split = TimeSeriesSplit(n_splits=n_splits)
    logit = LogisticRegression(C=C, random_state=17, solver='liblinear')
    cv_scores = cross_val_score(logit, X, y, cv=time_split, scoring='roc_auc', n_jobs=-1)
    return cv_scores.mean()

In [10]:
%%time
get_auc_logit_score(X_train, y_train)

Wall time: 7.79 s


0.8692386533117059

In [65]:
# Add the new feature to the sparse matrix
def add_feature(feat: str, f_df: pd.DataFrame = feat_df, X_sparse = X_full, standardize=True, onehot=False):
    tmp = f_df[[feat]].values    
    if onehot:
        enc = OneHotEncoder(dtype=np.uint8, sparse=False)
        tmp = enc.fit_transform(tmp)
    if standardize:
        tmp = StandardScaler().fit_transform(tmp)        
    return hstack([X_sparse, tmp]).tocsr()

In [66]:
def add_multi_feature(feat_list: list, f_df: pd.DataFrame = feat_df, X_sparse = X_full):
    X_new = X_sparse
    for feat in feat_list:
        X_new = add_feature(feat, X_sparse=X_new)
    return X_new[:idx_split, :], X_new[idx_split:, :]     

In [64]:
def test_feature(feat: str, standardize=True, onehot=False, baseline=0.8693, C=1):
    print(f"Testing:\t{feat}")
    
    X_new = add_feature(feat, standardize=standardize, onehot=onehot)
    X_train = X_new[:idx_split, :]
    score = get_auc_logit_score(X_train, y_train, C=C)
    
    print(f"Score:\t\t{score:.4f}\t", end="")
    if score > baseline:
        print("+++")
        return True
    else:
        print("---")
        return False

In [71]:
def test_multi_feature(feat_list: list, baseline=0.8693, C=1):    
    print(f"Testing:\t{feat_list}")
    
    X_new = X_full
    for feat in feat_list:
        X_new = add_feature(feat, X_sparse=X_new)
    X_train = X_new[:idx_split, :]
    score = get_auc_logit_score(X_train, y_train, C=C)
    
    print(f"Score:\t\t{score:.4f}\t", end="")
    if score > baseline:
        print("+++")
    else:
        print("---")
    return
    

In [18]:
# Features engineering
feat_df = pd.DataFrame(index=full_df.index)
feat_df['weekday'] = full_df['time1'].dt.weekday
feat_df.head()

Unnamed: 0,weekday
21668,5
54842,5
77291,5
114020,5
146669,5


In [22]:
test_feature('weekday')

Testing:	weekday
Score:		0.8668	---


False

In [23]:
test_feature('weekday', onehot=True, standardize=False)

Testing:	weekday
Score:		0.8446	---


False

In [24]:
test_feature('weekday', onehot=True, standardize=True)

Testing:	weekday
Score:		0.8368	---


False

In [70]:
best_weekdays = []
for weekday in range(7):
    feat_name = f'weekday_{weekday}'
    feat_df[feat_name] = (full_df['time1'].dt.weekday == weekday).astype(int)
    if test_feature(feat_name):
        best_weekdays.append(feat_name)

Testing:	weekday_0
Score:		0.8776	+++
Testing:	weekday_1
Score:		0.8680	---
Testing:	weekday_2
Score:		0.8644	---
Testing:	weekday_3
Score:		0.8509	---
Testing:	weekday_4
Score:		0.8442	---
Testing:	weekday_5
Score:		0.8540	---
Testing:	weekday_6
Score:		0.8690	---


In [27]:
feat_df['hour'] = full_df['time1'].dt.hour.astype(int)

In [28]:
test_feature('hour')

Testing:	hour
Score:		0.8991	+++


True

In [29]:
test_feature('hour', onehot=True)

Testing:	hour
Score:		0.9064	+++


True

In [69]:
best_hours = []
for hour in range(23):
    feat_name = f'hour_{hour}'
    feat_df[feat_name] = (full_df['time1'].dt.hour == hour).astype(int)
    if test_feature(feat_name):
        best_hours.append(feat_name)
print(best_hours)

Testing:	hour_0
Score:		0.8692	---
Testing:	hour_1
Score:		0.8692	---
Testing:	hour_2
Score:		0.8692	---
Testing:	hour_3
Score:		0.8692	---
Testing:	hour_4
Score:		0.8692	---
Testing:	hour_5
Score:		0.8692	---
Testing:	hour_6
Score:		0.8692	---
Testing:	hour_7
Score:		0.8694	+++
Testing:	hour_8
Score:		0.8798	+++
Testing:	hour_9
Score:		0.8768	+++
Testing:	hour_10
Score:		0.8838	+++
Testing:	hour_11
Score:		0.8822	+++
Testing:	hour_12
Score:		0.8542	---
Testing:	hour_13
Score:		0.8487	---
Testing:	hour_14
Score:		0.8806	+++
Testing:	hour_15
Score:		0.8592	---
Testing:	hour_16
Score:		0.8746	+++
Testing:	hour_17
Score:		0.8763	+++
Testing:	hour_18
Score:		0.8730	+++
Testing:	hour_19
Score:		0.8699	+++
Testing:	hour_20
Score:		0.8696	+++
Testing:	hour_21
Score:		0.8699	+++
Testing:	hour_22
Score:		0.8698	+++
['hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_14', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22']


In [33]:
test_multi_feature(best_weekdays)

Testing:	['weekday_0']
Score:		0.8776	+++


In [31]:
test_multi_feature(best_hours)

Testing:	['hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_14', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22']
Score:		0.9184	+++


In [32]:
test_multi_feature(best_hours+best_weekdays)

Testing:	['hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_14', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'weekday_0']
Score:		0.9193	+++


In [59]:
def predict_probs(feat_list: list,  C=1):
    X_new = X_full
    for feat in feat_list:
        X_new = add_feature(feat, X_sparse=X_new)
    X_train = X_new[:idx_split, :]
    X_test = X_new[idx_split:, :]
    estimator = LogisticRegression(C=C, random_state=17, solver='liblinear')
    estimator.fit(X_train, y_train)
    return estimator.predict_proba(X_test)[:, 1]

In [42]:
%%time
y_pred = predict_probs(best_hours + best_weekdays)
write_to_submission_file(y_pred, out_file='logit_subm8.zip.csv')

Wall time: 7.82 s


In [52]:
### Optimize regularization for best set of features

c_values = np.logspace(0, 2, 10)
logit_grid_searcher = GridSearchCV(estimator=logit,
                                   param_grid={'C': c_values},
                                   scoring='roc_auc',
                                   cv=time_split,
                                   n_jobs=-1,
                                   verbose=1)

In [49]:
X_train_new, X_test_new = add_multi_feature(best_hours + best_weekdays)

In [53]:
logit_grid_searcher.fit(X_train_new, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None),
             estimator=LogisticRegression(C=1, random_state=17,
                                          solver='liblinear'),
             n_jobs=1,
             param_grid={'C': array([  0.1       ,   0.21544347,   0.46415888,   1.        ,
         2.15443469,   4.64158883,  10.        ,  21.5443469 ,
        46.41588834, 100.        ])},
             scoring='roc_auc', verbose=1)

In [54]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9253743965862873, {'C': 46.41588833612777})

In [57]:
logit_test_probs = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_probs, 'logit_subm9.csv.zip')

In [72]:
test_multi_feature(best_hours + best_weekdays, C=4.64)

Testing:	['hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_14', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'weekday_0']
Score:		0.9239	+++


In [63]:
%%time
write_to_submission_file(predict_probs(best_hours + best_weekdays, C=4.64), 'logit_subm10.csv.zip')

Wall time: 10.5 s
