In [1]:
import numpy as np
import pandas as pd
from math import isclose
from scipy.sparse import hstack, vstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from typing import Union, List

In [2]:
def write_to_submission_file(predicted_probs, out_file: str = 'to_submission.csv',
                             target='target', index_label='session_id'):
    df = pd.DataFrame(predicted_probs,
                      index = np.arange(1, len(predicted_probs) + 1),
                      columns=[target])
    df.to_csv(out_file, index_label=index_label)

In [3]:
"""Define all type transformations in a single function"""
def convert_types(df: pd.DataFrame) -> pd.DataFrame:
    sites = [s for s in df.columns if "site" in s]
    df[sites] = df[sites].fillna(0).astype('uint16')
    times = [t for t in df.columns if "time" in t]
    df[times] = df[times].apply(pd.to_datetime)
    if 'target' in df.columns:
        df['target'] = df.target.astype('uint8')
    return df

In [4]:
import os
# os.getcwd()
# os.listdir("../../../../")

train_df = pd.read_csv('../../../../data/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/train_sessions.csv.zip')
train_df = convert_types(train_df)
train_df.sort_values(by='time1', inplace=True)

test_df = pd.read_csv('../../../../data/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/test_sessions.csv.zip')
test_df = convert_types(test_df)

In [5]:
sites = [s for s in train_df.columns if 'site' in s]
times = [t for t in train_df.columns if 'time' in t]

sites, times

(['site1',
  'site2',
  'site3',
  'site4',
  'site5',
  'site6',
  'site7',
  'site8',
  'site9',
  'site10'],
 ['time1',
  'time2',
  'time3',
  'time4',
  'time5',
  'time6',
  'time7',
  'time8',
  'time9',
  'time10'])

In [6]:
# Our target variable
y_train = train_df["target"]

# United dataframe of the initial data
full_df = pd.concat([train_df.drop("target", axis=1), test_df])

# Index to split the training and test data sets
idx_split = train_df.shape[0]

In [7]:
%%time
sites_corpus = full_df[sites].to_string(header=False, index=False).split('\n')

sites_corpus[:10]

Wall time: 7.9 s


['   56    55     0     0     0     0     0     0     0     0',
 '   56    55    56    55     0     0     0     0     0     0',
 '  946   946   951   946   946   945   948   784   949   946',
 '  945   948   949   948   945   946   947   945   946   946',
 '  947   950   948   947   950   952   946   951   946   947',
 '  952   947   953   946   947   946   953   955   946   947',
 '  953   947   946   953   955   947   953   946   953  1033',
 '  946   947   954   953   946   954   946   956   957   956',
 '  946   956   946   946   955   954   946   946   946   948',
 '  948   946   948   784    49    53   812   982    52    52']

In [8]:
%%time
# tfv = TfidfVectorizer(ngram_range=(1,5), max_features=100000) # TODO optimize ngram_range and max_features
tfv = TfidfVectorizer()
X_train = tfv.fit_transform(sites_corpus[:idx_split])
X_test = tfv.transform(sites_corpus[idx_split:])

X_full = vstack([X_train, X_test]).tocsr()
X_full

Wall time: 3.68 s


<336358x41592 sparse matrix of type '<class 'numpy.float64'>'
	with 1818178 stored elements in Compressed Sparse Row format>

In [9]:
def get_auc_logit_score(X, y, C=1.0, seed=17, n_splits=10):
    # Split the data into the training and validation sets
    time_split = TimeSeriesSplit(n_splits=n_splits)
    logit = LogisticRegression(C=1, random_state=17, solver='liblinear')
    cv_scores = cross_val_score(logit, X, y, cv=time_split, scoring='roc_auc', n_jobs=-1)
    return cv_scores.mean()

In [10]:
%%time
get_auc_logit_score(X_train, y_train)

Wall time: 6.66 s


0.863111282374818

In [11]:
# Features engineering
feat_df = pd.DataFrame(index=full_df.index)
feat_df['weekday'] = full_df['time1'].dt.weekday
feat_df

Unnamed: 0,weekday
21668,5
54842,5
77291,5
114020,5
146669,5
...,...
82792,3
82793,0
82794,4
82795,5


In [12]:
type(X_full)

scipy.sparse.csr.csr_matrix

In [13]:
# Add the new feature to the sparse matrix
def add_feature(feat: str, f_df: pd.DataFrame = feat_df, X_sparse = X_full, standardize=True, onehot=False):
    tmp = f_df[[feat]].values    
    if onehot:
        enc = OneHotEncoder(dtype=np.uint8, sparse=False)
        tmp = enc.fit_transform(tmp)
    if standardize:
        tmp = StandardScaler().fit_transform(tmp)        
    return hstack([X_sparse, tmp]).tocsr()

In [14]:
def test_feature(feat: str, standardize=True, onehot=False, baseline=0.8632):
    print(f"Testing:\t{feat}")
    
    X_new = add_feature(feat, standardize=standardize, onehot=onehot)
    score = get_auc_logit_score(X_new[:idx_split, :], y_train)
    
    print(f"Score:\t\t{score:.4f}\t", end="")
    if score > baseline:
        print("+++")
        return True
    else:
        print("---")
        return False

In [15]:
def test_multi_feature(feat_list: list, baseline=0.8632):    
    print(f"Testing:\t{feat_list}")
    
    X_new = X_full
    for feat in feat_list:
        X_new = add_feature(feat, X_sparse=X_new)
    score = get_auc_logit_score(X_new[:idx_split, :], y_train)
    
    print(f"Score:\t\t{score:.4f}\t", end="")
    if score > baseline:
        print("+++")
    else:
        print("---")
    return
    

In [16]:
test_feature('weekday', standardize=False)

Testing:	weekday
Score:		0.8627	---


False

In [17]:
test_feature('weekday')

Testing:	weekday
Score:		0.8631	---


False

In [18]:
test_feature('weekday', onehot=True, standardize=False)

Testing:	weekday
Score:		0.8410	---


False

In [19]:
test_feature('weekday', onehot=True, standardize=True)

Testing:	weekday
Score:		0.8331	---


False

In [20]:
feat_df['workday'] = (full_df['time1'].dt.weekday < 5).astype(int) 
feat_df['weekend'] = (full_df['time1'].dt.weekday > 4).astype(int)

In [21]:
test_feature('workday')
test_feature('weekend')

Testing:	workday
Score:		0.8505	---
Testing:	weekend
Score:		0.8505	---


False

In [22]:
for weekday in range(7):
    feat_df[f'weekday_{weekday}'] = (full_df['time1'].dt.weekday == weekday).astype(int)
    test_feature(f'weekday_{weekday}')

Testing:	weekday_0
Score:		0.8723	+++
Testing:	weekday_1
Score:		0.8625	---
Testing:	weekday_2
Score:		0.8592	---
Testing:	weekday_3
Score:		0.8450	---
Testing:	weekday_4
Score:		0.8389	---
Testing:	weekday_5
Score:		0.8494	---
Testing:	weekday_6
Score:		0.8637	+++


In [30]:
best_weekdays = []
for weekday in range(7):
    feat_name = f'weekday_{weekday}'
    feat_df[feat_name] = (full_df['time1'].dt.weekday == weekday).astype(int)
    if test_feature(feat_name):
        best_weekdays.append(feat_name)

Testing:	weekday_0
Score:		0.8723	+++
Testing:	weekday_1
Score:		0.8625	---
Testing:	weekday_2
Score:		0.8592	---
Testing:	weekday_3
Score:		0.8450	---
Testing:	weekday_4
Score:		0.8389	---
Testing:	weekday_5
Score:		0.8494	---
Testing:	weekday_6
Score:		0.8637	+++


In [24]:
feat_df['hour'] = full_df['time1'].dt.hour.astype(int)

In [25]:
test_feature('hour')

Testing:	hour
Score:		0.8984	+++


True

In [26]:
test_feature('hour', onehot=True)

Testing:	hour
Score:		0.9062	+++


True

In [27]:
best_hours = []
for hour in range(23):
    feat_name = f'hour_{hour}'
    feat_df[feat_name] = (full_df['time1'].dt.hour == hour).astype(int)
    if test_feature(feat_name):
        best_hours.append(feat_name)
print(best_hours)

Testing:	hour_0
Score:		0.8631	---
Testing:	hour_1
Score:		0.8631	---
Testing:	hour_2
Score:		0.8631	---
Testing:	hour_3
Score:		0.8631	---
Testing:	hour_4
Score:		0.8631	---
Testing:	hour_5
Score:		0.8631	---
Testing:	hour_6
Score:		0.8631	---
Testing:	hour_7
Score:		0.8632	+++
Testing:	hour_8
Score:		0.8740	+++
Testing:	hour_9
Score:		0.8719	+++
Testing:	hour_10
Score:		0.8783	+++
Testing:	hour_11
Score:		0.8768	+++
Testing:	hour_12
Score:		0.8500	---
Testing:	hour_13
Score:		0.8438	---
Testing:	hour_14
Score:		0.8755	+++
Testing:	hour_15
Score:		0.8542	---
Testing:	hour_16
Score:		0.8713	+++
Testing:	hour_17
Score:		0.8727	+++
Testing:	hour_18
Score:		0.8677	+++
Testing:	hour_19
Score:		0.8638	+++
Testing:	hour_20
Score:		0.8635	+++
Testing:	hour_21
Score:		0.8639	+++
Testing:	hour_22
Score:		0.8637	+++
['hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_14', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22']


In [28]:
test_multi_feature(best_hours)

Testing:	['hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_14', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22']
Score:		0.9175	+++


In [31]:
test_multi_feature(best_hours+best_weekdays)

Testing:	['hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_14', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'weekday_0', 'weekday_6']
Score:		0.9185	+++
