In [20]:
import numpy as np
import pandas as pd
from math import isclose
from matplotlib import pyplot as plt
from scipy.sparse import hstack, vstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
from typing import Union, List

In [21]:
def write_to_submission_file(predicted_probs, out_file: str = 'to_submission.csv.zip',
                             target='target', index_label='session_id'):
    """Save predictions as a zip-compressed CSV submission file.

    Parameters
    ----------
    predicted_probs : sequence or array
        One prediction per row of the submission.
    out_file : str
        Destination path of the zip-compressed CSV.
    target : str
        Name of the single prediction column.
    index_label : str
        Header written for the index column; the index itself is 1..N.
    """
    n_rows = len(predicted_probs)
    # Row ids in the submission start at 1, not 0
    row_ids = np.arange(1, n_rows + 1)
    submission = pd.DataFrame(predicted_probs, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label, compression="zip")

In [22]:
"""Define all type transformations in a single function"""
def convert_types(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with site, time and target columns cast to compact dtypes.

    * every column whose name contains ``"site"``: missing values become 0,
      then the column is cast to ``uint16``;
    * every column whose name contains ``"time"``: parsed with ``pd.to_datetime``;
    * ``target`` (only if present): cast to ``uint8``.

    The input frame is left untouched, so re-running a cell that calls this
    function on the raw data gives the same result every time.
    """
    converted = df.copy()

    site_cols = [name for name in converted.columns if "site" in name]
    if site_cols:  # skip the multi-column assignment when there is nothing to convert
        converted[site_cols] = converted[site_cols].fillna(0).astype('uint16')

    time_cols = [name for name in converted.columns if "time" in name]
    if time_cols:
        converted[time_cols] = converted[time_cols].apply(pd.to_datetime)

    if 'target' in converted.columns:
        converted['target'] = converted['target'].astype('uint8')
    return converted

In [23]:
import os
# os.getcwd()
# os.listdir("../../../../")

# Training sessions: convert dtypes, then order the rows by the time1 column
train_df = convert_types(
    pd.read_csv('../../../../data/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/train_sessions.csv.zip')
).sort_values(by='time1')

# Test sessions: same dtype conversion, original row order kept
test_df = convert_types(
    pd.read_csv('../../../../data/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/test_sessions.csv.zip')
)

In [24]:
# Column groups, selected by substring match on the column name
sites = list(filter(lambda name: 'site' in name, train_df.columns))
times = list(filter(lambda name: 'time' in name, train_df.columns))

In [25]:
# Labels of the training sessions
y_train = train_df["target"]

# Number of training rows: positions before it in full_df are train, the rest are test
idx_split = len(train_df)

# Training sessions (without the label) stacked on top of the test sessions
full_df = pd.concat([train_df.drop(columns="target"), test_df])

In [26]:
%%time
# Render the site-id columns as plain text (no header, no index) and split on
# newlines, giving one whitespace-separated "document" of site ids per session.
sites_corpus = full_df[sites].to_string(header=False, index=False).split('\n')

Wall time: 12.7 s


In [27]:
%%time
# Uni- and bi-grams of site ids, capped at 20000 features
tfv = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
# The vectorizer is fitted on the training documents only; test is just transformed
X_train = tfv.fit_transform(sites_corpus[:idx_split])
X_test = tfv.transform(sites_corpus[idx_split:])

# Train rows first, then test rows, in a single CSR matrix
X_full = vstack((X_train, X_test), format='csr')
X_full

Wall time: 12 s


<336358x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 3251875 stored elements in Compressed Sparse Row format>

In [28]:
# Module-level cross-validation splitter and classifier
time_split = TimeSeriesSplit(n_splits=10)
logit = LogisticRegression(solver='liblinear', C=1, random_state=17)

def get_auc_logit_score(X, y, C=1.0, seed=17, n_splits=10):
    """Return the mean cross-validated ROC AUC of a liblinear logistic regression.

    Parameters
    ----------
    X : feature matrix; rows are expected to be in chronological order because
        the folds come from ``TimeSeriesSplit``.
    y : labels aligned row-by-row with ``X``.
    C : inverse regularisation strength passed to ``LogisticRegression``.
    seed : ``random_state`` of the classifier. It used to be ignored in favour
        of a hard-coded 17; the default keeps the old results unchanged.
    n_splits : number of ``TimeSeriesSplit`` folds.

    Returns
    -------
    float
        Mean of the per-fold ``roc_auc`` scores.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)
    model = LogisticRegression(C=C, random_state=seed, solver='liblinear')
    cv_scores = cross_val_score(model, X, y, cv=splitter, scoring='roc_auc', n_jobs=-1)
    return cv_scores.mean()

In [29]:
%%time
BASELINE = get_auc_logit_score(X_train, y_train)
print(f"{BASELINE:.4f}")

0.8692
Wall time: 11.5 s


In [30]:
# Features engineering
# Empty frame sharing full_df's index; engineered feature columns are added to it
# one by one, so its rows stay aligned with full_df (train rows first, then test).
feat_df = pd.DataFrame(index=full_df.index)

In [31]:
# Add the new feature to the sparse matrix
def add_feature(feat: str, f_df: pd.DataFrame = feat_df, X_sparse = X_full, standardize=True, onehot=False):
    """Append one column of ``f_df`` to a sparse matrix and return the result as CSR.

    Parameters
    ----------
    feat : name of the column in ``f_df`` to append.
    f_df : frame holding the engineered features; its rows must line up with
        the rows of ``X_sparse``. The default is bound when the function is defined.
    X_sparse : matrix to extend (default bound when the function is defined).
    standardize : if True, pass the new column(s) through a freshly fitted
        ``StandardScaler``.
    onehot : if True, one-hot encode the column first. With ``standardize``
        left at True, the resulting indicator columns are standardised as well.

    Notes
    -----
    The encoder and scaler are fitted on every row of ``f_df[[feat]]``.
    """
    tmp = f_df[[feat]].values
    if onehot:
        try:
            # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`
            enc = OneHotEncoder(dtype=np.uint8, sparse_output=False)
        except TypeError:
            # older releases only know the original `sparse` keyword
            enc = OneHotEncoder(dtype=np.uint8, sparse=False)
        tmp = enc.fit_transform(tmp)
    if standardize:
        tmp = StandardScaler().fit_transform(tmp)
    return hstack([X_sparse, tmp]).tocsr()

In [32]:
def add_multi_feature(feat_list: list, f_df: pd.DataFrame = feat_df, X_sparse = X_full):
    """Append several feature columns to ``X_sparse`` and split into train/test parts.

    Parameters
    ----------
    feat_list : names of the ``f_df`` columns to append, in order.
    f_df : frame holding the features. It is now forwarded to ``add_feature``;
        previously this argument was accepted but ignored.
    X_sparse : starting matrix (default bound when the function is defined).

    Returns
    -------
    tuple
        ``(X_train, X_test)``: the rows before and from ``idx_split`` onward.
    """
    X_new = X_sparse
    for feat in feat_list:
        X_new = add_feature(feat, f_df=f_df, X_sparse=X_new)
    return X_new[:idx_split, :], X_new[idx_split:, :]

In [59]:
def test_feature(feat: str, standardize=True, onehot=False, baseline=BASELINE, C=1):
    """Cross-validate the model with one extra feature and report it against the baseline.

    The feature is appended to the base matrix via ``add_feature``, the training
    rows are scored with ``get_auc_logit_score``, and a line is printed that
    marks the result ``+++`` when it beats ``baseline`` and ``---`` otherwise.
    Returns the score.
    """
    print(f"Testing:\t{feat}")

    extended = add_feature(feat, standardize=standardize, onehot=onehot)
    score = get_auc_logit_score(extended[:idx_split, :], y_train, C=C)

    verdict = "+++" if score > baseline else "---"
    print(f"Score:\t\t{score:.4f}\t", end="")
    print(f"{verdict} baseline: {baseline:.4f}")
    return score

In [34]:
def test_multi_feature(feat_list: list, baseline=BASELINE, C=1):
    """Cross-validate the model with several extra features added at once.

    Every feature in ``feat_list`` is appended (standardised) to ``X_full``,
    the training rows are scored with ``get_auc_logit_score``, and ``+++`` is
    printed when the score beats ``baseline``, ``---`` otherwise.
    Returns the score.
    """
    print(f"Testing:\t{feat_list}")

    # Only the training part is needed for cross-validation
    train_part, _ = add_multi_feature(feat_list, X_sparse=X_full)
    score = get_auc_logit_score(train_part, y_train, C=C)

    print(f"Score:\t\t{score:.4f}\t", end="")
    print("+++" if score > baseline else "---")
    return score
    

In [35]:
def predict_probs(feat_list: list, C=1):
    """Fit on all training rows with the given extra features and score the test rows.

    Returns the second column of ``predict_proba`` for every test row.
    """
    train_part, test_part = add_multi_feature(feat_list, X_sparse=X_full)
    estimator = LogisticRegression(C=C, random_state=17, solver='liblinear').fit(train_part, y_train)
    return estimator.predict_proba(test_part)[:, 1]

In [66]:
# previous features
# One indicator column per weekday of the session's first timestamp
for weekday in range(7):
    feat_name = f'weekday_{weekday}'
    feat_df[feat_name] = (full_df['time1'].dt.weekday == weekday).astype(int)

# One indicator column per start hour. Hours run 0..23, so 24 columns are
# needed; range(23) used to leave hour_23 out.
for hour_of_day in range(24):
    feat_name = f'hour_{hour_of_day}'
    feat_df[feat_name] = (full_df['time1'].dt.hour == hour_of_day).astype(int)

# Session length in seconds: latest minus earliest timestamp of the row
feat_df['duration'] = (full_df[times].max(axis=1) - full_df[times].min(axis=1)).dt.total_seconds()

# Coarse part-of-day flags; `between` is inclusive on both ends
hour = full_df['time1'].dt.hour
feat_df['morning'] = hour.between(7, 11).astype(int)
feat_df['noon'] = hour.between(12, 18).astype(int)
feat_df['evening'] = hour.between(19, 23).astype(int)
feat_df['night'] = hour.between(0, 6).astype(int)

# Feature subsets kept from earlier experiments
best_hours = ['hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_14', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22']
best_weekdays = ['weekday_0']
best_previous = ['morning', 'noon', 'evening']

In [36]:
# time between requests
deltas = [f'delta{i}' for i in range(1, 10)]

# Subtract from every timestamp column the column to its left, drop the first
# column (it has no left neighbour) and express the gaps in seconds.
delta_df = (
    (full_df[times] - full_df[times].shift(1, axis=1))
    .drop(columns='time1')
    .apply(lambda gap: gap.dt.total_seconds())
)
delta_df.columns = deltas

delta_df.head()

Unnamed: 0,delta1,delta2,delta3,delta4,delta5,delta6,delta7,delta8,delta9
21668,0.0,,,,,,,,
54842,0.0,1784.0,2.0,,,,,,
77291,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
114020,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
146669,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [44]:
# Exploratory: row-wise np.median with no NaN handling, then NaN -> 0.0.
# NOTE(review): np.median appears to propagate NaN (see the `nan` result of
# np.median(delta_df.values) further down), so any row with a missing delta
# ends up as 0.0 here. The displayed output still shows NaN, so it presumably
# predates the .fillna(0.0) call; re-run to confirm.
delta_df.apply(np.median, axis=1).fillna(0.0)

21668     NaN
54842     NaN
77291     0.0
114020    0.0
146669    0.0
         ... 
82792     NaN
82793     0.0
82794     0.0
82795     0.0
82796     1.0
Length: 336358, dtype: float64

In [48]:
# Exploratory: per-row median over the deltas that are actually present
delta_df.apply(lambda row: np.median(row[row.notna()]), axis=1)

21668       0.000000
54842     595.333333
77291       0.444444
114020      0.333333
146669      0.222222
             ...    
82792       0.000000
82793       4.333333
82794       0.888889
82795       1.444444
82796       5.555556
Length: 336358, dtype: float64

In [47]:
# Exploratory: median over every delta of every session at once
np.median(delta_df.to_numpy())

nan

In [64]:
# Per-session summary statistics of the gaps between consecutive requests.
# NaN gaps are skipped; a statistic that is still undefined afterwards (e.g. a
# row with no gaps at all) becomes 0.0.
feat_df['delta_mean'] = delta_df.mean(axis=1, skipna=True).fillna(0.0)
feat_df['delta_std'] = delta_df.std(axis=1, skipna=True).fillna(0.0)
# Vectorised NaN-skipping median: same values as applying
# np.median(row.dropna()) row by row, without the Python-level loop and without
# the empty-slice RuntimeWarnings that rows without any gap used to trigger.
feat_df['delta_median'] = delta_df.median(axis=1, skipna=True).fillna(0.0)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [61]:
# CV score with the mean gap between requests appended as a standardised feature
test_feature(feat='delta_mean')

Testing:	delta_mean
Score:		0.8693	+++ baseline: 0.8692


0.8693248694058958

In [60]:
# CV score with the standard deviation of the gaps appended as a standardised feature
test_feature(feat='delta_std')

Testing:	delta_std
Score:		0.8700	+++ baseline: 0.8692


0.869950795559728

In [67]:
# CV score with the median gap between requests appended as a standardised feature
test_feature(feat='delta_median')

Testing:	delta_median
Score:		0.8687	--- baseline: 0.8692


0.8686523345070306

In [68]:
# CV score with the session duration (seconds) appended as a standardised feature
test_feature(feat='duration')

Testing:	duration
Score:		0.8699	+++ baseline: 0.8692


0.8698902967308999

In [19]:
# write_to_submission_file(predict_probs(best_previous + best_deltas), out_file='logit_subm13.csv.zip')