In [1]:
import numpy as np
import pandas as pd
from math import isclose
from matplotlib import pyplot as plt
from scipy.sparse import hstack, vstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
from typing import Union, List

In [2]:
def write_to_submission_file(predicted_probs, out_file: str = 'to_submission.csv.zip',
                             target='target', index_label='session_id'):
    """Write predicted probabilities to a zip-compressed CSV submission file.

    Rows are labelled 1..len(predicted_probs); the label column is written
    under the header ``index_label`` and the values under ``target``.
    """
    row_ids = np.arange(1, len(predicted_probs) + 1)
    submission = pd.DataFrame(predicted_probs, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label, compression="zip")

In [3]:
"""Define all type transformations in a single function"""
def convert_types(df: pd.DataFrame) -> pd.DataFrame:
    sites = [s for s in df.columns if "site" in s]
    df[sites] = df[sites].fillna(0).astype('uint16')
    times = [t for t in df.columns if "time" in t]
    df[times] = df[times].apply(pd.to_datetime)
    if 'target' in df.columns:
        df['target'] = df.target.astype('uint8')
    return df

In [4]:
import os
# os.getcwd()
# os.listdir("../../../../")

# Training sessions: cast the column types, then order rows chronologically by the
# first request (the notebook cross-validates with TimeSeriesSplit later on).
train_df = convert_types(
    pd.read_csv('../../../../data/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/train_sessions.csv.zip')
).sort_values(by='time1')

# Test sessions: same type conversion, rows stay in file order.
test_df = convert_types(
    pd.read_csv('../../../../data/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/test_sessions.csv.zip')
)

In [5]:
# Column-name groups reused throughout the notebook: every column whose name
# contains 'site' / 'time'.
sites = [col for col in train_df.columns if 'site' in col]
times = [col for col in train_df.columns if 'time' in col]

In [6]:
# Number of training rows; rows [0, idx_split) of the combined data are train,
# the rest are test
idx_split = len(train_df)

# Labels of the training sessions
y_train = train_df["target"]

# Training rows (label column removed) stacked on top of the test rows
full_df = pd.concat([train_df.drop(columns="target"), test_df])

In [7]:
%%time
# Render the site columns as plain text (no header, no index) and split on newlines,
# giving one whitespace-separated string of site ids per row of full_df.
sites_corpus = full_df[sites].to_string(header=False, index=False).split('\n')

Wall time: 8.69 s


In [8]:
%%time
# Fit the TF-IDF vectorizer (unigrams and bigrams, at most 20000 features) on the
# training rows only, then apply the fitted vectorizer to the test rows.
tfv = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
X_train = tfv.fit_transform(sites_corpus[:idx_split])
X_test = tfv.transform(sites_corpus[idx_split:])

# Stack train on top of test so that row idx_split is again the train/test boundary.
X_full = vstack([X_train, X_test]).tocsr()
X_full

Wall time: 8.62 s


<336358x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 3251875 stored elements in Compressed Sparse Row format>

In [9]:
# Module-level estimator/splitter kept for cells that may reference them directly;
# get_auc_logit_score builds its own instances from its arguments.
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')
time_split = TimeSeriesSplit(n_splits=10)

def get_auc_logit_score(X, y, C=1.0, seed=17, n_splits=10):
    """Return the mean ROC AUC of a logistic regression under time-series CV.

    Parameters
    ----------
    X, y : feature matrix and labels, passed straight to ``cross_val_score``.
    C : inverse regularisation strength of the logistic regression.
    seed : ``random_state`` of the logistic regression. Previously this
        argument was accepted but ignored (17 was hard-coded); the default
        keeps the old behaviour.
    n_splits : number of ``TimeSeriesSplit`` folds.

    Returns
    -------
    Mean of the per-fold ROC AUC scores.
    """
    cv = TimeSeriesSplit(n_splits=n_splits)
    model = LogisticRegression(C=C, random_state=seed, solver='liblinear')
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
    return cv_scores.mean()

In [10]:
%%time
# Reference CV score using only the TF-IDF site features; the feature tests below
# compare their scores against this value.
BASELINE = get_auc_logit_score(X_train, y_train)
print(f"{BASELINE:.4f}")

0.8692
Wall time: 8.09 s


In [11]:
# Feature engineering: start from an empty frame that shares full_df's index;
# engineered columns are added to it in the cells below.
feat_df = pd.DataFrame(index=full_df.index)

In [12]:
# Add the new feature to the sparse matrix
def add_feature(feat: str, f_df: Union[pd.DataFrame, None] = None, X_sparse = None, standardize=True, onehot=False):
    """Append one engineered column to a sparse feature matrix.

    Parameters
    ----------
    feat : name of the column in ``f_df`` to append.
    f_df : frame holding engineered features; ``None`` means the notebook's
        current ``feat_df``.
    X_sparse : matrix to extend; ``None`` means the notebook's current ``X_full``.
    standardize : if true, pass the (possibly one-hot encoded) values through
        a freshly fitted ``StandardScaler``.
    onehot : if true, one-hot encode the column first.

    Returns
    -------
    A new CSR matrix: ``X_sparse`` with the feature column(s) stacked on the right.
    """
    # Resolve the defaults at call time. Binding feat_df / X_full in the signature
    # froze whatever objects existed when this cell ran, so re-creating either
    # global later left the function silently working on stale data.
    if f_df is None:
        f_df = feat_df
    if X_sparse is None:
        X_sparse = X_full

    values = f_df[[feat]].values
    if onehot:
        try:
            enc = OneHotEncoder(dtype=np.uint8, sparse_output=False)
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            enc = OneHotEncoder(dtype=np.uint8, sparse=False)
        values = enc.fit_transform(values)
    if standardize:
        values = StandardScaler().fit_transform(values)
    return hstack([X_sparse, values]).tocsr()

In [13]:
def add_multi_feature(feat_list: list, f_df: Union[pd.DataFrame, None] = None, X_sparse = None):
    """Append several engineered columns and split the result into train/test.

    Parameters
    ----------
    feat_list : names of the columns of ``f_df`` to append, in order.
    f_df : frame holding engineered features; ``None`` means the notebook's
        current ``feat_df``. It is now actually forwarded to ``add_feature``
        (it used to be accepted and then ignored).
    X_sparse : matrix to extend; ``None`` means the notebook's current ``X_full``.

    Returns
    -------
    ``(X_train, X_test)``: the rows before and from ``idx_split`` of the
    extended matrix.
    """
    # Look the globals up at call time rather than freezing them in the signature.
    if f_df is None:
        f_df = feat_df
    if X_sparse is None:
        X_sparse = X_full

    X_new = X_sparse
    for feat in feat_list:
        X_new = add_feature(feat, f_df=f_df, X_sparse=X_new)
    return X_new[:idx_split, :], X_new[idx_split:, :]

In [14]:
def test_feature(feat: str, standardize=True, onehot=False, baseline=BASELINE, C=1):
    """Cross-validate the base matrix plus one engineered feature.

    Prints the feature name, the CV score and a ``+++`` / ``---`` marker
    telling whether the score is strictly above ``baseline``; returns the score.
    """
    print(f"Testing:\t{feat}")

    X_candidate = add_feature(feat, standardize=standardize, onehot=onehot)
    score = get_auc_logit_score(X_candidate[:idx_split, :], y_train, C=C)

    verdict = "+++" if score > baseline else "---"
    print(f"Score:\t\t{score:.4f}\t", end="")
    print(verdict)
    return score

In [15]:
def test_multi_feature(feat_list: list, baseline=BASELINE, C=1):
    """Cross-validate the base matrix plus a list of engineered features.

    Prints the feature list, the CV score and a ``+++`` / ``---`` marker
    telling whether the score is strictly above ``baseline``; returns the score.
    """
    print(f"Testing:\t{feat_list}")

    # Only the training rows are needed for cross-validation.
    X_train, _ = add_multi_feature(feat_list, X_sparse=X_full)
    score = get_auc_logit_score(X_train, y_train, C=C)

    verdict = "+++" if score > baseline else "---"
    print(f"Score:\t\t{score:.4f}\t", end="")
    print(verdict)
    return score

    

In [16]:
def predict_probs(feat_list: list, C=1):
    """Fit on the training rows and return positive-class probabilities for the test rows.

    The feature matrix is ``X_full`` extended with every feature in
    ``feat_list``; the model is a liblinear logistic regression with the
    given ``C`` and ``random_state=17``.
    """
    X_train, X_test = add_multi_feature(feat_list, X_sparse=X_full)
    model = LogisticRegression(C=C, random_state=17, solver='liblinear')
    return model.fit(X_train, y_train).predict_proba(X_test)[:, 1]

In [17]:
# previous features: 0/1 indicators for the weekday and the hour of the first request
for weekday in range(7):
    feat_name = f'weekday_{weekday}'
    feat_df[feat_name] = (full_df['time1'].dt.weekday == weekday).astype(int)
# Hours run 0..23, so 24 indicators are needed; range(23) never created hour_23.
for hour in range(24):
    feat_name = f'hour_{hour}'
    feat_df[feat_name] = (full_df['time1'].dt.hour == hour).astype(int)
# Subsets used by later cells (all of these columns are created above).
best_hours = ['hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_14', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22']
best_weekdays = ['weekday_0']
best_previous = best_hours + best_weekdays

In [18]:
# time between requests
# Names for the nine gaps between consecutive timestamp columns.
deltas = ['delta' + str(i) for i in range(1, 10)]
# Subtract from every timestamp column the column to its left, drop 'time1' (it has
# no left neighbour) and express the differences in seconds.
# NOTE(review): assumes `times` is ordered time1..time10 — confirm against the CSV.
delta_df = (full_df[times] - full_df[times].shift(1, axis=1)) \
                                        .copy() \
                                        .drop(columns='time1') \
                                        .apply(lambda x: x.dt.total_seconds())

delta_df.columns = deltas

# Training-row deltas split by label (target == 1 vs. target == 0).
delta_target = delta_df[:idx_split].loc[y_train == 1]
delta_rest = delta_df[:idx_split].loc[y_train == 0]

delta_df.head()

Unnamed: 0,delta1,delta2,delta3,delta4,delta5,delta6,delta7,delta8,delta9
21668,0.0,,,,,,,,
54842,0.0,1784.0,2.0,,,,,,
77291,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
114020,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
146669,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [19]:
%%time
# find optimal threshold for each delta
# Candidate thresholds (seconds), log-spaced from 10**1.69 to 10**3.
threshold_range = np.logspace(1.69, 3, 10)
threshold = {}
best_deltas = []

for feat in deltas:
    scores = []
    for t in threshold_range:
        # Binary indicator: gap longer than t seconds (missing gaps compare False -> 0).
        feat_df[feat] = (delta_df[feat] > t).astype(int)
        print(f"{t:.1f} ", end="")
        scores.append(test_feature(feat))
    scores = np.array(scores)

    # Pick this feature's own best threshold before branching. Previously the
    # "no improvement" branch read best_t before it was assigned (NameError on the
    # first feature) or reused the previous feature's threshold.
    best_t = threshold_range[scores.argmax()]
    feat_df[feat] = (delta_df[feat] > best_t).astype(int)

    # Only features that beat the baseline are recorded as useful.
    if scores.max() > BASELINE:
        threshold[feat] = best_t
        best_deltas.append(feat)

49.0 Testing:	delta1
Score:		0.8683	---
68.5 Testing:	delta1
Score:		0.8684	---
95.7 Testing:	delta1
Score:		0.8685	---
133.9 Testing:	delta1
Score:		0.8687	---
187.2 Testing:	delta1
Score:		0.8692	---
261.7 Testing:	delta1
Score:		0.8690	---
365.9 Testing:	delta1
Score:		0.8691	---
511.6 Testing:	delta1
Score:		0.8693	+++
715.2 Testing:	delta1
Score:		0.8688	---
1000.0 Testing:	delta1
Score:		0.8694	+++
49.0 Testing:	delta2
Score:		0.8684	---
68.5 Testing:	delta2
Score:		0.8682	---
95.7 Testing:	delta2
Score:		0.8684	---
133.9 Testing:	delta2
Score:		0.8692	---
187.2 Testing:	delta2
Score:		0.8690	---
261.7 Testing:	delta2
Score:		0.8692	+++
365.9 Testing:	delta2
Score:		0.8692	---
511.6 Testing:	delta2
Score:		0.8688	---
715.2 Testing:	delta2
Score:		0.8692	---
1000.0 Testing:	delta2
Score:		0.8691	---
49.0 Testing:	delta3
Score:		0.8689	---
68.5 Testing:	delta3
Score:		0.8695	+++
95.7 Testing:	delta3
Score:		0.8697	+++
133.9 Testing:	delta3
Score:		0.8695	+++
187.2 Testing:	delta3
S

In [20]:
# CV score with all selected delta indicators added to the base matrix.
test_multi_feature(best_deltas)

Testing:	['delta1', 'delta2', 'delta3', 'delta4', 'delta5', 'delta6', 'delta7', 'delta8', 'delta9']
Score:		0.8710	+++


0.8709635787781711

In [22]:
# Chosen threshold per delta feature, and the features that beat the baseline.
print(threshold)
print(best_deltas)

{'delta1': 1000.0, 'delta2': 261.6843664428445, 'delta3': 95.74389938243026, 'delta4': 68.47870764059294, 'delta5': 48.97788193684461, 'delta6': 95.74389938243026, 'delta7': 365.87550369900333, 'delta8': 48.97788193684461, 'delta9': 95.74389938243026}
['delta1', 'delta2', 'delta3', 'delta4', 'delta5', 'delta6', 'delta7', 'delta8', 'delta9']


In [25]:
# CV score with the hour/weekday indicators and the delta indicators combined.
test_multi_feature(best_previous + best_deltas)

Testing:	['hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_14', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'weekday_0', 'delta1', 'delta2', 'delta3', 'delta4', 'delta5', 'delta6', 'delta7', 'delta8', 'delta9']
Score:		0.9206	+++


0.9205630918765794

In [24]:
# Fit on the combined feature set and write the test-set probabilities to a submission file.
write_to_submission_file(predict_probs(best_previous + best_deltas), out_file='logit_subm12.csv.zip')