In [70]:
# Import libraries and set desired options
import pickle
from pathlib import Path

import os
import glob
from zipfile import ZipFile

import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt

from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

import scipy.stats as st

In [2]:
PATH_TO_DATA = Path("../../_static/data/assignment4")


DATA_FOLDER = "../../data/"
DATA_ZIP_FILE = "catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2.zip"
TEMP_FOLDER = "temp/"

In [3]:
# loading the data.zip and creating a zip object 
with ZipFile(
    os.path.join(DATA_FOLDER, DATA_ZIP_FILE),
    "r"
) as zObject: 
  
    # Extracting all the members of the zip  
    # into a specific temporary location
    zObject.extractall( 
        path=os.path.join(DATA_FOLDER, TEMP_FOLDER)
    ) 

In [4]:
# Listing the files in temporary folder
files = [
    os.path.basename(file).split("data/")[0] for file in glob.glob(
        os.path.join(
            DATA_FOLDER,
            TEMP_FOLDER,
            "*"
        )
    )
]

In [5]:
# Read the training and test data sets, change paths if needed
times = ["time%s" % i for i in range(1, 11)]
sites = ["site%s" % i for i in range(1, 11)]

# customize the paths if needed
train_df = pd.read_csv(
    DATA_FOLDER + TEMP_FOLDER + 'train_sessions.csv',
    index_col="session_id",
    parse_dates=times
)

test_df = pd.read_csv(
    DATA_FOLDER + TEMP_FOLDER + 'test_sessions.csv', 
    index_col="session_id",
    parse_dates=times
)

# Sort the data by time
train_df = train_df.sort_values(by=times[0])

# Look at the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [6]:
# Load websites dictionary
with open(DATA_FOLDER + TEMP_FOLDER + "site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# Create dataframe for the dictionary
df_sites = pd.DataFrame(
    list(site_dict.keys()), index=list(site_dict.values()), columns=["site"]
)
df_sites.head()

Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [7]:
# Deleting temporary files
for file in files:
    try:
        os.remove(os.path.join(DATA_FOLDER, TEMP_FOLDER, file))
    except:
        print(f"{file} does not exist")

# Deleting temporary folder
try:
    os.rmdir(os.path.join(DATA_FOLDER, TEMP_FOLDER))
except:
    print("Folder does not exist")

The training data set contains the following features:

- **site1** – id of the first visited website in the session
- **time1** – visiting time for the first website in the session
- ...
- **site10** – id of the tenth visited website in the session
- **time10** – visiting time for the tenth website in the session
- **target** – target variable, 1 for Alice's sessions, and 0 for the other users' sessions
    
User sessions are chosen in the way that they are shorter than 30 min. long and contain no more than 10 websites. I.e. a session is considered over either if a user has visited 10 websites or if a session has lasted over 30 minutes.

There are some empty values in the table, it means that some sessions contain less than ten websites. Replace empty values with 0 and change columns types to integer. Also load the websites dictionary and check how it looks like:

In [8]:
# Change site1, ..., site10 columns type to integer and fill NA-values with zeros
train_df[sites] = train_df[sites].fillna(0).astype(np.uint16)
test_df[sites] = test_df[sites].fillna(0).astype(np.uint16)

print(u"Websites total:", df_sites.shape[0])
df_sites.head()

Websites total: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [9]:
id2site = {v:k for (k, v) in site_dict.items()}
id2site[0] = 'unkown'

In [10]:
y_train = train_df["target"]
X_train = train_df.drop(columns='target')
X_test = test_df[sites]

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_params={
    'ngram_range': (1, 5), 
    'max_features': 50000,
    #'tokenizer': lambda s: s.split()
}

vectorizer = TfidfVectorizer(**vectorizer_params)
vectorizer.fit_transform(X_train)

<20x20 sparse matrix of type '<class 'numpy.float64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [12]:
X_train_ = X_train[sites].fillna(0).astype('int').apply(lambda row: ' '.join([id2site[i] for i in row]), axis=1).to_list()

vectorizer_params={
    'ngram_range': (1, 5), 
    'max_features': 50000,
    'token_pattern': None,
    'tokenizer': lambda s: s.split()
}

vectorizer = TfidfVectorizer(**vectorizer_params)
X_train_ = vectorizer.fit_transform(X_train_)
X_train_.shape

(253561, 50000)

In [13]:
X_train_

<253561x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 4112773 stored elements in Compressed Sparse Row format>

In [14]:
vectorizer.get_feature_names_out()[10000:10010]

array(['de-img3.ask.fm', 'de-img4.ask.fm', 'de.tynt.com',
       'de.tynt.com api.adyoulike.com',
       'de.tynt.com api.adyoulike.com syndication.twitter.com',
       'de.tynt.com api.adyoulike.com twitter.com',
       'de.tynt.com platform.twitter.com',
       'de.tynt.com platform.twitter.com twitter.com',
       'de.tynt.com premium.hi-mediaserver.com',
       'de.tynt.com syndication.twitter.com'], dtype=object)

In [15]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
time_split = TimeSeriesSplit(n_splits=10)

In [16]:
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')
cv_scores = cross_val_score(
    logit,
    X_train[sites],
    y_train,
    cv=time_split,
    scoring='roc_auc',
    n_jobs=-1
)



In [17]:
cv_scores, cv_scores.mean()

(array([0.55300658, 0.37896032, 0.38159371, 0.33619252, 0.39875617,
        0.47387654, 0.39927177, 0.4960409 , 0.34637032, 0.49812401]),
 0.4262192848501769)

In [18]:
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')
cv_scores = cross_val_score(
    logit,
    X_train_,
    y_train,
    cv=time_split,
    scoring='roc_auc',
    n_jobs=-1
)

In [19]:
cv_scores, cv_scores.mean()

(array([0.83124023, 0.65993466, 0.85673565, 0.92824237, 0.84779639,
        0.88954524, 0.88829128, 0.87710523, 0.92023038, 0.92624125]),
 0.8625362694151277)

In [20]:
b = [0, 6, 11, 18, 24]
l = ['Night', 'Morning','Day','Evening']

time = X_train['time1'].apply(lambda row: row.hour)
time = pd.cut(time, bins=b, labels=l, include_lowest=True)
time = pd.get_dummies(time).astype('int').values
time

array([[0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       ...,
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1]])

In [21]:
X_train__ = hstack([X_train_, time])

In [22]:
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')
cv_scores = cross_val_score(
    logit,
    X_train__,
    y_train,
    cv=time_split,
    scoring='roc_auc',
    n_jobs=-1
)

cv_scores, cv_scores.mean()

(array([0.88170019, 0.81426901, 0.91861447, 0.96171465, 0.91532998,
        0.95227476, 0.92804771, 0.94016178, 0.94665449, 0.95277436]),
 0.921154139427846)

In [71]:
c_values

array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])

In [80]:
c_values = np.logspace(-2, 2, 10)

logit_grid_searcher = GridSearchCV(
    estimator=logit,
    param_grid={'C': c_values},
    scoring='roc_auc',
    n_jobs=-1,
    cv=time_split,
)

logit_grid_searcher.fit(X_train__, y_train)

logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

KeyboardInterrupt: 

In [None]:
np.round(y_train.sum() / y_train.shape[0] * 100, 2)

0.91

In [104]:
def pr_auc(y_true, y_pred):
    from sklearn.metrics import precision_recall_curve
    from sklearn.metrics import auc
    
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    auc_score = auc(recall, precision)
    
    return auc_score

In [105]:
logit = LogisticRegression(
    random_state=17,
    solver='lbfgs',
    max_iter=500
)

time_cv = TimeSeriesSplit(n_splits=5)

In [118]:
%%capture
from sklearn.metrics import make_scorer
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

sfs = SFS(
    logit,
    k_features='best',
    forward=True,
    floating=True,
    verbose=0,
    scoring='f1_micro',
    cv=time_cv,
    n_jobs=-1,
)

X_train_ = sfs.fit_transform(X_train[sites].fillna(0), y_train)



In [119]:
sfs.k_feature_names_

('site1',)

In [120]:
sfs.k_score_

0.9923615712257454

In [121]:
X_train_1 = pd.DataFrame(X_train_).apply(lambda row: ' '.join([id2site[i] for i in row]), axis=1).to_list()

vectorizer_params={
    'ngram_range': (1, 3), 
    'max_features': 50000,
    'token_pattern': None,
    'tokenizer': lambda s: s.split()
}

vectorizer = TfidfVectorizer(**vectorizer_params)
X_train_2 = vectorizer.fit_transform(X_train_1)
X_train_2.shape

(253561, 15765)

In [122]:
b = [0, 6, 11, 18, 24]
l = ['Night', 'Morning','Day','Evening']

time = X_train['time1'].apply(lambda row: row.hour)
time = pd.cut(time, bins=b, labels=l, include_lowest=True)
time = pd.get_dummies(time).astype('int').values

X_train_3 = hstack([X_train_2, time])
X_train_3.shape

(253561, 15769)

In [123]:
c_values = np.logspace(-2, 2, 10)

logit_grid_searcher = RandomizedSearchCV(
    estimator=logit,
    param_distributions={'C': c_values},
    scoring='f1_micro',
    n_jobs=-1,
    cv=time_split,
    random_state=17,
)

logit_grid_searcher.fit(X_train_3, y_train)

logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9921001258079911, {'C': 0.01})