In [1]:
# Import libraries and set desired options
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

# Apply seaborn's default theme to all subsequent matplotlib figures
sns.set()
# Ask the inline backend to render figures in the high-resolution 'retina' format
%config InlineBackend.figure_format = 'retina'

In [2]:
# Directory with the assignment data files (relative path; adjust it if your data lives elsewhere)
PATH_TO_DATA = Path("../../_static/data/assignment4")

In [3]:
# Load the training and test sessions (edit PATH_TO_DATA if the files live elsewhere)
times = ["time%s" % i for i in range(1, 11)]  # timestamp columns time1 ... time10

# Both files share the same layout, so they are read with the same options:
# session_id becomes the index and every time* column is parsed as a datetime
csv_options = {"index_col": "session_id", "parse_dates": times}
train_df = pd.read_csv(PATH_TO_DATA / "train_sessions.csv.zip", **csv_options)
test_df = pd.read_csv(PATH_TO_DATA / "test_sessions.csv.zip", **csv_options)

# Put the training sessions in chronological order of their first timestamp
train_df = train_df.sort_values(by="time1")

# Preview the first two training sessions
train_df.head(2)

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0


In [4]:
# Site-id columns site1 ... site10: missing visits are replaced by 0 and the ids are
# stored as unsigned 16-bit integers (done identically for the training and test frames)
sites = ["site%s" % i for i in range(1, 11)]
for sessions in (train_df, test_df):
    sessions[sites] = sessions[sites].fillna(0).astype(np.uint16)

# Load the pickled websites dictionary
# NOTE(review): pickle.load can execute arbitrary code - only open a trusted site_dic.pkl
with (PATH_TO_DATA / "site_dic.pkl").open("rb") as input_file:
    site_dict = pickle.load(input_file)

# Dataframe view of the dictionary: its values become the index, its keys the "site" column
sites_dict = pd.DataFrame(
    {"site": list(site_dict.keys())}, index=list(site_dict.values())
)
print("Websites total:", len(sites_dict))
sites_dict.head()

Websites total: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [5]:
# Top websites in the training data set: the five most frequent ids over all
# site1 ... site10 cells (value_counts already returns counts in descending order)
top_sites = pd.Series(train_df[sites].values.flatten()).value_counts().head(5)
print(top_sites)

# Id 0 is the filler used for missing visits, not a real website, so it is removed
# before the name lookup. errors="ignore" keeps the lookup working even when 0 does
# not happen to be among the top five (a plain drop(0) would raise KeyError then).
sites_dict.loc[top_sites.drop(0, errors="ignore").index]

21     123776
0      122730
23      87619
782     77055
22      58258
dtype: int64


Unnamed: 0,site
21,www.google.fr
23,www.google.com
782,annotathon.org
22,apis.google.com


In [6]:
# Your code here (read-only in a JupyterBook, please run jupyter-notebook to edit)

In [7]:
# Separate dataframe for working with timestamps. It shares the index of train_df and holds:
#   target  - the session label,
#   min/max - the earliest and latest timestamp of the session (missing timestamps are skipped),
#   seconds - the session duration, i.e. max - min expressed in seconds
time_df = train_df[["target"]].assign(
    **{
        "min": train_df[times].min(axis=1),
        "max": train_df[times].max(axis=1),
        "seconds": lambda df: (df["max"] - df["min"]) / np.timedelta64(1, "s"),
    }
)

time_df.head()

Unnamed: 0_level_0,target,min,max,seconds
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
21669,0,2013-01-12 08:05:57,2013-01-12 08:05:57,0.0
54843,0,2013-01-12 08:37:23,2013-01-12 09:07:09,1786.0
77292,0,2013-01-12 08:50:13,2013-01-12 08:50:17,4.0
114021,0,2013-01-12 08:50:17,2013-01-12 08:50:20,3.0
146670,0,2013-01-12 08:50:20,2013-01-12 08:50:22,2.0


In [8]:
# Your code here (read-only in a JupyterBook, please run jupyter-notebook to edit)

In [9]:
# Labels of the training sessions
y_train = train_df["target"]

# Number of training rows = position at which the test rows start in the united frame
idx_split = len(train_df)

# United dataframe: training features (label column removed) stacked on top of the test set
full_df = pd.concat([train_df.drop(columns="target"), test_df])

In [10]:
# Only the site-id columns of the united (train + test) dataframe
full_sites = full_df.loc[:, sites]
full_sites.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947


In [11]:
# Sequence of site ids: the session table flattened row by row
sites_flatten = full_sites.values.flatten()

# Number of site slots per session, read from the data instead of hard-coding 10
session_len = full_sites.shape[1]

# The matrix we are looking for, built with the csr_matrix((data, indices, indptr)) constructor
# (make sure you understand this constructor; a further toy example will help you with it):
#   data    - a one for every visited slot,
#   indices - the site id of each slot, used as its column number,
#   indptr  - row boundaries: every session owns `session_len` consecutive entries.
# NumPy arrays are used for data/indptr so no multi-million-element Python list is built.
# The trailing [:, 1:] drops column 0, the filler id used for missing visits.
full_sites_sparse = csr_matrix(
    (
        np.ones(sites_flatten.shape[0], dtype=int),
        sites_flatten,
        np.arange(0, sites_flatten.shape[0] + session_len, session_len),
    )
)[:, 1:]

In [12]:
# (number of sessions in train + test, number of site-id columns left after dropping column 0)
full_sites_sparse.shape

(336358, 48371)

In [13]:
# How much memory do the stored values of the sparse matrix occupy?
# Only the `data` array is measured; the `indices` and `indptr` arrays are not included.
n_stored = full_sites_sparse.count_nonzero()
# Size of one stored value, read from the array's dtype rather than assumed to be 8
bytes_per_value = full_sites_sparse.data.itemsize
print(
    "{} elements * {} bytes = {} bytes".format(
        n_stored, bytes_per_value, n_stored * bytes_per_value
    )
)
# Or just like this:
print("sparse_matrix_size = {} bytes".format(full_sites_sparse.data.nbytes))

1866898 elements * 8 bytes = 14935184 bytes
sparse_matrix_size = 14935184 bytes


In [14]:
# Toy example of the csr_matrix((data, indices, indptr)) constructor.

# indptr: boundaries that cut the flat sequences below into rows (sessions).
# Row i owns the elements in the half-open range [indptr[i]; indptr[i + 1]):
#   row 0 -> [0; 3), row 1 -> [3; 6), row 2 -> [6; 9)
indptr = [0, 3, 6, 9]

# indices: the column (website id) of every element, listed row after row
indices = [1, 0, 0, 1, 3, 1, 2, 3, 4]

# data: a one for each of the 9 elements. Ones that land in the same cell are
# summed, which gives the number of visits to a website within a session.
data = [1] * len(indices)

# Compose the matrix from the three sequences and convert it to the usual
# "dense" form so that it can be displayed
toy_sparse = csr_matrix((data, indices, indptr))
toy_sparse.todense()

matrix([[2, 1, 0, 0, 0],
        [0, 2, 0, 1, 0],
        [0, 0, 1, 1, 1]])

In [15]:
# Your code here (read-only in a JupyterBook, please run jupyter-notebook to edit)

In [16]:
def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio=0.9):
    """Train logistic regression on the head of the data and score it on the tail.

    The rows are split by position, without shuffling: the first ``ratio`` share
    is used for fitting and the remaining rows form the validation set.

    Parameters
    ----------
    X : feature matrix supporting ``X[a:b, :]`` row slicing
    y : labels, sliceable with ``y[a:b]`` and aligned with the rows of ``X``
    C : regularization parameter passed to ``LogisticRegression``
    seed : value passed as ``random_state`` to ``LogisticRegression``
    ratio : share of the rows used for training

    Returns
    -------
    ROC AUC computed on the validation rows from the probabilities in
    column 1 of ``predict_proba``.
    """
    # Position of the first validation row
    n_train = int(round(X.shape[0] * ratio))
    X_fit, X_valid = X[:n_train, :], X[n_train:, :]
    y_fit, y_valid = y[:n_train], y[n_train:]

    # Fit the classifier on the training part only
    model = LogisticRegression(C=C, random_state=seed, solver="liblinear")
    model.fit(X_fit, y_fit)

    # Score the validation part
    valid_proba = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, valid_proba)

In [17]:
%%time
# Select the training set from the united dataframe (where we have the answers)
X_train = full_sites_sparse[:idx_split, :]

# Calculate metric on the validation set
print(get_auc_lr_valid(X_train, y_train))

0.9195230491186374
CPU times: user 13.3 s, sys: 2.02 s, total: 15.3 s
Wall time: 2.71 s


In [18]:
# Function for writing predictions to a file
def write_to_submission_file(
    predicted_labels, out_file, target="target", index_label="session_id"
):
    """Write predictions to a CSV file in the submission format.

    Parameters
    ----------
    predicted_labels : array-like of predictions (ndarray, list, tuple, Series, ...);
        values are taken by position, any index they carry is ignored
    out_file : path or writable text buffer handed to ``DataFrame.to_csv``
    target : name of the column holding the predictions
    index_label : header of the index column

    The rows are numbered 1, 2, ..., n in the index column.
    """
    # Convert first so that inputs without a ``.shape`` attribute (e.g. plain lists) work too
    predictions = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(
        predictions,
        index=np.arange(1, predictions.shape[0] + 1),
        columns=[target],
    )
    predicted_df.to_csv(out_file, index_label=index_label)

In [19]:
# Fit the model on the whole training data set.
# random_state=17 keeps the run repeatable; C=1 is the default but is set explicitly.
lr = LogisticRegression(C=1.0, random_state=17, solver="liblinear")
lr.fit(X_train, y_train)

# Test rows of the sparse matrix (everything after the split position)
X_test = full_sites_sparse[idx_split:]
# Predicted probabilities taken from column 1 of predict_proba
y_test = lr.predict_proba(X_test)[:, 1]

# Save the predictions to a file which could be submitted
write_to_submission_file(y_test, "baseline_1.csv")

In [20]:
# Dataframe for new features, indexed like the united (train + test) dataframe
full_new_feat = pd.DataFrame(index=full_df.index)

# Add start_month feature: year and month of the first timestamp encoded as YYYYMM
# (e.g. January 2013 -> 201301.0). The vectorized .dt accessors compute the same
# values as a per-row lambda without calling Python code for every session.
full_new_feat["start_month"] = (
    100 * full_df["time1"].dt.year + full_df["time1"].dt.month
).astype("float64")

In [21]:
# Your code here (read-only in a JupyterBook, please run jupyter-notebook to edit)

In [22]:
# Append the raw (unscaled) start_month column to the training rows of the sparse site matrix
tmp = full_new_feat[["start_month"]].values
train_blocks = [full_sites_sparse[:idx_split, :], tmp[:idx_split, :]]
X_train = hstack(train_blocks).tocsr()

# Metric on the validation set
print(get_auc_lr_valid(X_train, y_train))

0.7508354860175162


In [23]:
# Same feature, but standardized first (the scaler is fitted on all rows of full_new_feat)
tmp = StandardScaler().fit_transform(full_new_feat[["start_month"]])
train_blocks = [full_sites_sparse[:idx_split, :], tmp[:idx_split, :]]
X_train = hstack(train_blocks).tocsr()

# Metric on the validation set
print(get_auc_lr_valid(X_train, y_train))

0.9196993699549294


In [24]:
# Your code here (read-only in a JupyterBook, please run jupyter-notebook to edit)

In [25]:
# Your code here (read-only in a JupyterBook, please run jupyter-notebook to edit)
# Placeholders: both columns are constant zeros until they are replaced with real values
full_new_feat["morning"] = 0  # change this
full_new_feat["start_hour"] = 0  # change this

In [26]:
# Standardize the three extra features (the scaler is fitted on every row of full_new_feat)
extra_feature_names = ["start_month", "start_hour", "morning"]
tmp_scaled = StandardScaler().fit_transform(full_new_feat[extra_feature_names])

# Training set: sparse site matrix followed by the scaled extra features
train_blocks = [full_sites_sparse[:idx_split, :], tmp_scaled[:idx_split, :]]
X_train = hstack(train_blocks).tocsr()

# Validation quality with the default regularization parameter
score_C_1 = get_auc_lr_valid(X_train, y_train)
print(score_C_1)

0.9196984641972088


In [27]:
# List of possible C-values: 10 points evenly spaced on a log scale from 1e-3 to 1e1
Cs = np.logspace(-3, 1, 10)

# Your code here (read-only in a JupyterBook, please run jupyter-notebook to edit)

In [28]:
# Your code here (read-only in a JupyterBook, please run jupyter-notebook to edit)

In [29]:
# Your code here (read-only in a JupyterBook, please run jupyter-notebook to edit)
# Placeholder for the regularization parameter passed to the final LogisticRegression
C = 1  # change this

In [30]:
# Scale the extra features once, over the training and test rows together
tmp_scaled = StandardScaler().fit_transform(
    full_new_feat[["start_month", "start_hour", "morning"]]
)
train_extra, test_extra = tmp_scaled[:idx_split, :], tmp_scaled[idx_split:, :]

# Training and test matrices: sparse site matrix followed by the scaled extra features
X_train = hstack([full_sites_sparse[:idx_split, :], train_extra]).tocsr()
X_test = hstack([full_sites_sparse[idx_split:, :], test_extra]).tocsr()

# Fit on the whole training data set with the regularization parameter C
lr = LogisticRegression(C=C, random_state=17, solver="liblinear")
lr.fit(X_train, y_train)

# Predicted probabilities for the test set (column 1 of predict_proba)
y_test = lr.predict_proba(X_test)[:, 1]

# Save them to the submission file
write_to_submission_file(y_test, "baseline_2.csv")