# Read train and test dasets

Write function for data download and processing

In [2]:
import pandas as pd
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

pd.set_option('max_columns', 100)

def prepare_sparse_features(path_to_train, path_to_test, path_to_site_dict,
                           vectorizer_params):
    times = ['time%s' % i for i in range(1, 11)]
    train_df = pd.read_csv(path_to_train,
                       index_col='session_id', parse_dates=times)
    test_df = pd.read_csv(path_to_test,
                      index_col='session_id', parse_dates=times)

    # Sort the data by time
    train_df = train_df.sort_values(by='time1')
    
    # read site -> id mapping provided by competition organizers 
    with open(path_to_site_dict, 'rb') as f:
        site2id = pickle.load(f)
    # create an inverse id _> site mapping
    id2site = {v:k for (k, v) in site2id.items()}
    # we treat site with id 0 as "unknown"
    id2site[0] = 'unknown'
    
    # Transform data into format which can be fed into TfidfVectorizer
    # This time we prefer to represent sessions with site names, not site ids. 
    # It's less efficient but thus it'll be more convenient to interpret model weights.
    sites = ['site%s' % i for i in range(1, 11)]
    train_sessions = train_df[sites].fillna(0).astype('int').apply(lambda row: 
                                                     ' '.join([id2site[i] for i in row]), axis=1).tolist()
    test_sessions = test_df[sites].fillna(0).astype('int').apply(lambda row: 
                                                     ' '.join([id2site[i] for i in row]), axis=1).tolist()
    # we'll tell TfidfVectorizer that we'd like to split data by whitespaces only 
    # so that it doesn't split by dots (we wouldn't like to have 'mail.google.com' 
    # to be split into 'mail', 'google' and 'com')
    vectorizer = TfidfVectorizer(**vectorizer_params)
    X_train = vectorizer.fit_transform(train_sessions)
    X_test = vectorizer.transform(test_sessions)
    y_train = train_df['target'].astype('int').values
    
    # we'll need site visit times for further feature engineering
    train_times, test_times = train_df[times], test_df[times]
    
    return X_train, X_test, y_train, vectorizer, train_times, test_times

Download and process data

In [5]:
%%time
import os

PATH_TO_DATA = ''
SEED = 88

X_train_sites, X_test_sites, y_train, vectorizer, train_times, test_times = prepare_sparse_features(
    path_to_train=os.path.join(PATH_TO_DATA, 'train_sessions.csv'),
    path_to_test=os.path.join(PATH_TO_DATA, 'test_sessions.csv'),
    path_to_site_dict=os.path.join(PATH_TO_DATA, 'site_dic.pkl'),
    vectorizer_params={'ngram_range': (1, 5), 
                       'max_features': 50000,
                       'tokenizer': lambda s: s.split()}
)

Wall time: 1min


# Train a Logistic Regression model

In [13]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LogisticRegression

tss = TimeSeriesSplit(n_splits=10)
lr = LogisticRegression(C=1, random_state=SEED, solver='liblinear')
cv1 = cross_val_score(lr, X_train_sites, y_train, cv=tss, n_jobs=7, scoring='roc_auc')
np.mean(cv1), np.std(cv1)

(0.8625366936681628, 0.07455710081058183)

# Predict test target values

In [9]:
lr.fit(X_train, y_train)
prediction = lr.predict_proba(X_test)[:,1]
prediction = pd.Series(prediction)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Write prediction to a file

In [10]:
def write_submission_to_file(filename, prediction, test_df):
    submission = pd.DataFrame()
    submission['session_id'] = test_df['session_id'].copy()
    submission['target'] = prediction
    submission.to_csv(filename, index=None)
    
write_submission_to_file('submission.csv', prediction, test_df)

We have got a 0.90744 score on Kaggle. Let's impove it by adding time features.

# Add new features

* hour of session start
* part of day: morning, noon, evening, night
* number of visited sites (not unique) in session 
* length of session in seconds
* average time of site visit in seconds

# Add date and hour of session start

In [11]:
from sklearn.preprocessing import scale

train_df['start_month'] = scale(train_df['time1'].map(lambda x: 100*x.year + x.month).astype(int))
test_df['start_month'] = scale(test_df['time1'].map(lambda x: 100*x.year + x.month).astype(int))

train_df['start_hour'] = train_df['time1'].dt.hour.astype(int)
test_df['start_hour'] = test_df['time1'].dt.hour.astype(int)

# Add part of day

In [12]:
def determine_part_of_day(value):
    if value < 11:
        return 1
    else:
        return 0
    
train_df['part_of_day'] = train_df['start_hour'].apply(determine_part_of_day)
test_df['part_of_day'] = test_df['start_hour'].apply(determine_part_of_day)

# Determine last visit time for each session

In [13]:
null_time = pd.to_datetime(0)

def determine_last_visit_time(row):
    last_visit = null_time
    for value in row:
        if value == null_time:
            return last_visit
        else:
            last_visit = value
    return last_visit

train_df['last_visit'] = train_df[time_columns].apply(determine_last_visit_time, axis=1)
test_df['last_visit'] = test_df[time_columns].apply(determine_last_visit_time, axis=1)

# Add length of session in seconds

In [48]:
train_df['session_length'] = scale((train_df['last_visit'] - train_df['time1']).dt.total_seconds().astype(int))
test_df['session_length'] = scale((test_df['last_visit'] - test_df['time1']).dt.total_seconds().astype(int))

# Add num of visited sites

In [30]:
import numpy as np

train_df['num_of_sites'] = train_df[site_columns].apply(np.count_nonzero, axis=1)
test_df['num_of_sites'] = test_df[site_columns].apply(np.count_nonzero, axis=1)

# Add full session (10 sites visited) flag

In [33]:
train_df['full_session'] = train_df['num_of_sites'].apply(lambda x: 1 if x == 10 else 0)
test_df['full_session'] = train_df['num_of_sites'].apply(lambda x: 1 if x == 10 else 0)

# Add new feature columns to  train and test dataset

In [37]:
from scipy.sparse import hstack
from sklearn.preprocessing import scale

train_df['start_hour'] = pd.Categorical(train_df['start_hour'])
train_dummy = pd.get_dummies(train_df[['start_hour']])
train_dummy = pd.concat([train_dummy, train_df[['session_length', 'full_session', 
                                                'part_of_day', 'start_month']]], axis=1)

test_df['start_hour'] = pd.Categorical(test_df['start_hour'])
test_dummy = pd.get_dummies(test_df[['start_hour']])
test_dummy = pd.concat([test_dummy, test_df[['session_length', 'full_session', 
                                             'part_of_day', 'start_month']]], axis=1)

# Create new train and test datasets with additional features

In [38]:
X_train_1 = hstack([X_train, train_dummy])
X_test_1 = hstack([X_test, test_dummy])
X_train_1.shape, X_test_1.shape

((253561, 48392), (82797, 48392))

# Train Logistic Regression model on new data

In [43]:
from sklearn.model_selection import TimeSeriesSplit

lr = LogisticRegression(max_iter=100, C=1, random_state=88)
tss = TimeSeriesSplit(n_splits=5)
cv = cross_val_score(lr, X_train_1, y_train, cv=tss, n_jobs=7, scoring='roc_auc')
np.mean(cv), np.std(cv)

(0.9816438826419759, 0.0044373698543822325)

# Predict test target values

In [49]:
lr = LogisticRegression(max_iter=100, C=1, random_state=88)
lr.fit(X_train_1, y_train)
prediction_1 = lr.predict_proba(X_test_1)[:,1]
prediction_1 = pd.Series(prediction_1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Write prediction to a file

In [46]:
write_submission_to_file('submission_1.csv', prediction_1, test_df) # 0.93954

# Change features

# Search for the best hyperparameters of model

In [47]:
from sklearn.model_selection import RandomForestClassifier

C_list = np.logspace(-3, 1, 10)
param_grid_lr = {'C': C_list}

lr = LogisticRegression(max_iter=100, random_state=88)
grid_lr = GridSearchCV(lr, param_grid_lr, return_train_score=True, cv=tss, n_jobs=7, scoring = 'roc_auc', verbose=10)
grid_lr.fit(X_train_1, y_train)
print((grid_lr.best_params_, grid_lr.best_score_))

NameError: name 'GridSearchCV' is not defined

# Predict test target values with optimized hyperparameters

In [149]:
lr = LogisticRegression(max_iter=1000, C=1.29, random_state=88, solver='liblinear')
lr.fit(X_train_1, y_train)
prediction_2 = lr.predict_proba(X_test_1)[:,1]
prediction_2 = pd.Series(prediction_1)

# Write prediction to a file

In [150]:
write_submission_to_file('submission_2.csv', prediction_2, test_df) # 0.93891

# Search for a best Hyperparameters of Random Forest model

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# criterion_list = ['gini', 'entropy']
# min_samples_split_list = [2, 4, 6, 8, 10]
# class_weight_list = ['balanced', {0:1, 1:50}, {0:1, 1:75}, {0:1, 1:100}, None]
# param_grid_rf = {'criterion': criterion_list, 'min_samples_split': min_samples_split_list, 
#                  'class_weight': class_weight_list}

# rf = RandomForestClassifier()
# grid_rf = GridSearchCV(rf, param_grid_rf, return_train_score=True, cv=4, n_jobs=7, scoring = 'roc_auc', verbose=10)
# grid_rf.fit(X, y)
# print((grid_rf.best_params_, grid_rf.best_score_))

# Additional Hyperparameter optimization for Random Forest model

Increase number of estimators

In [None]:
# rf = RandomForestClassifier(class_weight = None, min_samples_split=10, n_estimators=1000, criterion='entropy', random_state=17)

# cv = cross_val_score(rf, X, y, scoring='roc_auc', cv=4, n_jobs=7, verbose=10)
# np.mean(cv)

Additional Grid Search for hyperparameter *n_estimators*=1000

In [None]:
# min_samples_split_list = [9, 10, 11]
# class_weight_list = ['balanced', None, {0:1, 1:75}]
# param_grid_rf = {'min_samples_split': min_samples_split_list, 'class_weight': class_weight_list}

# rf = RandomForestClassifier(n_estimators=1000, criterion='entropy')
# grid_rf = GridSearchCV(rf, param_grid_rf, return_train_score=True, cv=4, n_jobs=7, scoring = 'roc_auc', verbose=10)
# grid_rf.fit(X, y)
# print((grid_rf.best_params_, grid_rf.best_score_))

# Predict test target with optimized Random Forest