# Function for data download and processing

In [1]:
import pandas as pd
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

pd.set_option('max_columns', 100)

def prepare_sparse_features(path_to_train, path_to_test, path_to_site_dict,
                           vectorizer_params):
    times = ['time%s' % i for i in range(1, 11)]
    train_df = pd.read_csv(path_to_train,
                       index_col='session_id', parse_dates=times)
    test_df = pd.read_csv(path_to_test,
                      index_col='session_id', parse_dates=times)

    # Sort the data by time
    train_df = train_df.sort_values(by='time1')
    
    # read site -> id mapping provided by competition organizers 
    with open(path_to_site_dict, 'rb') as f:
        site2id = pickle.load(f)
    # create an inverse id _> site mapping
    id2site = {v:k for (k, v) in site2id.items()}
    # we treat site with id 0 as "unknown"
    id2site[0] = 'unknown'
    
    # Transform data into format which can be fed into TfidfVectorizer
    # This time we prefer to represent sessions with site names, not site ids. 
    # It's less efficient but thus it'll be more convenient to interpret model weights.
    sites = ['site%s' % i for i in range(1, 11)]
    train_sessions = train_df[sites].fillna(0).astype('int').apply(lambda row: 
                                                     ' '.join([id2site[i] for i in row]), axis=1).tolist()
    test_sessions = test_df[sites].fillna(0).astype('int').apply(lambda row: 
                                                     ' '.join([id2site[i] for i in row]), axis=1).tolist()
    # we'll tell TfidfVectorizer that we'd like to split data by whitespaces only 
    # so that it doesn't split by dots (we wouldn't like to have 'mail.google.com' 
    # to be split into 'mail', 'google' and 'com')
    vectorizer = TfidfVectorizer(**vectorizer_params)
    X_train = vectorizer.fit_transform(train_sessions)
    X_test = vectorizer.transform(test_sessions)
    y_train = train_df['target'].astype('int').values
    
    # we'll need site visit times for further feature engineering
    train_times, test_times = train_df[times], test_df[times]
    
    return X_train, X_test, y_train, vectorizer, train_times, test_times

# Download and process data

In [2]:
%%time
import os

PATH_TO_DATA = ''
SEED = 88

X_train_sites, X_test_sites, y_train, vectorizer, train_times, test_times = prepare_sparse_features(
    path_to_train=os.path.join(PATH_TO_DATA, 'train_sessions.csv'),
    path_to_test=os.path.join(PATH_TO_DATA, 'test_sessions.csv'),
    path_to_site_dict=os.path.join(PATH_TO_DATA, 'site_dic.pkl'),
    vectorizer_params={'ngram_range': (1, 5), 
                       'max_features': 50000,
                       'tokenizer': lambda s: s.split()}
)

CPU times: user 17.8 s, sys: 635 ms, total: 18.4 s
Wall time: 19 s


# Train a Logistic Regression model

In [3]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LogisticRegression

tss = TimeSeriesSplit(n_splits=10)
lr = LogisticRegression(C=1, random_state=SEED, solver='liblinear')
cv1 = cross_val_score(lr, X_train_sites, y_train, cv=tss, n_jobs=7, scoring='roc_auc')
np.mean(cv1), np.std(cv1)

(0.8625362694151277, 0.07455672437604233)

# Fit the model

In [9]:
lr.fit(X_train_sites, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=88, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

# Display model weights with eli5

In [10]:
import eli5

eli5.show_weights(estimator=lr, 
                  feature_names=vectorizer.get_feature_names(), top=30)

Weight?,Feature
+5.880,youwatch.org
+5.380,cid-ed6c3e6a5c6608a4.users.storage.live.com
+5.222,fr.glee.wikia.com
+5.114,vk.com
+4.875,www.info-jeunes.net
+4.499,www.banque-chalus.fr
+4.220,www.express.co.uk
+4.147,www.audienceinsights.net
+4.089,www.melty.fr
+4.003,glee.hypnoweb.net


# Functions for automatization of model test and writing submission to file

In [17]:
from IPython.display import display_html

def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)


def train_and_predict(model, X_train, y_train, X_test, site_feature_names=vectorizer.get_feature_names(), 
                      new_feature_names=None, cv=tss, scoring='roc_auc',
                      top_n_features_to_show=30, submission_file_name='submission.csv'):
    
    
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, 
                            scoring=scoring, n_jobs=4)
    print('CV scores', cv_scores)
    print('CV mean: {}, CV std: {}'.format(cv_scores.mean(), cv_scores.std()))
    model.fit(X_train, y_train)
    
    if new_feature_names:
        all_feature_names = site_feature_names + new_feature_names 
    else: 
        all_feature_names = site_feature_names
    
    display_html(eli5.show_weights(estimator=model, 
                  feature_names=all_feature_names, top=top_n_features_to_show))
    
    if new_feature_names:
        print('New feature weights:')
    
        print(pd.DataFrame({'feature': new_feature_names, 
                        'coef': model.coef_.flatten()[-len(new_feature_names):]}))
    
    test_pred = model.predict_proba(X_test)[:, 1]
    write_to_submission_file(test_pred, submission_file_name) 
    
    return cv_scores

cv_scores1 = train_and_predict(model=lr, X_train=X_train_sites, y_train=y_train, 
                               X_test=X_test_sites, site_feature_names=vectorizer.get_feature_names(),              
                               cv=tss, submission_file_name='subm1.csv')

CV scores [0.83124023 0.65993466 0.85673565 0.92824237 0.84779639 0.88954524
 0.88829128 0.87710535 0.92023038 0.92624209]
CV mean: 0.8625363644145931, CV std: 0.07455679771627693


Weight?,Feature
+5.880,youwatch.org
+5.380,cid-ed6c3e6a5c6608a4.users.storage.live.com
+5.222,fr.glee.wikia.com
+5.114,vk.com
+4.875,www.info-jeunes.net
+4.499,www.banque-chalus.fr
+4.220,www.express.co.uk
+4.147,www.audienceinsights.net
+4.089,www.melty.fr
+4.003,glee.hypnoweb.net


We have got a 0.90744 score on Kaggle. Let's impove it by adding time features.

# Add new features

* hour of session start
* part of day: morning, noon, evening, night
* number of visited sites (not unique) in session 
* length of session in seconds
* average time of site visit in seconds

# Add date and hour of session start

In [70]:
from sklearn.preprocessing import scale

train_times['start_hour'] = train_times['time1'].dt.hour.astype(int)
train_times['start_hour'] = pd.Categorical(train_times['start_hour'])
test_times['start_hour'] = test_times['time1'].dt.hour.astype(int)
test_times['start_hour'] = pd.Categorical(test_times['start_hour'])

# Determine last visit time for each session

In [36]:
null_time = pd.to_datetime(0)
times = ['time%s' % i for i in range(1, 11)]

def determine_last_visit_time(row):
    last_visit = null_time
    for value in row:
        if pd.isnull(value):
            return last_visit
        else:
            last_visit = value
    return last_visit

train_times['last_visit'] = train_times[times].apply(determine_last_visit_time, axis=1)
test_times['last_visit'] = test_times[times].apply(determine_last_visit_time, axis=1)

# Add part of day

In [84]:
def determine_part_of_day(hour):
    if hour >= 7 and hour <= 11:
        return 'morning'
    elif hour >= 12 and hour <= 18:
        return 'day'
    elif hour >= 18 and hour <= 23:
        return 'evening'
    else:
        return 'night'
    
train_times['part_of_day'] = train_times['start_hour'].apply(determine_part_of_day)
train_times['part_of_day'] = pd.Categorical(train_times['part_of_day'])
test_times['part_of_day'] = test_times['start_hour'].apply(determine_part_of_day)
train_times['part_of_day'] = pd.Categorical(train_times['part_of_day'])

# Add length of session in seconds

In [40]:
train_times['session_length'] = scale((train_times['last_visit'] - train_times['time1']).dt.total_seconds().astype(int))
test_times['session_length'] = scale((test_times['last_visit'] - test_times['time1']).dt.total_seconds().astype(int))

# Add num of visited sites

In [60]:
import numpy as np

train_times['num_of_sites'] = train_times[times].count(axis=1)
test_times['num_of_sites'] = test_times[times].count()

# Add full session (10 sites visited) flag

In [64]:
train_times['full_session'] = train_times['num_of_sites'].apply(lambda x: 1 if x == 10 else 0)
test_times['full_session'] = test_times['num_of_sites'].apply(lambda x: 1 if x == 10 else 0)

# Add new feature columns to  train and test dataset

In [72]:
from scipy.sparse import hstack
from sklearn.preprocessing import scale

def add_new_features(df, features, cat_features=None):
    new_df = pd.DataFrame()
    if cat_features:
        new_df = pd.concat([new_df, pd.get_dummies(df[cat_features])], axis=1)
    new_df = pd.concat([new_df, df[features]], axis=1)
    return new_df

# Create new train and test datasets with additional features

In [92]:
new_train_features = add_new_features(train_times, ['full_session'], ['part_of_day'])
new_test_features = add_new_features(test_times, ['full_session'], ['part_of_day'])

X_train_sites_with_time = hstack([X_train_sites, new_train_features])
X_test_sites_with_time = hstack([X_test_sites, new_test_features])

X_train_sites_with_time.shape, X_test_sites_with_time.shape

((253561, 50004), (82797, 50004))

# Train Logistic Regression model on new data

In [135]:
new_features = list(new_train_features.columns)

cv_scores2 = train_and_predict(model=lr, X_train=X_train_sites_with_time, y_train=y_train, 
                               X_test=X_test_sites_with_time, site_feature_names=vectorizer.get_feature_names(),              
                               new_feature_names=new_features, cv=tss, submission_file_name='subm2.csv')

CV scores [0.86809822 0.81821867 0.91950263 0.96171682 0.91593193 0.95368912
 0.93027726 0.94128874 0.94847477 0.95287352]
CV mean: 0.9210071663248224, CV std: 0.04287825722876893


Weight?,Feature
+5.212,youwatch.org
+5.058,cid-ed6c3e6a5c6608a4.users.storage.live.com
+4.974,vk.com
+4.969,www.express.co.uk
+4.617,fr.glee.wikia.com
+4.532,www.info-jeunes.net
+4.363,www.melty.fr
+4.148,www.banque-chalus.fr
+4.146,www.audienceinsights.net
+3.792,r4---sn-gxo5uxg-jqbe.googlevideo.com


New feature weights:
               feature      coef
0      part_of_day_day  0.509579
1  part_of_day_evening -2.722820
2  part_of_day_morning -3.230468
3       session_length -0.241711


In [96]:
cv_scores2 > cv_scores1

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

This model hits 0.94623 at public score. Let's add additional features like day of week, month and year

# Add new time features

In [120]:
train_times['day_of_week'] = train_times['time1'].apply(lambda t: t.weekday())
train_times['day_of_week'] = pd.Categorical(train_times['day_of_week'])
test_times['day_of_week'] = test_times['time1'].apply(lambda t: t.weekday())
test_times['day_of_week'] = pd.Categorical(test_times['day_of_week'])

train_times['month'] = train_times['time1'].dt.month
train_times['month'] = pd.Categorical(train_times['month'])
test_times['month'] = test_times['time1'].dt.month
test_times['month'] = pd.Categorical(test_times['month'])

train_times['year'] = train_times['time1'].dt.year
train_times['year'] = pd.Categorical(train_times['year'])
test_times['year'] = test_times['time1'].dt.year
test_times['year'] = pd.Categorical(test_times['year'])

train_times['year_month'] = scale(train_times['time1'].dt.year.astype(int)*100 + \
train_times['time1'].dt.month.astype(int))
test_times['year_month'] = scale(test_times['time1'].dt.year.astype(int)*100 + \
test_times['time1'].dt.month.astype(int))

# Create new train and test datasets with additional features

In [132]:
new_train_features1 = add_new_features(train_times, ['full_session', 'year_month'], 
                                      ['part_of_day', 'day_of_week', 'month', 'year'])
new_test_features1 = add_new_features(test_times, ['full_session', 'year_month'], 
                                      ['part_of_day', 'day_of_week', 'month', 'year'])

# drop in train dataset columns which are missing in test dataset
cols_to_drop = [col for col in new_train_features1.columns if col not in new_test_features1.columns]
new_train_features1 = new_train_features1.drop(cols_to_drop, axis=1)

X_train_sites_with_time1 = hstack([X_train_sites, new_train_features1])
X_test_sites_with_time1 = hstack([X_test_sites, new_test_features1])

X_train_sites_with_time1.shape, X_test_sites_with_time1.shape

((253561, 50021), (82797, 50021))

# Train model with new features

In [134]:
new_features1 = list(new_train_features1.columns)

cv_scores3 = train_and_predict(model=lr, X_train=X_train_sites_with_time1, y_train=y_train, 
                               X_test=X_test_sites_with_time1, site_feature_names=vectorizer.get_feature_names(),              
                               new_feature_names=new_features1, cv=tss, submission_file_name='subm3.csv')

CV scores [0.83797321 0.905022   0.8941699  0.97977834 0.8530089  0.97610007
 0.9117137  0.96060388 0.79322489 0.97403216]
CV mean: 0.9085627058953534, CV std: 0.0618296281433953


Weight?,Feature
+5.142,vk.com
+5.116,youwatch.org
+4.983,www.express.co.uk
+4.878,cid-ed6c3e6a5c6608a4.users.storage.live.com
+4.599,www.info-jeunes.net
+4.320,www.audienceinsights.net
+4.312,www.melty.fr
+4.059,fr.glee.wikia.com
+3.892,dub119.mail.live.com
+3.888,api.bing.com


New feature weights:
                feature      coef
0       part_of_day_day  1.100248
1   part_of_day_evening -2.442072
2   part_of_day_morning -2.554619
3         day_of_week_0  1.370424
4         day_of_week_1  0.404431
5         day_of_week_2 -2.273862
6         day_of_week_3 -1.038249
7         day_of_week_4  0.033988
8         day_of_week_5  0.078945
9         day_of_week_6 -2.472120
10              month_5 -0.782034
11              month_6 -0.616994
12              month_7 -0.056206
13              month_8 -0.972378
14              month_9  3.435460
15             month_10 -1.977375
16             month_11  0.402279
17             month_12 -0.137222
18            year_2014 -4.468006
19         full_session  0.704645
20           year_month  1.846200


# Search for the best hyperparameters of model

In [47]:
from sklearn.model_selection import RandomForestClassifier

C_list = np.logspace(-3, 1, 10)
param_grid_lr = {'C': C_list}

lr = LogisticRegression(max_iter=100, random_state=88)
grid_lr = GridSearchCV(lr, param_grid_lr, return_train_score=True, cv=tss, n_jobs=7, scoring = 'roc_auc', verbose=10)
grid_lr.fit(X_train_1, y_train)
print((grid_lr.best_params_, grid_lr.best_score_))

NameError: name 'GridSearchCV' is not defined

# Predict test target values with optimized hyperparameters

In [149]:
lr = LogisticRegression(max_iter=1000, C=1.29, random_state=88, solver='liblinear')
lr.fit(X_train_1, y_train)
prediction_2 = lr.predict_proba(X_test_1)[:,1]
prediction_2 = pd.Series(prediction_1)

# Write prediction to a file

In [150]:
write_submission_to_file('submission_2.csv', prediction_2, test_df) # 0.93891

# Search for a best Hyperparameters of Random Forest model

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# criterion_list = ['gini', 'entropy']
# min_samples_split_list = [2, 4, 6, 8, 10]
# class_weight_list = ['balanced', {0:1, 1:50}, {0:1, 1:75}, {0:1, 1:100}, None]
# param_grid_rf = {'criterion': criterion_list, 'min_samples_split': min_samples_split_list, 
#                  'class_weight': class_weight_list}

# rf = RandomForestClassifier()
# grid_rf = GridSearchCV(rf, param_grid_rf, return_train_score=True, cv=4, n_jobs=7, scoring = 'roc_auc', verbose=10)
# grid_rf.fit(X, y)
# print((grid_rf.best_params_, grid_rf.best_score_))

# Additional Hyperparameter optimization for Random Forest model

Increase number of estimators

In [None]:
# rf = RandomForestClassifier(class_weight = None, min_samples_split=10, n_estimators=1000, criterion='entropy', random_state=17)

# cv = cross_val_score(rf, X, y, scoring='roc_auc', cv=4, n_jobs=7, verbose=10)
# np.mean(cv)

Additional Grid Search for hyperparameter *n_estimators*=1000

In [None]:
# min_samples_split_list = [9, 10, 11]
# class_weight_list = ['balanced', None, {0:1, 1:75}]
# param_grid_rf = {'min_samples_split': min_samples_split_list, 'class_weight': class_weight_list}

# rf = RandomForestClassifier(n_estimators=1000, criterion='entropy')
# grid_rf = GridSearchCV(rf, param_grid_rf, return_train_score=True, cv=4, n_jobs=7, scoring = 'roc_auc', verbose=10)
# grid_rf.fit(X, y)
# print((grid_rf.best_params_, grid_rf.best_score_))

# Predict test target with optimized Random Forest

# Predict test target values

In [9]:
lr.fit(X_train, y_train)
prediction = lr.predict_proba(X_test)[:,1]
prediction = pd.Series(prediction)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Write prediction to a file

In [10]:
def write_submission_to_file(filename, prediction, test_df):
    submission = pd.DataFrame()
    submission['session_id'] = test_df['session_id'].copy()
    submission['target'] = prediction
    submission.to_csv(filename, index=None)
    
write_submission_to_file('submission.csv', prediction, test_df)