# Read train and test dasets

In [35]:
import pandas as pd
pd.set_option('max_columns', 100)

train_df = pd.read_csv('train_sessions.csv')
test_df = pd.read_csv('test_sessions.csv')

# Analyze data types in train dataset

In [36]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253561 entries, 0 to 253560
Data columns (total 22 columns):
session_id    253561 non-null int64
site1         253561 non-null int64
time1         253561 non-null object
site2         250098 non-null float64
time2         250098 non-null object
site3         246919 non-null float64
time3         246919 non-null object
site4         244321 non-null float64
time4         244321 non-null object
site5         241829 non-null float64
time5         241829 non-null object
site6         239495 non-null float64
time6         239495 non-null object
site7         237297 non-null float64
time7         237297 non-null object
site8         235224 non-null float64
time8         235224 non-null object
site9         233084 non-null float64
time9         233084 non-null object
site10        231052 non-null float64
time10        231052 non-null object
target        253561 non-null int64
dtypes: float64(9), int64(3), object(10)
memory usage: 32.9+ MB


# Analyze data types in test dataset

In [37]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82797 entries, 0 to 82796
Data columns (total 21 columns):
session_id    82797 non-null int64
site1         82797 non-null int64
time1         82797 non-null object
site2         81308 non-null float64
time2         81308 non-null object
site3         80075 non-null float64
time3         80075 non-null object
site4         79182 non-null float64
time4         79182 non-null object
site5         78341 non-null float64
time5         78341 non-null object
site6         77566 non-null float64
time6         77566 non-null object
site7         76840 non-null float64
time7         76840 non-null object
site8         76151 non-null float64
time8         76151 non-null object
site9         75484 non-null float64
time9         75484 non-null object
site10        74806 non-null float64
time10        74806 non-null object
dtypes: float64(9), int64(2), object(10)
memory usage: 10.1+ MB


# Convert time columns to datetime

In [38]:
time_columns = ['time%s' % i for i in range(1, 11)]
train_df[time_columns] = train_df[time_columns].fillna(0)
test_df[time_columns] = test_df[time_columns].fillna(0)

for column in time_columns:
    train_df[column] = pd.to_datetime(train_df[column])
    test_df[column] = pd.to_datetime(test_df[column])

# Convert site data to bag of words

In [91]:
from scipy.sparse import csr_matrix

site_columns = ['site%s' % i for i in range(1, 11)]
train_df[site_columns] = train_df[site_columns].fillna(0).astype('int')
test_df[site_columns] = test_df[site_columns].fillna(0).astype('int')
full_df = pd.concat([train_df.drop('target', axis=1), test_df])
idx_split = train_df.shape[0]


full_sites_flatten = full_df[site_columns].values.flatten()
full_sites_sparse = csr_matrix(([1] * full_sites_flatten.shape[0],
                    full_sites_flatten,
                    range(0, full_sites_flatten.shape[0] + 10, 10)))[:, 1:]

# Create train dataset

In [104]:
X_train = full_sites_sparse[:idx_split,:]
X_test = full_sites_sparse[idx_split:,:]
X_train.shape, X_test.shape

((253561, 48371), (82797, 48371))

# Convert target column of train dataset to int

In [105]:
y_train = train_df['target'].astype(int)

# Train a Logistic Regression model

In [101]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np

lr = LogisticRegression(max_iter=100, C=1, random_state=88)
cv = cross_val_score(lr, X_train, y_train, cv=5, n_jobs=7, scoring='roc_auc')
np.mean(cv), np.std(cv)

(0.9637201773704055, 0.0012643228418905213)

# Predict test target values

In [106]:
lr.fit(X_train, y_train)
prediction = lr.predict_proba(X_test)[:,1]
prediction = pd.Series(prediction)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Write prediction to a file

In [109]:
def write_submission_to_file(filename, prediction, test_df):
    submission = pd.DataFrame()
    submission['session_id'] = test_df['session_id'].copy()
    submission['target'] = prediction
    submission.to_csv(filename, index=None)
    
write_submission_to_file('submission.csv', prediction, test_df)

We have got a 0.90744 score on Kaggle. Let's impove it by adding time features.

# Add new features

* hour of session start
* part of day: morning, noon, evening, night
* number of visited sites (not unique) in session 
* length of session in seconds
* average time of site visit in seconds

# Add date and hour of session start

In [115]:
from sklearn.preprocessing import scale

train_df['start_month'] = scale(train_df['time1'].map(lambda x: 100*x.year + x.month).astype(int))
test_df['start_month'] = scale(test_df['time1'].map(lambda x: 100*x.year + x.month).astype(int))

train_df['start_hour'] = train_df['time1'].dt.hour.astype(int)
test_df['start_hour'] = test_df['time1'].dt.hour.astype(int)

# Add part of day

In [120]:
def determine_part_of_day(value):
    if value < 11:
        return 1
    else:
        return 0
    
train_df['part_of_day'] = train_df['start_hour'].apply(determine_part_of_day)
test_df['part_of_day'] = test_df['start_hour'].apply(determine_part_of_day)

# Determine last visit time for each session

In [117]:
null_time = pd.to_datetime(0)

def determine_last_visit_time(row):
    last_visit = null_time
    for value in row:
        if value == null_time:
            return last_visit
        else:
            last_visit = value
    return last_visit

train_df['last_visit'] = train_df[time_columns].apply(determine_last_visit_time, axis=1)
test_df['last_visit'] = test_df[time_columns].apply(determine_last_visit_time, axis=1)

# Add length of session in seconds

In [124]:
train_df['session_length'] = scale((train_df['last_visit'] - train_df['time1']).dt.total_seconds().astype(int))
test_df['session_length'] = scale((test_df['last_visit'] - test_df['time1']).dt.total_seconds().astype(int))

# Add num of visited sites

In [125]:
import numpy as np

train_df['num_of_sites'] = train_df[site_columns].apply(np.count_nonzero, axis=1)
test_df['num_of_sites'] = test_df[site_columns].apply(np.count_nonzero, axis=1)

# Add new feature columns to  train and test dataset

In [133]:
from scipy.sparse import hstack
from sklearn.preprocessing import scale

train_df['start_hour'] = pd.Categorical(train_df['start_hour'])
train_dummy = pd.get_dummies(train_df[['start_hour']])
train_dummy = pd.concat([train_dummy, train_df[['session_length', 'num_of_sites', 
                                                'part_of_day', 'start_month']]], axis=1)

test_df['start_hour'] = pd.Categorical(test_df['start_hour'])
test_dummy = pd.get_dummies(test_df[['start_hour']])
test_dummy = pd.concat([test_dummy, test_df[['session_length', 'num_of_sites', 
                                             'part_of_day', 'start_month']]], axis=1)

# Create new train and test datasets with additional features

In [135]:
X_train_1 = hstack([X_train, train_dummy])
X_test_1 = hstack([X_test, test_dummy])
X_train_1.shape, X_test_1.shape

((253561, 48392), (82797, 48392))

# Train Logistic Regression model on new data

In [141]:
from sklearn.model_selection import TimeSeriesSplit

lr = LogisticRegression(max_iter=1000, C=1, random_state=88)
tss = TimeSeriesSplit(n_splits=5)
cv = cross_val_score(lr, X_train_1, y_train, cv=tss, n_jobs=7, scoring='roc_auc')
np.mean(cv), np.std(cv)

(0.9817056455557326, 0.0044923776675116845)

# Predict test target values

In [140]:
lr = LogisticRegression(max_iter=1000, C=1, random_state=88)
lr.fit(X_train_1, y_train)
prediction_1 = lr.predict_proba(X_test_1)[:,1]
prediction_1 = pd.Series(prediction_1)

# Write prediction to a file

In [142]:
write_submission_to_file('submission_1.csv', prediction_1, test_df) # 0.93891

# Search for the best hyperparameters of model

In [146]:
from sklearn.ensemble import RandomForestClassifier

C_list = np.logspace(-3, 1, 10)
param_grid_lr = {'C': C_list}

lr = LogisticRegression(max_iter=1000, random_state=88, solver='liblinear')
grid_lr = GridSearchCV(lr, param_grid_lr, return_train_score=True, cv=tss, n_jobs=7, scoring = 'roc_auc', verbose=10)
grid_lr.fit(X_train_1, y_train)
print((grid_lr.best_params_, grid_lr.best_score_))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 tasks      | elapsed:    2.3s
[Parallel(n_jobs=7)]: Done  11 tasks      | elapsed:    6.0s
[Parallel(n_jobs=7)]: Done  18 tasks      | elapsed:   16.4s
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:   39.5s
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  1.3min
[Parallel(n_jobs=7)]: Done  43 out of  50 | elapsed:  2.4min remaining:   22.9s
[Parallel(n_jobs=7)]: Done  50 out of  50 | elapsed:  3.5min finished


({'C': 1.2915496650148828}, 0.9819041212210747)


# Predict test target values with optimized hyperparameters

In [149]:
lr = LogisticRegression(max_iter=1000, C=1.29, random_state=88, solver='liblinear')
lr.fit(X_train_1, y_train)
prediction_2 = lr.predict_proba(X_test_1)[:,1]
prediction_2 = pd.Series(prediction_1)

# Write prediction to a file

In [150]:
write_submission_to_file('submission_2.csv', prediction_2, test_df) # 0.93891

# Search for a best Hyperparameters of Random Forest model

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# criterion_list = ['gini', 'entropy']
# min_samples_split_list = [2, 4, 6, 8, 10]
# class_weight_list = ['balanced', {0:1, 1:50}, {0:1, 1:75}, {0:1, 1:100}, None]
# param_grid_rf = {'criterion': criterion_list, 'min_samples_split': min_samples_split_list, 
#                  'class_weight': class_weight_list}

# rf = RandomForestClassifier()
# grid_rf = GridSearchCV(rf, param_grid_rf, return_train_score=True, cv=4, n_jobs=7, scoring = 'roc_auc', verbose=10)
# grid_rf.fit(X, y)
# print((grid_rf.best_params_, grid_rf.best_score_))

# Additional Hyperparameter optimization for Random Forest model

Increase number of estimators

In [None]:
# rf = RandomForestClassifier(class_weight = None, min_samples_split=10, n_estimators=1000, criterion='entropy', random_state=17)

# cv = cross_val_score(rf, X, y, scoring='roc_auc', cv=4, n_jobs=7, verbose=10)
# np.mean(cv)

Additional Grid Search for hyperparameter *n_estimators*=1000

In [None]:
# min_samples_split_list = [9, 10, 11]
# class_weight_list = ['balanced', None, {0:1, 1:75}]
# param_grid_rf = {'min_samples_split': min_samples_split_list, 'class_weight': class_weight_list}

# rf = RandomForestClassifier(n_estimators=1000, criterion='entropy')
# grid_rf = GridSearchCV(rf, param_grid_rf, return_train_score=True, cv=4, n_jobs=7, scoring = 'roc_auc', verbose=10)
# grid_rf.fit(X, y)
# print((grid_rf.best_params_, grid_rf.best_score_))

# Predict test target with optimized Random Forest