# Read train and test datasets

In [1]:
import pandas as pd
# Use the fully-qualified option name: set_option matches option names by
# pattern, and a bare 'max_columns' is ambiguous on pandas versions that
# register more than one option ending in "max_columns".
pd.set_option('display.max_columns', 100)

# Raw session tables, read from the current working directory.
train_df = pd.read_csv('train_sessions.csv')
test_df = pd.read_csv('test_sessions.csv')

# Analyze data types in train dataset

In [2]:
# Print per-column dtypes and non-null counts for the train table.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253561 entries, 0 to 253560
Data columns (total 22 columns):
session_id    253561 non-null int64
site1         253561 non-null int64
time1         253561 non-null object
site2         250098 non-null float64
time2         250098 non-null object
site3         246919 non-null float64
time3         246919 non-null object
site4         244321 non-null float64
time4         244321 non-null object
site5         241829 non-null float64
time5         241829 non-null object
site6         239495 non-null float64
time6         239495 non-null object
site7         237297 non-null float64
time7         237297 non-null object
site8         235224 non-null float64
time8         235224 non-null object
site9         233084 non-null float64
time9         233084 non-null object
site10        231052 non-null float64
time10        231052 non-null object
target        253561 non-null int64
dtypes: float64(9), int64(3), object(10)
memory usage: 32.9+ MB


# Analyze data types in test dataset

In [3]:
# Print per-column dtypes and non-null counts for the test table.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82797 entries, 0 to 82796
Data columns (total 21 columns):
session_id    82797 non-null int64
site1         82797 non-null int64
time1         82797 non-null object
site2         81308 non-null float64
time2         81308 non-null object
site3         80075 non-null float64
time3         80075 non-null object
site4         79182 non-null float64
time4         79182 non-null object
site5         78341 non-null float64
time5         78341 non-null object
site6         77566 non-null float64
time6         77566 non-null object
site7         76840 non-null float64
time7         76840 non-null object
site8         76151 non-null float64
time8         76151 non-null object
site9         75484 non-null float64
time9         75484 non-null object
site10        74806 non-null float64
time10        74806 non-null object
dtypes: float64(9), int64(2), object(10)
memory usage: 10.1+ MB


# Convert time columns to datetime

In [4]:
time_columns = ['time%s' % i for i in range(1, 11)]

# Sentinel for "no visit in this slot": the Unix epoch, i.e. the same value
# that pd.to_datetime(0) yields.
missing_time = pd.Timestamp(0)

# Parse first, then fill. Filling the integer 0 into the string columns before
# parsing would hand to_datetime a mixed str/int column, and would not be
# re-runnable once the columns are already datetimes.
for frame in (train_df, test_df):
    for column in time_columns:
        frame[column] = pd.to_datetime(frame[column]).fillna(missing_time)

# Convert site data to bag of words

Write site data to temporary txt files

In [5]:
site_columns = ['site%s' % i for i in range(1, 11)]

# Each session becomes one line of ten space-separated integer site ids;
# missing slots are written as 0. No index column and no header row.
for frame, txt_path in ((train_df, 'train_sessions.txt'), (test_df, 'test_sessions.txt')):
    sites_as_int = frame[site_columns].fillna(0).astype(int)
    sites_as_int.to_csv(txt_path, sep=' ', index=False, header=False)

Convert data from files to bag of words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words encoder: every line of the txt files (one session) is a document
# and every site id is a token.
# NOTE(review): with default settings CountVectorizer presumably keeps only
# tokens of two or more characters, which would drop single-digit site ids
# along with the 0 padding - confirm this is intended.
cv = CountVectorizer()

# The vocabulary is learned from the train sessions only and reused for test.
with open('train_sessions.txt') as train_lines:
    X_train = cv.fit_transform(train_lines)
with open('test_sessions.txt') as test_lines:
    X_test = cv.transform(test_lines)

X_train.shape, X_test.shape

((253561, 41592), (82797, 41592))

# Convert target column of train dataset to int

In [7]:
# Target vector for the train sessions, cast to plain integers.
y_train = train_df['target'].astype(int)

# Train a Logistic Regression model

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np

lr = LogisticRegression(max_iter=300, C=1, random_state=88)
# Keep the fold scores under their own name: `cv` already holds the fitted
# CountVectorizer and should not be rebound to an array of scores.
cv_scores = cross_val_score(lr, X_train, y_train, cv=5, n_jobs=7, scoring='roc_auc')
np.mean(cv_scores), np.std(cv_scores)

(0.9626804983503068, 0.0017367892709395987)

# Predict test target values

In [9]:
# Refit on the full training matrix, then keep the second predict_proba column
# (presumably the probability of target == 1) as a Series.
lr.fit(X_train, y_train)
prediction = pd.Series(lr.predict_proba(X_test)[:, 1])

# Write prediction to a file

In [55]:
def write_submission_to_file(filename, prediction, test_df):
    """Write a two-column submission CSV (``session_id``, ``target``).

    Parameters
    ----------
    filename : str or path-like
        Destination path of the CSV file.
    prediction : array-like
        One predicted value per row of ``test_df``, in the same row order.
    test_df : pandas.DataFrame
        Frame that provides the ``session_id`` column.

    Raises
    ------
    ValueError
        If ``prediction`` and ``test_df`` do not have the same length.
    """
    session_ids = test_df['session_id'].values
    targets = pd.Series(prediction).values
    if len(targets) != len(session_ids):
        raise ValueError(
            'prediction has %d values but test_df has %d rows'
            % (len(targets), len(session_ids))
        )

    # Pair the values by position. Assigning a Series into a frame aligns on
    # index labels, which silently produces NaN targets whenever test_df does
    # not carry the default 0..n-1 index.
    submission = pd.DataFrame({'session_id': session_ids, 'target': targets})
    submission = submission[['session_id', 'target']]  # fixed column order
    submission.to_csv(filename, index=False)
    
# Save the predictions of the sites-only logistic regression.
write_submission_to_file('submission.csv', prediction, test_df)

We got a score of 0.90744 on Kaggle. Let's improve it by adding time features.

# Add new features

* hour of session start
* part of day: morning, noon, evening, night
* number of visited sites (not unique) in session 
* length of session in seconds
* average time of site visit in seconds

# Add hour of session start

In [45]:
# Hour of day of the first visit in each session, as an integer.
for frame in (train_df, test_df):
    first_visit = frame['time1']
    frame['start_hour'] = first_visit.dt.hour.astype(int)

# Add part of day

In [46]:
def determine_part_of_day(value):
    """Map an hour to a part-of-day code.

    Returns 1 for hours below 6, 2 for hours below 12, 3 for hours below 18
    and 4 for everything else.
    """
    # Upper bounds are exclusive; the first bound the hour falls under wins.
    for code, upper_bound in enumerate((6, 12, 18), start=1):
        if value < upper_bound:
            return code
    return 4
    
# Derive the part-of-day code from the session start hour in both tables.
for frame in (train_df, test_df):
    frame['part_of_day'] = frame['start_hour'].apply(determine_part_of_day)

# Determine last visit time for each session

In [13]:
# Epoch timestamp used as the "no visit" marker in the time columns.
null_time = pd.to_datetime(0)

def determine_last_visit_time(row):
    """Return the last timestamp in ``row`` that precedes the first null marker.

    Values are scanned in order; scanning stops at the first value equal to
    ``null_time``. If the row is empty or starts with the marker, ``null_time``
    itself is returned.
    """
    latest = null_time
    for visit_time in row:
        if visit_time == null_time:
            break
        latest = visit_time
    return latest

# Row-wise scan of the ten time columns for the final recorded visit.
for frame in (train_df, test_df):
    visit_times = frame[time_columns]
    frame['last_visit'] = visit_times.apply(determine_last_visit_time, axis=1)

# Add length of session in seconds

In [91]:
# Whole seconds between the first and the last visit of each session.
for frame in (train_df, test_df):
    elapsed = frame['last_visit'] - frame['time1']
    frame['session_length'] = elapsed.dt.total_seconds().astype(int)

# Add num of visited sites

In [92]:
for frame in (train_df, test_df):
    # Empty site slots become 0 so they can be told apart from real site ids.
    frame[site_columns] = frame[site_columns].fillna(0)
    # Count the non-zero slots per session with a vectorized comparison instead
    # of a row-by-row apply; the resulting counts are the same.
    frame['num_of_sites'] = frame[site_columns].ne(0).sum(axis=1)

# Add average normalized session time

In [16]:
# train_df['avg_session_time'] = train_df['session_length']/train_df['num_of_sites']
# test_df['avg_session_time'] = test_df['session_length']/test_df['num_of_sites']

# train_df['avg_session_time'] = (train_df['avg_session_time'] - 
#                                 train_df['avg_session_time'].min())/train_df['avg_session_time'].max()
# test_df['avg_session_time'] = (test_df['avg_session_time'] - 
#                                test_df['avg_session_time'].min())/train_df['avg_session_time'].max()

# Add new feature columns to  train and test dataset

In [61]:
from scipy.sparse import hstack
from sklearn.preprocessing import scale

categorical_features = ['start_hour', 'part_of_day']
numeric_features = ['session_length', 'num_of_sites']

# Give train and test the same category list per feature so get_dummies emits
# identical columns, in identical order, for both tables even when a level
# occurs in only one of them.
for feature in categorical_features:
    shared_levels = sorted(set(train_df[feature]) | set(test_df[feature]))
    train_df[feature] = pd.Categorical(train_df[feature], categories=shared_levels)
    test_df[feature] = pd.Categorical(test_df[feature], categories=shared_levels)

# An explicit integer dtype keeps the combined frame purely numeric, so it can
# be stacked next to the sparse bag-of-words matrix regardless of the pandas
# default for dummy columns.
train_dummy = pd.get_dummies(train_df[categorical_features], dtype='uint8')
train_dummy = pd.concat([train_dummy, train_df[numeric_features]], axis=1)

test_dummy = pd.get_dummies(test_df[categorical_features], dtype='uint8')
test_dummy = pd.concat([test_dummy, test_df[numeric_features]], axis=1)

# Create new train and test datasets with additional features

In [62]:
# Append the engineered feature columns to the right of the bag-of-words matrices.
X_train_1 = hstack([X_train, train_dummy])
X_test_1 = hstack([X_test, test_dummy])
X_train_1.shape, X_test_1.shape

((253561, 41614), (82797, 41614))

# Train Logistic Regression model on new data

In [67]:
from sklearn.model_selection import TimeSeriesSplit

lr = LogisticRegression(max_iter=100, C=1, random_state=88)
# NOTE(review): TimeSeriesSplit builds folds from row order; no visible cell
# sorts train_df by time - confirm the rows are chronological.
tss = TimeSeriesSplit(n_splits=10)
# Keep the fold scores under their own name so `cv` (the fitted
# CountVectorizer) is not overwritten.
cv_scores = cross_val_score(lr, X_train_1, y_train, cv=tss, n_jobs=7, scoring='roc_auc')
np.mean(cv_scores), np.std(cv_scores)

(0.9613290502941677, 0.006141368933499719)

# Predict test target values

In [66]:
# Fit on the full train matrix with the extra (unscaled) features, then keep
# the second predict_proba column as a Series.
lr = LogisticRegression(max_iter=2000, C=1, random_state=88).fit(X_train_1, y_train)
prediction_1 = pd.Series(lr.predict_proba(X_test_1)[:, 1])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Write prediction to a file

In [64]:
write_submission_to_file('submission_1.csv', prediction_1, test_df)  # presumably the Kaggle score: 0.93849

# Normalize features, train model again

In [109]:
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler, scale

categorical_features = ['start_hour', 'part_of_day']
numeric_features = ['session_length', 'num_of_sites']
scaled_features = ['session_length_scaled', 'num_of_sites_scaled']

# Standardize with statistics learned from the train table only and apply the
# same shift and scale to test. Scaling each table with its own mean/std would
# put the two on different scales.
scaler = StandardScaler().fit(train_df[numeric_features])
train_scaled = scaler.transform(train_df[numeric_features])
test_scaled = scaler.transform(test_df[numeric_features])
for position, scaled_name in enumerate(scaled_features):
    train_df[scaled_name] = train_scaled[:, position]
    test_df[scaled_name] = test_scaled[:, position]

# Give train and test the same category list per feature so get_dummies emits
# identical columns, in identical order, for both tables.
for feature in categorical_features:
    shared_levels = sorted(set(train_df[feature]) | set(test_df[feature]))
    train_df[feature] = pd.Categorical(train_df[feature], categories=shared_levels)
    test_df[feature] = pd.Categorical(test_df[feature], categories=shared_levels)

# An explicit integer dtype keeps the combined frame purely numeric for hstack.
train_dummy = pd.get_dummies(train_df[categorical_features], dtype='uint8')
train_dummy = pd.concat([train_dummy, train_df[scaled_features]], axis=1)

test_dummy = pd.get_dummies(test_df[categorical_features], dtype='uint8')
test_dummy = pd.concat([test_dummy, test_df[scaled_features]], axis=1)

# Create new train and test datasets with additional features

In [110]:
# Append the scaled/encoded feature columns to the right of the bag-of-words matrices.
X_train_2 = hstack([X_train, train_dummy])
X_test_2 = hstack([X_test, test_dummy])
X_train_2.shape, X_test_2.shape

((253561, 41614), (82797, 41614))

# Train Logistic Regression model on new data

In [111]:
from sklearn.model_selection import TimeSeriesSplit

lr = LogisticRegression(max_iter=1000, C=1, random_state=88)
# NOTE(review): TimeSeriesSplit builds folds from row order; no visible cell
# sorts train_df by time - confirm the rows are chronological.
tss = TimeSeriesSplit(n_splits=10)
# Keep the fold scores under their own name so `cv` (the fitted
# CountVectorizer) is not overwritten.
cv_scores = cross_val_score(lr, X_train_2, y_train, cv=tss, n_jobs=7, scoring='roc_auc')
np.mean(cv_scores), np.std(cv_scores)

(0.979342650823518, 0.009533363931692906)

# Predict test target values

In [112]:
# NOTE(review): C=13.4 differs from the C=1 used in the cross-validation cell;
# presumably it came from a hyperparameter search - confirm.
lr = LogisticRegression(max_iter=1000, C=13.4, random_state=88).fit(X_train_2, y_train)
prediction_3 = pd.Series(lr.predict_proba(X_test_2)[:, 1])

# Write prediction to a file

In [113]:
write_submission_to_file('submission_3.csv', prediction_3, test_df)  # TODO: record the resulting score

# Search for the best hyperparameters of the Logistic Regression model

In [None]:
# from sklearn.model_selection import GridSearchCV

# lr = LogisticRegression(max_iter=1000, random_state=88)
# tss = TimeSeriesSplit(n_splits=5)
# C_list = np.logspace(-4, 3, 16)
# class_weight_list = ['balanced', None, {0:1, 1:50}, {0:1, 1:75}, {0:1, 1:100}]
# param_grid_lr = {'C': C_list, 'solver': solver_list, 'class_weight': class_weight_list}

# grid_lr = GridSearchCV(lr, param_grid_lr, return_train_score=True, cv=5, n_jobs=7, scoring = 'roc_auc', verbose=10)
# grid_lr.fit(X_train_2, y_train)
# print((grid_lr.best_params_, grid_lr.best_score_))

Fitting 5 folds for each of 320 candidates, totalling 1600 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 tasks      | elapsed:    4.3s
[Parallel(n_jobs=7)]: Done  11 tasks      | elapsed:    9.2s
[Parallel(n_jobs=7)]: Done  18 tasks      | elapsed:   19.8s
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:   27.3s
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  1.5min
[Parallel(n_jobs=7)]: Done  47 tasks      | elapsed:  1.8min
[Parallel(n_jobs=7)]: Done  58 tasks      | elapsed:  3.0min
[Parallel(n_jobs=7)]: Done  71 tasks      | elapsed:  3.2min
[Parallel(n_jobs=7)]: Done  84 tasks      | elapsed:  3.9min
[Parallel(n_jobs=7)]: Done  99 tasks      | elapsed:  4.7min
[Parallel(n_jobs=7)]: Done 114 tasks      | elapsed:  5.7min
[Parallel(n_jobs=7)]: Done 131 tasks      | elapsed:  6.2min
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed:  7.3min
[Parallel(n_jobs=7)]: Done 167 tasks      | elapsed:  8.2min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:  9.3min
[Parallel(

# Search for the best hyperparameters of the Random Forest model

In [41]:
# from sklearn.ensemble import RandomForestClassifier

# criterion_list = ['gini', 'entropy']
# min_samples_split_list = [2, 4, 6, 8, 10]
# class_weight_list = ['balanced', {0:1, 1:50}, {0:1, 1:75}, {0:1, 1:100}, None]
# param_grid_rf = {'criterion': criterion_list, 'min_samples_split': min_samples_split_list, 
#                  'class_weight': class_weight_list}

# rf = RandomForestClassifier()
# grid_rf = GridSearchCV(rf, param_grid_rf, return_train_score=True, cv=4, n_jobs=7, scoring = 'roc_auc', verbose=10)
# grid_rf.fit(X, y)
# print((grid_rf.best_params_, grid_rf.best_score_))

Fitting 4 folds for each of 50 candidates, totalling 200 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 tasks      | elapsed:   54.6s
[Parallel(n_jobs=7)]: Done  11 tasks      | elapsed:  1.8min
[Parallel(n_jobs=7)]: Done  18 tasks      | elapsed:  2.7min
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:  3.8min
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  5.7min
[Parallel(n_jobs=7)]: Done  47 tasks      | elapsed:  6.7min
[Parallel(n_jobs=7)]: Done  58 tasks      | elapsed:  8.5min
[Parallel(n_jobs=7)]: Done  71 tasks      | elapsed: 10.4min
[Parallel(n_jobs=7)]: Done  84 tasks      | elapsed: 11.6min
[Parallel(n_jobs=7)]: Done  99 tasks      | elapsed: 14.1min
[Parallel(n_jobs=7)]: Done 114 tasks      | elapsed: 16.2min
[Parallel(n_jobs=7)]: Done 131 tasks      | elapsed: 18.3min
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed: 20.9min
[Parallel(n_jobs=7)]: Done 167 tasks      | elapsed: 23.3min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed: 26.3min
[Parallel(

({'class_weight': None, 'criterion': 'entropy', 'min_samples_split': 10}, 0.966549918648429)


# Additional Hyperparameter optimization for Random Forest model

Increase number of estimators

In [48]:
# rf = RandomForestClassifier(class_weight = None, min_samples_split=10, n_estimators=1000, criterion='entropy', random_state=17)

# cv = cross_val_score(rf, X, y, scoring='roc_auc', cv=4, n_jobs=7, verbose=10)
# np.mean(cv)

[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   1 tasks      | elapsed:  7.1min
[Parallel(n_jobs=7)]: Done   2 out of   4 | elapsed:  7.1min remaining:  7.1min
[Parallel(n_jobs=7)]: Done   4 out of   4 | elapsed:  7.3min finished
[Parallel(n_jobs=7)]: Done   4 out of   4 | elapsed:  7.3min remaining:    0.0s


0.9764965945993147

Additional Grid Search for hyperparameter *n_estimators*=1000

In [49]:
# min_samples_split_list = [9, 10, 11]
# class_weight_list = ['balanced', None, {0:1, 1:75}]
# param_grid_rf = {'min_samples_split': min_samples_split_list, 'class_weight': class_weight_list}

# rf = RandomForestClassifier(n_estimators=1000, criterion='entropy')
# grid_rf = GridSearchCV(rf, param_grid_rf, return_train_score=True, cv=4, n_jobs=7, scoring = 'roc_auc', verbose=10)
# grid_rf.fit(X, y)
# print((grid_rf.best_params_, grid_rf.best_score_))

Fitting 4 folds for each of 9 candidates, totalling 36 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 tasks      | elapsed: 10.4min
[Parallel(n_jobs=7)]: Done  11 tasks      | elapsed: 20.7min
[Parallel(n_jobs=7)]: Done  18 tasks      | elapsed: 30.9min
[Parallel(n_jobs=7)]: Done  27 out of  36 | elapsed: 41.2min remaining: 13.7min
[Parallel(n_jobs=7)]: Done  31 out of  36 | elapsed: 51.2min remaining:  8.3min
[Parallel(n_jobs=7)]: Done  36 out of  36 | elapsed: 56.6min finished


({'class_weight': None, 'min_samples_split': 9}, 0.9771026008513681)


# Predict test target with optimized Random Forest