# Read train and test dasets

In [137]:
import pandas as pd
pd.set_option('max_columns', 100)

train_df = pd.read_csv('train_sessions.csv')
test_df = pd.read_csv('test_sessions.csv')

# Analyze data types in train dataset

In [138]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253561 entries, 0 to 253560
Data columns (total 22 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   session_id  253561 non-null  int64  
 1   site1       253561 non-null  int64  
 2   time1       253561 non-null  object 
 3   site2       250098 non-null  float64
 4   time2       250098 non-null  object 
 5   site3       246919 non-null  float64
 6   time3       246919 non-null  object 
 7   site4       244321 non-null  float64
 8   time4       244321 non-null  object 
 9   site5       241829 non-null  float64
 10  time5       241829 non-null  object 
 11  site6       239495 non-null  float64
 12  time6       239495 non-null  object 
 13  site7       237297 non-null  float64
 14  time7       237297 non-null  object 
 15  site8       235224 non-null  float64
 16  time8       235224 non-null  object 
 17  site9       233084 non-null  float64
 18  time9       233084 non-null  object 
 19  si

# Analyze data types in test dataset

In [139]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82797 entries, 0 to 82796
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   session_id  82797 non-null  int64  
 1   site1       82797 non-null  int64  
 2   time1       82797 non-null  object 
 3   site2       81308 non-null  float64
 4   time2       81308 non-null  object 
 5   site3       80075 non-null  float64
 6   time3       80075 non-null  object 
 7   site4       79182 non-null  float64
 8   time4       79182 non-null  object 
 9   site5       78341 non-null  float64
 10  time5       78341 non-null  object 
 11  site6       77566 non-null  float64
 12  time6       77566 non-null  object 
 13  site7       76840 non-null  float64
 14  time7       76840 non-null  object 
 15  site8       76151 non-null  float64
 16  time8       76151 non-null  object 
 17  site9       75484 non-null  float64
 18  time9       75484 non-null  object 
 19  site10      74806 non-nul

# Convert time columns to datetime

In [140]:
time_columns = ['time%s' % i for i in range(1, 11)]
train_df[time_columns] = train_df[time_columns].fillna(0)
test_df[time_columns] = test_df[time_columns].fillna(0)

for column in time_columns:
    train_df[column] = pd.to_datetime(train_df[column])
    test_df[column] = pd.to_datetime(test_df[column])

# Convert site data to bag of words

Read site data to temporary txt files

In [141]:
site_columns = ['site%s' % i for i in range(1, 11)]
train_df[site_columns].fillna(0).astype(int).to_csv('train_sessions.txt', sep=' ', index=None, header=None)
test_df[site_columns].fillna(0).astype(int).to_csv('test_sessions.txt', sep=' ', index=None, header=None)

Convert data from files to bag of words

In [143]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

with open('train_sessions.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
    
X_train.shape, X_test.shape    

((253561, 41592), (82797, 41592))

# Convert target column of train dataset to int

In [144]:
y_train = train['target'].astype(int)

# Train a Logistic Regression model

In [145]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np

lr = LogisticRegression(max_iter=300, C=1, random_state=88)
cv = cross_val_score(lr, X_train, y_train, cv=5, n_jobs=7, scoring='roc_auc')
np.mean(cv), np.std(cv)

(0.9626805505012536, 0.0017373139537141871)

# Predict test target values

In [146]:
lr.fit(X_train, y_train)
prediction = lr.predict_proba(X_test)[:,1]
prediction = pd.Series(prediction)

# Write prediction to a file

In [147]:
def write_submission_to_file(prediction, test_df):
    submission = pd.DataFrame()
    submission['session_id'] = test_df['session_id'].copy()
    submission['target'] = prediction
    submission.to_csv('submission.csv', index=None)
    
write_submission_to_file(prediction, test_df)

We have got a 0.90744 score on Kaggle. Let's impove it by adding time features.

# Add new features

* hour of session start
* part of day: morning, noon, evening, night
* number of visited sites (not unique) in session 
* length of session in seconds
* average time of site visit in seconds

# Add hour of session start

In [148]:
train_df['start_hour'] = train_df['time1'].dt.hour.astype(int)
test_df['start_hour'] = test_df['time1'].dt.hour.astype(int)

# Add part of day

In [149]:
def determine_part_of_day(value):
    if value < 6:
        return 1
    elif value < 12:
        return 2
    elif value < 18:
        return 3
    else:
        return 4
    
train_df['part_of_day'] = train_df['start_hour'].apply(determine_part_of_day)
test_df['part_of_day'] = test_df['start_hour'].apply(determine_part_of_day)

# Determine last visit time for each session

In [165]:
null_time = pd.to_datetime(0)

def determine_last_visit_time(row):
    last_visit = null_time
    for value in row:
        if value == null_time:
            return last_visit
        else:
            last_visit = value
    return last_visit

train_df['last_visit'] = train_df[time_columns].apply(determine_last_visit_time, axis=1)
test_df['last_visit'] = test_df[time_columns].apply(determine_last_visit_time, axis=1)

# Add length of session in seconds

In [171]:
train_df['session_length'] = (train_df['last_visit'] - train_df['time1']).dt.total_seconds().astype(int)
test_df['session_length'] = (test_df['last_visit'] - test_df['time1']).dt.total_seconds().astype(int)

# Add num of visited sites

In [160]:
train_df[site_columns] = train_df[site_columns].fillna(0)
test_df[site_columns] = test_df[site_columns].fillna(0)

train_df['num_of_sites'] = train_df[site_columns].apply(np.count_nonzero, axis=1)
test_df['num_of_sites'] = test_df[site_columns].apply(np.count_nonzero, axis=1)

# Add average normalized session time

In [182]:
train_df['avg_session_time'] = train_df['session_length']/train_df['num_of_sites']
test_df['avg_session_time'] = test_df['session_length']/test_df['num_of_sites']

train_df['avg_session_time'] = (train_df['avg_session_time'] - 
                                train_df['avg_session_time'].min())/train_df['avg_session_time'].max()
test_df['avg_session_time'] = (test_df['avg_session_time'] - 
                               test_df['avg_session_time'].min())/train_df['avg_session_time'].max()

# Add new feature columns to  train and test dataset

# Search for a best Hyperparameters of Logistic Regression model

In [96]:
from sklearn.model_selection import GridSearchCV

lr_opt = LogisticRegression(max_iter=300, random_state=17)
solver_list = ['newton-cg', 'lbfgs', 'liblinear', 'saga']
C_list = np.logspace(-4, 3, 8)
class_weight_list = ['balanced', None]
param_grid_lr = {'C': C_list, 'solver': solver_list, 'class_weight': class_weight_list}

grid_lr = GridSearchCV(lr_opt, param_grid_lr, return_train_score=True, cv=4, n_jobs=7, scoring = 'roc_auc', verbose=10)
grid_lr.fit(X_train, y_train)
print((grid_lr.best_params_, grid_lr.best_score_))

Fitting 4 folds for each of 64 candidates, totalling 256 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 tasks      | elapsed:    2.6s
[Parallel(n_jobs=7)]: Done  11 tasks      | elapsed:    3.6s
[Parallel(n_jobs=7)]: Done  18 tasks      | elapsed:    6.8s
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:   45.2s
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:   49.2s
[Parallel(n_jobs=7)]: Done  47 tasks      | elapsed:   54.8s
[Parallel(n_jobs=7)]: Done  58 tasks      | elapsed:  1.5min
[Parallel(n_jobs=7)]: Done  71 tasks      | elapsed:  1.7min
[Parallel(n_jobs=7)]: Done  84 tasks      | elapsed:  2.1min
[Parallel(n_jobs=7)]: Done  99 tasks      | elapsed:  2.8min
[Parallel(n_jobs=7)]: Done 114 tasks      | elapsed:  3.2min
[Parallel(n_jobs=7)]: Done 131 tasks      | elapsed:  4.0min
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed:  4.6min
[Parallel(n_jobs=7)]: Done 167 tasks      | elapsed:  5.8min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:  6.8min
[Parallel(

({'C': 1.0, 'class_weight': None, 'solver': 'saga'}, 0.9617833375130993)




# Predict test target values with Grid Search and write submission to file

In [None]:
grid_lr.fit(X_train, y_train)
grid_lr_prediction = grid_lr.predict_proba(X_test)[:,1]
grid_lr_prediction = pd.Series(grid_lr_prediction)

write_submission_to_file(grid_lr_prediction, test_df)

# Search for a best Hyperparameters of Random Forest model

In [41]:
from sklearn.ensemble import RandomForestClassifier

criterion_list = ['gini', 'entropy']
min_samples_split_list = [2, 4, 6, 8, 10]
class_weight_list = ['balanced', {0:1, 1:50}, {0:1, 1:75}, {0:1, 1:100}, None]
param_grid_rf = {'criterion': criterion_list, 'min_samples_split': min_samples_split_list, 
                 'class_weight': class_weight_list}

rf = RandomForestClassifier()
grid_rf = GridSearchCV(rf, param_grid_rf, return_train_score=True, cv=4, n_jobs=7, scoring = 'roc_auc', verbose=10)
grid_rf.fit(X, y)
print((grid_rf.best_params_, grid_rf.best_score_))

Fitting 4 folds for each of 50 candidates, totalling 200 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 tasks      | elapsed:   54.6s
[Parallel(n_jobs=7)]: Done  11 tasks      | elapsed:  1.8min
[Parallel(n_jobs=7)]: Done  18 tasks      | elapsed:  2.7min
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:  3.8min
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  5.7min
[Parallel(n_jobs=7)]: Done  47 tasks      | elapsed:  6.7min
[Parallel(n_jobs=7)]: Done  58 tasks      | elapsed:  8.5min
[Parallel(n_jobs=7)]: Done  71 tasks      | elapsed: 10.4min
[Parallel(n_jobs=7)]: Done  84 tasks      | elapsed: 11.6min
[Parallel(n_jobs=7)]: Done  99 tasks      | elapsed: 14.1min
[Parallel(n_jobs=7)]: Done 114 tasks      | elapsed: 16.2min
[Parallel(n_jobs=7)]: Done 131 tasks      | elapsed: 18.3min
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed: 20.9min
[Parallel(n_jobs=7)]: Done 167 tasks      | elapsed: 23.3min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed: 26.3min
[Parallel(

({'class_weight': None, 'criterion': 'entropy', 'min_samples_split': 10}, 0.966549918648429)


# Additional Hyperparameter optimization for Random Forest model

Increase number of estimators

In [48]:
rf = RandomForestClassifier(class_weight = None, min_samples_split=10, n_estimators=1000, criterion='entropy', random_state=17)

cv = cross_val_score(rf, X, y, scoring='roc_auc', cv=4, n_jobs=7, verbose=10)
np.mean(cv)

[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   1 tasks      | elapsed:  7.1min
[Parallel(n_jobs=7)]: Done   2 out of   4 | elapsed:  7.1min remaining:  7.1min
[Parallel(n_jobs=7)]: Done   4 out of   4 | elapsed:  7.3min finished
[Parallel(n_jobs=7)]: Done   4 out of   4 | elapsed:  7.3min remaining:    0.0s


0.9764965945993147

Additional Grid Search for hyperparameter *n_estimators*=1000

In [49]:
min_samples_split_list = [9, 10, 11]
class_weight_list = ['balanced', None, {0:1, 1:75}]
param_grid_rf = {'min_samples_split': min_samples_split_list, 'class_weight': class_weight_list}

rf = RandomForestClassifier(n_estimators=1000, criterion='entropy')
grid_rf = GridSearchCV(rf, param_grid_rf, return_train_score=True, cv=4, n_jobs=7, scoring = 'roc_auc', verbose=10)
grid_rf.fit(X, y)
print((grid_rf.best_params_, grid_rf.best_score_))

Fitting 4 folds for each of 9 candidates, totalling 36 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 tasks      | elapsed: 10.4min
[Parallel(n_jobs=7)]: Done  11 tasks      | elapsed: 20.7min
[Parallel(n_jobs=7)]: Done  18 tasks      | elapsed: 30.9min
[Parallel(n_jobs=7)]: Done  27 out of  36 | elapsed: 41.2min remaining: 13.7min
[Parallel(n_jobs=7)]: Done  31 out of  36 | elapsed: 51.2min remaining:  8.3min
[Parallel(n_jobs=7)]: Done  36 out of  36 | elapsed: 56.6min finished


({'class_weight': None, 'min_samples_split': 9}, 0.9771026008513681)


# Predict test target with optimized Random Forest

In [73]:
rf = RandomForestClassifier(n_estimators=100, class_weight=None, n_jobs=7)
rf.fit(X, y)
prediction = rf.predict(train[features])
submission = pd.DataFrame()
submission['session_id'] = train['session_id'].copy()
submission['target'] = prediction
submission['target'].value_counts()

0    251266
1      2295
Name: target, dtype: int64

In [74]:
y.value_counts()

0    251264
1      2297
Name: target, dtype: int64

In [76]:
train.loc[train['target'] == 1, features]

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,delta0,delta1,delta2,delta3,delta4,delta5,delta6,delta7,delta8,delta9
152,5397,5395,22,5396,5402,5392,22,35,33,338,1385126629,0,1,1,1,1,2,5,5,5
286,10941,9783,9786,27339,27338,29,27339,9783,9785,10941,1395682454,0,0,0,1,1,1,1,1,1
429,37,270,32,33,35,704,29,33,12623,704,1392295609,85,88,89,89,94,96,119,119,119
643,39,23,677,22,39,23,21,678,22,678,1392655323,3,5,9,9,9,9,10,12,14
681,77,879,80,879,80,879,77,879,879,80,1379002915,1,3,3,9,11,15,18,19,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252789,80,879,879,879,80,80,76,80,80,80,1385729486,6,10,21,30,39,59,61,79,87
252849,2271,37,17283,29,29,30,17283,143,33,18876,1385730150,28,50,50,51,51,51,51,51,51
253139,4694,27332,2409,27332,2401,27332,27332,4696,27332,2401,1395679970,2,2,3,3,4,5,7,7,9
253442,22,617,76,80,76,82,879,1440,1307,77,1397496657,0,305,309,310,310,311,311,312,312


# Save predictions to file

In [None]:
submission.to_csv('submission.csv', index=False)

In [None]:
pipe_logit = Pipeline([('scaler', StandardScaler()), ('logit', LogisticRegression(max_iter=100, C=0.1, 
                                                                                  class_weight='ba', 
                                                                                  solver='saga', n_jobs=7))])
pipe_logit.fit(X, y)
prediction = pipe_logit.predict(test[features])
submission = pd.DataFrame()
submission['session_id'] = test['session_id'].copy()
submission['target'] = prediction
submission['target'].value_counts()