In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [2]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv('data/output/1/'+out_file, index_label=index_label)

In [3]:
site_columns = ['site%s' % i for i in range(1, 11)]
time_columns = ['time%s' % i for i in range(1, 11)]

In [4]:
train_df = pd.read_csv('data/input/raw/train_sessions.csv', index_col='session_id', parse_dates=time_columns)
test_df = pd.read_csv('data/input/raw/test_sessions.csv', index_col='session_id', parse_dates=time_columns)

train_df = train_df.sort_values(by='time1')

train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [5]:
train_df[site_columns] = train_df[site_columns].fillna(0).astype('int')
test_df[site_columns] = test_df[site_columns].fillna(0).astype('int')

In [6]:
train_df[site_columns].to_csv('data/input/processed/train_sessions_text.txt', sep=' ', index=None, header=None)
test_df[site_columns].to_csv('data/input/processed/test_sessions_text.txt', sep=' ', index=None, header=None)

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 253561 entries, 21669 to 204762
Data columns (total 21 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   site1   253561 non-null  int64         
 1   time1   253561 non-null  datetime64[ns]
 2   site2   253561 non-null  int64         
 3   time2   250098 non-null  datetime64[ns]
 4   site3   253561 non-null  int64         
 5   time3   246919 non-null  datetime64[ns]
 6   site4   253561 non-null  int64         
 7   time4   244321 non-null  datetime64[ns]
 8   site5   253561 non-null  int64         
 9   time5   241829 non-null  datetime64[ns]
 10  site6   253561 non-null  int64         
 11  time6   239495 non-null  datetime64[ns]
 12  site7   253561 non-null  int64         
 13  time7   237297 non-null  datetime64[ns]
 14  site8   253561 non-null  int64         
 15  time8   235224 non-null  datetime64[ns]
 16  site9   253561 non-null  int64         
 17  time9   233084 non-null  

In [8]:
cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
with open('data/input/processed/train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('data/input/processed/test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
X_train.shape, X_test.shape

((253561, 50000), (82797, 50000))

In [9]:
y_train = train_df['target'].astype('int').values

In [23]:
time_split = TimeSeriesSplit(n_splits=10)
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')
cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1, verbose=2)
cv_scores, cv_scores.mean()

In [15]:
logit.fit(X_train, y_train)
logit_test_pred = logit.predict_proba(X_test)[:, 1]
write_to_submission_file(logit_test_pred, 'subm1.csv') # 0.91288

In [16]:
def add_time_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)])
    return X

In [17]:
X_train_new = add_time_features(train_df.fillna(0), X_train)
X_test_new = add_time_features(test_df.fillna(0), X_test)
X_train_new.shape, X_test_new.shape

In [20]:
cv_scores = cross_val_score(logit, X_train_new, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1, verbose=2)
cv_scores, cv_scores.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[CV] END ................................ score: (test=0.877) total time=   3.4s
[CV] END ................................ score: (test=0.751) total time=   6.3s
[CV] END ................................ score: (test=0.931) total time=  12.3s


[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   12.4s remaining:   29.0s


[CV] END ................................ score: (test=0.979) total time=  19.5s
[CV] END ................................ score: (test=0.904) total time=  27.6s
[CV] END ................................ score: (test=0.938) total time=  32.4s
[CV] END ................................ score: (test=0.927) total time=  38.3s


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   38.6s remaining:   16.5s


[CV] END ................................ score: (test=0.962) total time=  39.4s
[CV] END ................................ score: (test=0.949) total time=  42.7s
[CV] END ................................ score: (test=0.940) total time=  41.9s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   48.4s finished


In [22]:
logit.fit(X_train_new, y_train)
logit_test_pred2 = logit.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred2, 'subm2.csv') # 0.93843

In [25]:
c_values = np.logspace(-2, 2, 10)
logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values}, scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=2)
logit_grid_searcher.fit(X_train_new, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END .............................................C=0.01; total time=   1.4s
[CV] END .............................................C=0.01; total time=   2.5s
[CV] END .............................................C=0.01; total time=   3.6s
[CV] END .............................................C=0.01; total time=   4.3s
[CV] END .............................C=0.027825594022071243; total time=   1.3s
[CV] END .............................................C=0.01; total time=   7.1s
[CV] END .............................C=0.027825594022071243; total time=   2.7s
[CV] END .............................................C=0.01; total time=   8.3s
[CV] END .............................................C=0.01; total time=   9.4s
[CV] END .............................................C=0.01; total time=   9.7s
[CV] END .............................C=0.027825594022071243; total time=   6.1s
[CV] END .............................C=0.0278



[CV] END ................................C=35.93813663804626; total time= 2.9min
[CV] END ............................................C=100.0; total time= 1.3min
[CV] END ............................................C=100.0; total time= 1.8min
[CV] END ................................C=35.93813663804626; total time= 3.8min
[CV] END ............................................C=100.0; total time= 2.2min
[CV] END ............................................C=100.0; total time= 2.2min




[CV] END ............................................C=100.0; total time= 1.8min
[CV] END ............................................C=100.0; total time= 2.6min
[CV] END ............................................C=100.0; total time= 2.6min


In [27]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9173758280927669, {'C': 0.21544346900318834})

In [26]:
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred3, 'subm3.csv') # 0.94242