# Correct time-aware cross-validation scheme

In [1]:
import numpy as np 
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# defining a helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [3]:
# reading the data
train_df = pd.read_csv('./data/train_sessions.csv',
                       index_col='session_id', parse_dates=['time1'])

test_df = pd.read_csv(
    './data/test_sessions.csv', index_col='session_id', parse_dates=['time1']
)

In [4]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,2014-02-20 10:02:45,,,,,,,,,...,,,,,,,,,,0
2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,2014-02-22 11:19:51,...,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0
3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,2013-12-16 16:40:19,...,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0
4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,2014-03-28 10:54:12,...,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0
5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,2014-02-28 10:55:23,...,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0


In [5]:
# sorting the data by time
train_df = train_df.sort_values(by='time1')

In [6]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,,,,,,...,,,,,,,,,,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,,...,,,,,,,,,,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [7]:
# transforming the data to feed it into the countvectorizer
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites].fillna(0).astype('int').to_csv(
    './data/train_sessions_text.txt', sep=' ', index=None, header=None
)

test_df[sites].fillna(0).astype('int').to_csv(
    './data/test_sessions_text.txt', sep=' ', index=None, header=None
)

In [8]:
!head -5 ./data/train_sessions_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947


In [9]:
%%time
# fitting countvectorizer and transforming the data with it
cv = CountVectorizer(
    ngram_range=(1, 3), max_features=50000
)

with open('./data/train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('./data/test_sessions_text.txt') as inp_test_file:
    X_test = cv.fit_transform(inp_test_file)

CPU times: user 13.1 s, sys: 281 ms, total: 13.4 s
Wall time: 13.5 s


In [10]:
X_train.shape, X_test.shape

((253561, 50000), (82797, 50000))

In [11]:
# saving the training target
y_train = train_df['target'].astype('int').values

In [12]:
time_split = TimeSeriesSplit(n_splits=10)

In [13]:
[
    (el[0].shape, el[1].shape) for el in time_split.split(X_train)
]

[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

In [14]:
# performing time series cv with log regression
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [15]:
%%time

cv_scores = cross_val_score(
    logit, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=1
)

CPU times: user 9min 16s, sys: 2.57 s, total: 9min 18s
Wall time: 1min 23s


In [16]:
cv_scores, cv_scores.mean()

(array([0.83141992, 0.64672188, 0.87991797, 0.96315292, 0.84221721,
        0.87840646, 0.94476054, 0.85322131, 0.92987763, 0.90752885]),
 0.8677224689973594)

In [17]:
logit.fit(X_train, y_train)

In [18]:
logit_test_pred = logit.predict_proba(X_test)[:, 1]
logit_test_pred

array([0.00153271, 0.00030883, 0.00065983, ..., 0.0005991 , 0.00118511,
       0.00119602])

In [19]:
write_to_submission_file(logit_test_pred, 'subm1.csv')

In [20]:
# adding additional time features
def add_time_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)])
    return X

In [21]:
%%time
X_train_new = add_time_features(train_df.fillna(0), X_train)
X_test_new = add_time_features(test_df.fillna(0), X_test)

CPU times: user 1.63 s, sys: 124 ms, total: 1.75 s
Wall time: 1.75 s


In [22]:
X_train_new.shape, X_test_new.shape

((253561, 50004), (82797, 50004))

In [23]:
%%time
# performing time series cv
cv_scores = cross_val_score(
    logit, X_train_new, y_train, cv=time_split, scoring='roc_auc', n_jobs=1
)


CPU times: user 11min 4s, sys: 3.28 s, total: 11min 8s
Wall time: 1min 42s


In [24]:
cv_scores, cv_scores.mean()

(array([0.87651669, 0.75124453, 0.93061942, 0.97864183, 0.90399667,
        0.93831429, 0.96249244, 0.92731303, 0.94885535, 0.94043869]),
 0.9158432942419917)

In [25]:
logit.fit(X_train_new, y_train)

In [26]:
logit_test_pred2 = logit.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred2, 'subm2.csv')

In [27]:
# tuning the regularization parameter C
c_values = np.logspace(-2, 2, 10)

logit_grid_searcher = GridSearchCV(
    estimator=logit, param_grid={"C":c_values}, scoring='roc_auc',
    n_jobs=1, cv=time_split, verbose=1
)




In [None]:
%%time
logit_grid_searcher.fit(X_train_new, y_train) 

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [None]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

In [None]:
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred3, 'subm3.csv')