In [4]:
# import libraries
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [5]:
# a helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV file with a 1-based row id column.

    Parameters
    ----------
    predicted_labels : array-like
        One prediction per row. Lists, NumPy arrays and pandas Series are
        accepted; values are written in positional order.
    out_file : str or file-like
        Destination passed straight to ``DataFrame.to_csv``.
    target : str, default 'target'
        Header of the prediction column.
    index_label : str, default 'session_id'
        Header of the id column (ids run from 1 to the number of rows).
    """
    # Convert up front: a plain list has no ``.shape``, and a pandas Series
    # would otherwise be re-aligned against the new 1..N index and column
    # name instead of being written positionally.
    predictions = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(predictions,
                                index=np.arange(1, predictions.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

**Read training and test sets, sort train set by session start time.**

In [7]:
# Load the session tables, using the session id column as the index
train_df = pd.read_csv('../../data/train_sessions.csv',
                       index_col='session_id')
test_df = pd.read_csv('../../data/test_sessions.csv',
                      index_col='session_id')

# Convert time1, ..., time10 columns to datetime type
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Sort the training data by the first timestamp of each session;
# the test set keeps its original row order
train_df = train_df.sort_values(by='time1')

# Look at the first rows of the training set
train_df.head()

FileNotFoundError: ignored

**Transform data into a format which can be fed into `CountVectorizer`**

In [None]:
# Names of the ten site-id columns: site1 ... site10
sites = ['site%s' % n for n in range(1, 11)]

# Write every session as one space-separated line of integer site ids
# (missing ids are filled with 0), with neither an index nor a header row.
for sessions_df, text_path in ((train_df, 'train_sessions_text.txt'),
                               (test_df, 'test_sessions_text.txt')):
    site_ids = sessions_df[sites].fillna(0).astype('int')
    site_ids.to_csv(text_path, sep=' ', index=None, header=None)

In [None]:
# Show the first five lines of the generated training text file (shell command)
!head -5 train_sessions_text.txt

**Fit `CountVectorizer` and transform data with it.**

In [None]:
%%time
# Treat every line of the text files (one session per line) as a document
# whose tokens are site ids, and build sparse count matrices from them.
# NOTE(review): with default settings CountVectorizer keeps only tokens of two
# or more characters, so single-digit site ids (and the 0 used to fill missing
# sites) would be ignored - confirm this is intended.
cv = CountVectorizer()
# Learn the vocabulary from the training sessions only ...
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
# ... and reuse that fitted vocabulary for the test sessions
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
print(X_train.shape, X_test.shape)

**Save train targets into a separate vector.**

In [None]:
# Training labels as integers; taken from the time-sorted train_df, i.e. the
# same row order that was written to the text file X_train was built from
y_train = train_df['target'].astype('int')

### Train logistic regression

In [None]:
# Logistic regression with C=1 and a fixed random_state; this same estimator
# object is reused for both feature sets below
logit = LogisticRegression(C=1, random_state=17)

In [None]:
%%time
# 5-fold cross-validated ROC AUC for the site-count features only
cv_scores = cross_val_score(logit, X_train, y_train, cv=5, scoring='roc_auc')

In [None]:
# Per-fold ROC AUC values
cv_scores

In [None]:
# Mean ROC AUC across the folds
cv_scores.mean()

In [None]:
%%time
# Fit on the whole training set (cross_val_score above does not fit `logit` itself)
logit.fit(X_train, y_train)

In [None]:
# Keep the second column of predict_proba as the score for each test session
# (presumably the probability of the positive class - confirm label encoding)
test_pred_logit1 = logit.predict_proba(X_test)[:, 1]

In [None]:
# Inspect the predicted scores
test_pred_logit1

In [None]:
# Sites-only model; the author's recorded cross-validation score is 0.885
write_to_submission_file(test_pred_logit1, 'logit_subm1.txt') ## .908 ROC AUC Public LB

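### Time Features
 - hour when the session started (used only to derive the four indicators below)
 - morning (hours 7–11)
 - day (hours 12–18)
 - eve (hours 19–23)
 - night (hours 0–6)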

In [None]:
def add_time_features(time1_series, X_sparse):
    """Append four part-of-day indicator columns to a sparse feature matrix.

    The starting hour is read from every element of ``time1_series`` and
    turned into 0/1 columns, appended in this order: morning (7-11),
    day (12-18), evening (19-23), night (0-6). Both bounds are inclusive.

    Parameters
    ----------
    time1_series : pandas.Series
        One timestamp-like value (exposing ``.hour``) per row of ``X_sparse``.
    X_sparse : sparse matrix
        Existing features; left unmodified.

    Returns
    -------
    Sparse matrix with the columns of ``X_sparse`` followed by the four
    indicator columns.
    """
    start_hour = time1_series.apply(lambda stamp: stamp.hour)
    # Inclusive (first_hour, last_hour) ranges, listed in output column order
    day_parts = [(7, 11), (12, 18), (19, 23), (0, 6)]
    indicator_columns = [
        start_hour.between(first, last).astype('int').values.reshape(-1, 1)
        for first, last in day_parts
    ]
    return hstack([X_sparse] + indicator_columns)

In [None]:
# Quick look at the starting hour of the first few test sessions
# NOTE(review): fillna(0) would put an integer into a timestamp column if any
# time1 were missing, and an int has no `.hour` - confirm time1 is never null
test_df.loc[:, 'time1'].fillna(0).apply(lambda ts: ts.hour).head()

In [None]:
%%time
# Extend both site-count matrices with the part-of-day indicator columns
# NOTE(review): fillna(0) replaces a missing time1 with the integer 0 rather
# than a timestamp; add_time_features reads `.hour` from every element, so this
# presumably relies on time1 never being null - confirm
X_train_with_time = add_time_features(train_df['time1'].fillna(0), X_train)
X_test_with_time = add_time_features(test_df['time1'].fillna(0), X_test)

In [None]:
# Shapes after appending the time columns (four more columns than X_train / X_test)
X_train_with_time.shape, X_test_with_time.shape

In [None]:
%%time
# Cross-validated ROC AUC with the time indicators added
# NOTE(review): this uses cv=6 while the sites-only model above used cv=5, so
# the two mean scores come from different splits - confirm this is intended
cv_scores = cross_val_score(logit, X_train_with_time, y_train, cv=6, scoring='roc_auc')

In [None]:
# Per-fold ROC AUC values for the model with time features
cv_scores

In [None]:
# Mean ROC AUC across the folds for the model with time features
cv_scores.mean()

In [None]:
%%time
# Refit the same estimator on the full training set with time features;
# this replaces the sites-only fit held by `logit`
logit.fit(X_train_with_time, y_train)

In [None]:
# Keep the second column of predict_proba as the score for each test session
# (presumably the probability of the positive class - confirm label encoding)
test_pred_logit2 = logit.predict_proba(X_test_with_time)[:, 1]

In [None]:
# Inspect the predicted scores
test_pred_logit2

In [None]:
# Sites + time-of-day model; the author's recorded cross-validation score is .9307
write_to_submission_file(test_pred_logit2, 'logit_subm2.txt') ## ROC AUC 0.93565 Public LB