In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

from time import time

In [2]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
# Names of the ten site columns and the ten matching timestamp columns.
slot_numbers = range(1, 11)
sites = ['site%s' % k for k in slot_numbers]
times = ['time%d' % k for k in slot_numbers]

# Load the training sessions, forcing every site column to be read as a string.
site_dtypes = dict.fromkeys(sites, 'str')
df_train = pd.read_csv('train_sessions.csv', dtype=site_dtypes)

In [4]:
# Empty site slots get the placeholder token 'na'; the timestamp columns are
# parsed into datetimes.
for site_col, time_col in zip(sites, times):
    df_train[site_col] = df_train[site_col].fillna('na')
    df_train[time_col] = pd.to_datetime(df_train[time_col])

In [5]:
# Transpose so every column is one session and every row one timestamp slot;
# diff() then yields the gap between consecutive timestamps of a session.
df_train_T = df_train[times].T
df_train_T_diff = df_train_T.diff()
# Blank out gaps whose own timestamp is missing.
df_train_T_diff[df_train_T.isnull()] = np.nan

# Per-session gap statistics, with -1 where no value is available.
# NOTE(review): `.dt.seconds` is the seconds component of the timedelta
# (whole days are not included) — confirm gaps never reach a day.
for stat in ('min', 'std', 'mean', 'max'):
    per_session = getattr(df_train_T_diff, stat)(axis=0)
    df_train['diff_' + stat] = per_session.dt.seconds.fillna(-1)

In [6]:
# Put sessions in chronological order of their first timestamp, then renumber
# the rows so positional slicing follows that order.
df_train = df_train.sort_values(by='time1')
df_train = df_train.reset_index(drop=True)

In [7]:
# Build one space-separated "document" per session out of its ten site tokens.
df = df_train
session_text = df[sites[0]]
for site_col in sites[1:]:
    session_text = session_text + ' ' + df[site_col]
df['sites'] = session_text

In [8]:
# TF-IDF over 1- to 3-grams of the whitespace-separated site tokens.
# - The pattern is a raw string: '\S' in a plain string literal is an invalid
#   escape sequence and triggers a warning on modern Python.
# - stop_words is passed as a list (the documented type) instead of a set; it
#   drops the 'na' placeholder used for empty site slots.
cv = TfidfVectorizer(
    token_pattern=r'\S+',
    min_df=10,
    max_df=0.5,
    stop_words=['na'],
    ngram_range=(1, 3),
)
X_ohe = cv.fit_transform(df_train.sites)

In [9]:
# Hour of day and day of week at which each session starts.
session_start = df_train.time1.dt
df_train['hour_start'] = session_start.hour
df_train['weekday'] = session_start.weekday

In [10]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
# One-hot encode the session start hour and weekday.
# handle_unknown='ignore' makes a later transform() on data containing an
# hour/weekday value absent from the training set produce an all-zero group
# instead of raising.
time_ohe = OneHotEncoder(dtype=np.uint8, handle_unknown='ignore')
X_time = time_ohe.fit_transform(df_train[['hour_start', 'weekday']])

In [12]:
# Dense numeric features: start hour plus the four time-gap statistics.
numeric_cols = ['hour_start', 'diff_min', 'diff_max', 'diff_std', 'diff_mean']
X_num = df_train[numeric_cols].values

In [13]:
# Target vector and the sparse design matrix (TF-IDF columns followed by the
# one-hot time columns).
y = df_train['target'].values
X_sparse = sp.hstack((X_ohe, X_time), format='csr')

In [14]:
# Hold out the last tenth of the rows for evaluation; everything before it is
# used for training.
n = len(df_train) // 10

X_train, X_test = X_sparse[:-n], X_sparse[-n:]
y_train, y_test = y[:-n], y[-n:]

In [15]:
# Same train / hold-out split for the dense numeric features.
X_train_num, X_test_num = X_num[:-n], X_num[-n:]

In [16]:
from sklearn.model_selection import KFold

In [17]:
import seaborn as sns
%matplotlib inline

In [18]:
# Materialise the ten shuffled (train, validation) index pairs once so that
# every model below is evaluated on exactly the same folds.
kf = list(
    KFold(n_splits=10, shuffle=True, random_state=1).split(X_train, y_train)
)

In [19]:
# Out-of-fold decision scores of the logistic regression on the training part.
preds_lr = np.zeros_like(y_train, dtype='float32')

C = 1.0

for train_idx, val_idx in kf:
    t0 = time()

    # The variable is called `svm` but holds a LogisticRegression model.
    svm = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1)
    svm.fit(X_train[train_idx], y_train[train_idx])

    # Score the held-out fold and store the scores at their original positions.
    y_pred = svm.decision_function(X_train[val_idx])
    preds_lr[val_idx] = y_pred
    auc = roc_auc_score(y_train[val_idx], y_pred)

    elapsed = time() - t0
    print('C=%s, took %.3fs, auc=%.3f' % (C, elapsed, auc))

C=1.0, took 2.414s, auc=0.970
C=1.0, took 2.292s, auc=0.977
C=1.0, took 2.379s, auc=0.974
C=1.0, took 2.235s, auc=0.987
C=1.0, took 2.364s, auc=0.979
C=1.0, took 2.308s, auc=0.975
C=1.0, took 2.392s, auc=0.984
C=1.0, took 2.172s, auc=0.982
C=1.0, took 2.325s, auc=0.983
C=1.0, took 2.052s, auc=0.980


In [20]:
# Refit the logistic regression on the whole training part and score the
# hold-out rows.
svm = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1)
svm = svm.fit(X_train, y_train)

pred_lr_test = svm.decision_function(X_test)

In [29]:
# Hold-out ROC AUC of the logistic regression.
roc_auc_score(y_true=y_test, y_score=pred_lr_test)

0.98141339869281052

In [21]:
from sklearn.decomposition import TruncatedSVD

In [22]:
# Compress the sparse TF-IDF matrix to 100 dense components.
svd = TruncatedSVD(random_state=1, n_components=100)
X_svd = svd.fit_transform(X=X_ohe)

In [23]:
# Dense feature matrix for the tree model: SVD components, one-hot time
# columns (densified) and the numeric features, side by side.
X_time_dense = X_time.toarray()
X_et = np.hstack((X_svd, X_time_dense, X_num))

In [24]:
# Train / hold-out split of the dense matrix, matching the sparse split.
X_et_train, X_et_test = X_et[:-n], X_et[-n:]


In [25]:
from sklearn.ensemble import ExtraTreesClassifier

In [27]:
# Hyper-parameters of the ExtraTrees model trained on the dense feature matrix.
et_params2 = {
    'n_estimators': 50,
    'criterion': 'gini',
    'max_depth': 50,
    'min_samples_split': 6,
    'min_samples_leaf': 6,
    'max_features': 10,
    'bootstrap': False,
    'n_jobs': -1,
    'random_state': 1,
}

In [28]:
# Out-of-fold positive-class probabilities of ExtraTrees on the dense features,
# using the same folds as the logistic regression.
preds_et = np.zeros_like(y_train, dtype='float32')

for train_idx, val_idx in kf:
    t0 = time()

    et = ExtraTreesClassifier(**et_params2)
    et.fit(X_et_train[train_idx], y_train[train_idx])

    # Column 1 of predict_proba is the probability of the positive class.
    fold_proba = et.predict_proba(X_et_train[val_idx])
    y_pred = fold_proba[:, 1]
    preds_et[val_idx] = y_pred
    auc = roc_auc_score(y_train[val_idx], y_pred)

    elapsed = time() - t0
    print('took %.3fs, auc=%.3f' % (elapsed, auc))

took 9.638s, auc=0.972
took 9.528s, auc=0.979
took 9.638s, auc=0.980
took 9.928s, auc=0.987
took 9.429s, auc=0.981
took 9.733s, auc=0.977
took 9.823s, auc=0.988
took 9.688s, auc=0.984
took 9.720s, auc=0.982
took 9.521s, auc=0.982


In [32]:
# Refit ExtraTrees on the whole training part of the dense matrix.
et = ExtraTreesClassifier(**et_params2)
et.fit(X=X_et_train, y=y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=50, max_features=10, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=6,
           min_samples_split=6, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=-1, oob_score=False, random_state=1,
           verbose=0, warm_start=False)

In [33]:
# Hold-out positive-class probabilities of ExtraTrees and their ROC AUC.
holdout_proba = et.predict_proba(X_et_test)
pred_et_test = holdout_proba[:, 1]
roc_auc_score(y_true=y_test, y_score=pred_et_test)

0.96149450325831243

In [26]:
# Hyper-parameters of the second-level (stacking) ExtraTrees model.
et_params = {
    'n_estimators': 100,
    'criterion': 'gini',
    'max_depth': 15,
    'min_samples_split': 6,
    'min_samples_leaf': 6,
    'max_features': 2,
    'bootstrap': False,
    'n_jobs': -1,
    'random_state': 1,
}

In [74]:
# Second-level training features: the out-of-fold logistic-regression scores
# as a single column, followed by the numeric features of the training rows.
# Uses `preds_lr`; the bare name `preds` is not defined at this point of a
# top-to-bottom run.
X_second = np.hstack([preds_lr.reshape(-1, 1), X_train_num])

In [82]:
# Second-level hold-out features: logistic-regression scores of the hold-out
# rows plus their numeric features. Uses `pred_lr_test`; the name `pred_test`
# is never defined in this notebook.
X_second_test = np.hstack([pred_lr_test.reshape(-1, 1), X_test_num])

In [77]:
# Fit the second-level ExtraTrees on the stacked training features.
et = ExtraTreesClassifier(**et_params)
et.fit(X=X_second, y=y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=15, max_features=2, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=6,
           min_samples_split=6, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=1,
           verbose=0, warm_start=False)

In [86]:
# Hold-out ROC AUC of the stacked model (positive-class probability).
stacked_proba = et.predict_proba(X_second_test)
y_pred = stacked_proba[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred)

0.98274939133081174

Full model

In [87]:
# Folds over the complete labelled data set for the final model.
# `X_sparse` is the full sparse design matrix; the bare name `X` is never
# defined in this notebook.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
kf = list(kf.split(X_sparse, y))

In [89]:
# Out-of-fold logistic-regression scores over the complete labelled data set.
# Indexes `X_sparse` (the full sparse design matrix); the bare name `X` is
# never defined in this notebook.
preds = np.zeros_like(y, dtype='float32')

C = 1.0

for train, val in kf:
    svm = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1)
    svm.fit(X_sparse[train], y[train])

    # Store each fold's scores at the positions of its validation rows.
    preds[val] = svm.decision_function(X_sparse[val])

In [90]:
# Final first-level model: logistic regression fitted on all labelled rows.
# Fits on `X_sparse`; the bare name `X` is never defined in this notebook.
svm = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1)
svm.fit(X_sparse, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [91]:
# Second-level features for all labelled rows: the score column followed by
# the numeric features.
score_column = preds.reshape(-1, 1)
X_second = np.hstack((score_column, X_num))

In [92]:
# Final second-level model fitted on all labelled rows.
et = ExtraTreesClassifier(**et_params)
et.fit(X=X_second, y=y)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=15, max_features=2, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=6,
           min_samples_split=6, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=1,
           verbose=0, warm_start=False)

Test

In [93]:
# Load the test sessions with site columns read as strings, then clean them
# the same way as the training data: 'na' for empty site slots and parsed
# timestamps.
site_dtypes = dict.fromkeys(sites, 'str')
df_test = pd.read_csv('test_sessions.csv', dtype=site_dtypes)

for site_col, time_col in zip(sites, times):
    df_test[site_col] = df_test[site_col].fillna('na')
    df_test[time_col] = pd.to_datetime(df_test[time_col])

In [94]:
# Hour of day and day of week at which each test session starts.
test_start = df_test.time1.dt
df_test['hour_start'] = test_start.hour
df_test['weekday'] = test_start.weekday

In [95]:
# Build one space-separated "document" per test session from its ten site
# tokens.
df = df_test
session_text = df[sites[0]]
for site_col in sites[1:]:
    session_text = session_text + ' ' + df[site_col]
df['sites'] = session_text

In [96]:
# Encode the test sessions with the vectorizer and the one-hot encoder that
# were fitted on the training data (transform only, no refit).
start_cols = ['hour_start', 'weekday']
X_test_ohe = cv.transform(df_test['sites'])
X_test_time = time_ohe.transform(df_test[start_cols])

In [97]:
# Sparse design matrix of the test file; this rebinds `X_test`, which earlier
# held the hold-out slice of the training data.
X_test = sp.hstack((X_test_ohe, X_test_time), format='csr')

In [98]:
# First-level scores for the test file; this rebinds `preds`, which earlier
# held the out-of-fold scores.
preds = svm.decision_function(X=X_test)

In [99]:
# Transpose so every column is one session and every row one timestamp slot;
# diff() then yields the gap between consecutive timestamps of a session.
df_test_T = df_test[times].T
df_test_T_diff = df_test_T.diff()
# Blank out gaps whose own timestamp is missing.
df_test_T_diff[df_test_T.isnull()] = np.nan

# Per-session gap statistics, with -1 where no value is available.
# NOTE(review): `.dt.seconds` is the seconds component of the timedelta
# (whole days are not included) — confirm gaps never reach a day.
for stat in ('min', 'std', 'mean', 'max'):
    per_session = getattr(df_test_T_diff, stat)(axis=0)
    df_test['diff_' + stat] = per_session.dt.seconds.fillna(-1)

In [100]:
# Dense numeric features of the test file, in the same column order as X_num.
numeric_cols = ['hour_start', 'diff_min', 'diff_max', 'diff_std', 'diff_mean']
X_num_test = df_test[numeric_cols].values

In [101]:
# Second-level features of the test file: score column followed by the
# numeric features.
score_column = preds.reshape(-1, 1)
X_second_test = np.hstack((score_column, X_num_test))

In [102]:

# Submission frame: session id plus the positive-class probability from the
# second-level model.
test_proba = et.predict_proba(X_second_test)
df_res = pd.DataFrame({
    'session_id': df_test.session_id,
    'target': test_proba[:, 1],
})

In [103]:
# Write the gzip-compressed submission directly from pandas. The previous
# `!gzip et01.csv` shell step refused to overwrite an existing et01.csv.gz on
# re-run and required a gzip binary on the machine; the resulting file name
# (et01.csv.gz) is unchanged.
df_res.to_csv('et01.csv.gz', index=False, compression='gzip')