In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

from time import time

In [2]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
# Column names of the ten site slots of a session and of their timestamps.
sites = ['site%s' % idx for idx in range(1, 11)]
times = ['time%d' % idx for idx in range(1, 11)]

# Force every site column to be read as text instead of being parsed as numbers.
site_dtypes = dict.fromkeys(sites, 'str')
df_train = pd.read_csv('train_sessions.csv', dtype=site_dtypes)

In [4]:
# For each of the ten slots: replace missing site values with the placeholder
# token 'na' and parse the matching timestamp column into datetimes.
for idx in range(1, 11):
    site_col = 'site%d' % idx
    time_col = 'time%d' % idx

    df_train[site_col] = df_train[site_col].fillna('na')
    df_train[time_col] = pd.to_datetime(df_train[time_col])

In [5]:
# Gaps between consecutive timestamps: transpose so every session becomes a
# column, then diff down the rows (time2 - time1, time3 - time2, ...).
df_train_T = df_train[times].T
df_train_T_diff = df_train_T.diff()
# Blank out any gap whose own timestamp is missing.
df_train_T_diff[df_train_T.isnull()] = np.nan

# Per-session summary statistics of the gaps, in seconds; -1 marks sessions for
# which the statistic is undefined.
# `.dt.total_seconds()` is used instead of `.dt.seconds`: the latter is only the
# seconds *component* of a timedelta (0..86399) and silently drops whole days.
# The value is floored so the features stay on a whole-second scale, i.e. they
# are unchanged for every gap shorter than one day.
for stat in ('min', 'std', 'mean', 'max'):
    gap_stat = getattr(df_train_T_diff, stat)(axis=0)
    df_train['diff_' + stat] = np.floor(gap_stat.dt.total_seconds()).fillna(-1)

In [6]:
# Put the sessions in chronological order of their first timestamp and renumber
# the rows 0..N-1.
df_train = (
    df_train
    .sort_values(by='time1')
    .reset_index(drop=True)
)

In [7]:
# Build one space-separated string per session from its ten site columns
# (site1 ... site10, in order).
df = df_train
session_text = df[sites[0]]
for site_col in sites[1:]:
    session_text = session_text + ' ' + df[site_col]
df['sites'] = session_text

In [10]:
# TF-IDF over 1- to 3-grams of the whitespace-separated site tokens.
# * token_pattern is a raw string: '\S' is not a valid escape in a normal string
#   literal, which Python reports as a warning (and plans to make an error).
# * stop_words is a list: recent scikit-learn releases validate constructor
#   parameters and accept only 'english', a list or None here, so a set fails.
# The 'na' placeholder for missing sites is dropped as a stop word.
cv_tfidf13 = TfidfVectorizer(
    token_pattern=r'\S+',
    min_df=10,
    max_df=0.5,
    stop_words=['na'],
    ngram_range=(1, 3),
)
X_tfidf13 = cv_tfidf13.fit_transform(df_train.sites)

In [13]:
# Calendar features taken from the first timestamp of each session.
first_visit = df_train['time1'].dt
df_train['hour_start'] = first_visit.hour
df_train['weekday'] = first_visit.weekday

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
# One-hot encode start hour and weekday; the encoder is kept so the same
# mapping can be applied to other frames later.
calendar_cols = ['hour_start', 'weekday']
time_ohe = OneHotEncoder(dtype=np.uint8)
X_time = time_ohe.fit_transform(df_train[calendar_cols])

In [16]:
# Dense numeric block: start hour plus the four gap statistics.
numeric_cols = ['hour_start', 'diff_min', 'diff_max', 'diff_std', 'diff_mean']
X_num = df_train[numeric_cols].values

In [38]:
# Sparse design matrix: TF-IDF n-gram columns followed by the one-hot calendar
# columns, stored as CSR so it can be sliced by row.
X_sparse = sp.hstack((X_tfidf13, X_time), format='csr')
# Labels as a plain array, in the same row order as df_train.
y = df_train['target'].values

In [26]:
# Size of the holdout: one tenth of the rows (integer division).
n = len(df_train) // 10

# The last n rows are held out; everything before them is the training split.
y_train, y_test = y[:-n], y[-n:]

In [37]:
# Same positional split as the labels, applied to the sparse feature matrix.
X_train, X_test = X_sparse[:-n], X_sparse[-n:]

In [27]:
# Same positional split as the labels, applied to the dense numeric features.
X_train_num, X_test_num = X_num[:-n], X_num[-n:]

In [28]:
from sklearn.model_selection import KFold
# Materialise the ten shuffled folds once as a list of (train, validation)
# index pairs over the training split, so they can be iterated repeatedly.
kf = list(KFold(n_splits=10, shuffle=True, random_state=1).split(y_train))

In [62]:
import seaborn as sns
%matplotlib inline

In [60]:
# Out-of-fold decision scores of the logistic regression: each training row is
# scored by a model that was fit without it.
preds = np.zeros_like(y_train, dtype='float32')

C = 1.0

for fit_idx, val_idx in kf:
    started = time()

    fold_clf = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1)
    fold_clf.fit(X_train[fit_idx], y_train[fit_idx])

    val_scores = fold_clf.decision_function(X_train[val_idx])
    preds[val_idx] = val_scores
    fold_auc = roc_auc_score(y_train[val_idx], val_scores)

    elapsed = time() - started
    print('C=%s, took %.3fs, auc=%.3f' % (C, elapsed, fold_auc))

C=1.0, took 6.761s, auc=0.978
C=1.0, took 6.670s, auc=0.983
C=1.0, took 6.250s, auc=0.978
C=1.0, took 7.303s, auc=0.988
C=1.0, took 8.238s, auc=0.986
C=1.0, took 8.427s, auc=0.984
C=1.0, took 7.116s, auc=0.989
C=1.0, took 6.131s, auc=0.987
C=1.0, took 7.183s, auc=0.985
C=1.0, took 6.913s, auc=0.982


In [61]:
# Refit the logistic regression on the whole training split (`fit` returns the
# estimator itself) and score the held-out rows.
svm = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1).fit(X_train, y_train)
pred_test = svm.decision_function(X_test)

In [39]:
from sklearn.decomposition import TruncatedSVD

In [40]:
# Compress the sparse TF-IDF matrix into 100 dense components.
svd = TruncatedSVD(
    n_components=100,
    random_state=1,
)
X_svd = svd.fit_transform(X_tfidf13)

In [46]:
# Dense feature matrix for the tree model: SVD components next to the
# densified one-hot calendar columns.
X_time_dense = X_time.toarray()
X_et = np.concatenate([X_svd, X_time_dense], axis=1)

In [64]:
from sklearn.ensemble import ExtraTreesClassifier

In [53]:
# Hyperparameters of the first-level extra-trees model (deep trees).
et_params2 = {
    'n_estimators': 50,
    'criterion': 'gini',
    'max_depth': 50,
    'min_samples_split': 6,
    'min_samples_leaf': 6,
    'max_features': 10,
    'bootstrap': False,
    'n_jobs': -1,
    'random_state': 1,
}

In [54]:
# Out-of-fold positive-class probabilities of the extra-trees model, using the
# same folds as the other first-level model.
preds_et = np.zeros_like(y_train, dtype='float32')

for fit_idx, val_idx in kf:
    started = time()

    fold_et = ExtraTreesClassifier(**et_params2)
    fold_et.fit(X_et[fit_idx], y_train[fit_idx])

    val_proba = fold_et.predict_proba(X_et[val_idx])[:, 1]
    preds_et[val_idx] = val_proba
    fold_auc = roc_auc_score(y_train[val_idx], val_proba)

    elapsed = time() - started
    print('took %.3fs, auc=%.3f' % (elapsed, fold_auc))

took 10.132s, auc=0.975
took 10.105s, auc=0.982
took 10.211s, auc=0.981
took 10.215s, auc=0.988
took 10.031s, auc=0.980
took 10.043s, auc=0.978
took 10.229s, auc=0.987
took 10.241s, auc=0.985
took 10.124s, auc=0.988
took 10.033s, auc=0.985


In [64]:
# Refit the extra-trees model on every row except the last n (the holdout).
X_et_train = X_et[:-n]
et = ExtraTreesClassifier(**et_params2)
et.fit(X_et_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=50, max_features=10, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=6,
           min_samples_split=6, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=-1, oob_score=False, random_state=1,
           verbose=0, warm_start=False)

In [65]:
# Positive-class probability for the last n rows (the holdout).
X_et_holdout = X_et[-n:]
pred_et_test = et.predict_proba(X_et_holdout)[:, 1]

In [83]:
# Hyperparameters of the second-level extra-trees model (shallow trees).
et_params = {
    'n_estimators': 100,
    'criterion': 'gini',
    'max_depth': 5,
    'min_samples_split': 6,
    'min_samples_leaf': 6,
    'max_features': 3,
    'bootstrap': False,
    'n_jobs': -1,
    'random_state': 1,
}

In [70]:
# Second-level training features: the two out-of-fold prediction vectors as
# single columns, followed by the dense numeric features.
X_second = np.column_stack([preds, preds_et, X_train_num])

In [71]:
# Second-level holdout features, in the same column order as X_second:
# first-level scores, then the dense numeric features.
X_second_test = np.column_stack([pred_test, pred_et_test, X_test_num])

In [84]:
# Second-level model: extra trees fit on the first-level predictions plus the
# numeric features of the training split.
et = ExtraTreesClassifier(**et_params)
et.fit(X=X_second, y=y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=5, max_features=3, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=6,
           min_samples_split=6, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=1,
           verbose=0, warm_start=False)

In [85]:
# ROC AUC of the second-level model on the holdout rows.
holdout_proba = et.predict_proba(X_second_test)
y_pred = holdout_proba[:, 1]
roc_auc_score(y_test, y_pred)

0.9780442064113154

In [86]:
# NOTE(review): this cell repeats the holdout evaluation of the cell above, yet
# the two recorded outputs differ; presumably `et` or its inputs were refit in
# between, out of order. Confirm with Restart & Run All.
y_pred = et.predict_proba(X_second_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred)

0.98274939133081174

Full model

In [87]:
# Ten shuffled folds over the complete labelled set, materialised as a list of
# (train, validation) index pairs.
# The original referenced `X`, which is never defined in this notebook and
# raises NameError on a fresh kernel. The final model is later applied to a
# TF-IDF + one-hot-time matrix, so the matching training matrix is X_sparse.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
kf = list(kf.split(X_sparse, y))

In [89]:
# Out-of-fold logistic-regression scores for every labelled row; these become
# a feature of the second-level model.
# Uses X_sparse (TF-IDF + one-hot time): the name `X` the original indexed is
# never defined in this notebook and raises NameError on a fresh kernel.
preds = np.zeros_like(y, dtype='float32')

C = 1.0

for fit_idx, val_idx in kf:
    fold_clf = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1)
    fold_clf.fit(X_sparse[fit_idx], y[fit_idx])

    preds[val_idx] = fold_clf.decision_function(X_sparse[val_idx])

In [90]:
# Final first-level model, fit on all labelled rows.
# Fit on X_sparse (TF-IDF + one-hot time): the original passed `X`, which is
# never defined in this notebook. The test matrix scored later is built from
# the same two feature blocks, so the column counts match.
svm = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1)
svm.fit(X_sparse, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [91]:
# Second-level features for the full labelled set: the out-of-fold scores as
# one column, followed by the dense numeric features.
X_second = np.column_stack([preds, X_num])

In [92]:
# Final second-level model, fit on all labelled rows.
# NOTE(review): the recorded output of this cell shows max_depth=15 and
# max_features=2, which does not match `et_params` as defined in this notebook
# (max_depth=5, max_features=3); presumably the dict was edited and re-run out
# of order. Confirm which setting produced the submission.
et = ExtraTreesClassifier(**et_params)
et.fit(X=X_second, y=y)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=15, max_features=2, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=6,
           min_samples_split=6, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=1,
           verbose=0, warm_start=False)

Test

In [93]:
# Load the test sessions with the site columns forced to text.
df_test = pd.read_csv('test_sessions.csv', dtype=dict.fromkeys(sites, 'str'))

# Same cleaning as for the training frame: 'na' for missing sites, parsed
# datetimes for the timestamp columns.
for idx in range(1, 11):
    site_col = 'site%d' % idx
    time_col = 'time%d' % idx

    df_test[site_col] = df_test[site_col].fillna('na')
    df_test[time_col] = pd.to_datetime(df_test[time_col])

In [94]:
# Calendar features taken from the first timestamp of each test session.
first_visit_test = df_test['time1'].dt
df_test['hour_start'] = first_visit_test.hour
df_test['weekday'] = first_visit_test.weekday

In [95]:
# Build one space-separated string per test session from its ten site columns
# (site1 ... site10, in order).
df = df_test
session_text = df['site1']
for slot in range(2, 11):
    session_text = session_text + ' ' + df['site%d' % slot]
df['sites'] = session_text

In [96]:
# Apply the already-fitted transformers to the test frame (transform only, no
# refitting). Note that X_test_ohe holds the TF-IDF output despite its name.
test_calendar = df_test[['hour_start', 'weekday']]
X_test_ohe = cv_tfidf13.transform(df_test['sites'])
X_test_time = time_ohe.transform(test_calendar)

In [97]:
# Test design matrix: TF-IDF columns followed by the one-hot calendar columns.
# This rebinds the name X_test.
X_test = sp.hstack((X_test_ohe, X_test_time), format='csr')

In [98]:
# First-level decision scores for the test sessions. This rebinds `preds`.
preds = svm.decision_function(X=X_test)

In [99]:
# Gaps between consecutive timestamps: transpose so every session becomes a
# column, then diff down the rows (time2 - time1, time3 - time2, ...).
df_test_T = df_test[times].T
df_test_T_diff = df_test_T.diff()
# Blank out any gap whose own timestamp is missing.
df_test_T_diff[df_test_T.isnull()] = np.nan

# Per-session summary statistics of the gaps, in seconds; -1 marks sessions for
# which the statistic is undefined.
# `.dt.total_seconds()` is used instead of `.dt.seconds`: the latter is only the
# seconds *component* of a timedelta (0..86399) and silently drops whole days.
# The value is floored so the features stay on a whole-second scale, i.e. they
# are unchanged for every gap shorter than one day.
for stat in ('min', 'std', 'mean', 'max'):
    gap_stat = getattr(df_test_T_diff, stat)(axis=0)
    df_test['diff_' + stat] = np.floor(gap_stat.dt.total_seconds()).fillna(-1)

In [100]:
# Dense numeric block for the test sessions: start hour plus the four gap
# statistics, in that column order.
X_num_test = df_test.loc[:, ['hour_start', 'diff_min', 'diff_max', 'diff_std', 'diff_mean']].values

In [101]:
# Second-level test features: first-level scores as one column, followed by
# the dense numeric features.
X_second_test = np.column_stack([preds, X_num_test])

In [102]:

# Submission frame: the session id next to the second-level model's
# positive-class probability.
target_proba = et.predict_proba(X_second_test)[:, 1]
df_res = pd.DataFrame({
    'session_id': df_test.session_id,
    'target': target_proba,
})

In [103]:
# Write the submission directly as a gzip-compressed CSV.
# The original wrote et01.csv and then shelled out to `gzip`. That needs an
# external binary, and on a re-run gzip declines to overwrite an existing
# et01.csv.gz, leaving a stale archive. Letting pandas compress gives the same
# et01.csv.gz and overwrites it on every run.
df_res.to_csv('et01.csv.gz', index=False, compression='gzip')