In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

from time import time

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [3]:
# Names of the ten per-session site columns. They are loaded as strings
# because the site ids are later joined into one text "document" per session.
sites = ['site{}'.format(k) for k in range(1, 11)]
site_dtypes = dict.fromkeys(sites, 'str')

df_train = pd.read_csv('train_sessions.csv', dtype=site_dtypes)

In [4]:
# Clean the ten (site, time) column pairs of the training frame in place.
site_cols = ['site%d' % k for k in range(1, 11)]
time_cols = ['time%d' % k for k in range(1, 11)]

for site_col, time_col in zip(site_cols, time_cols):
    # Missing site ids become the literal token 'na' (the vectorizer below
    # lists 'na' as a stop word).
    df_train[site_col] = df_train[site_col].fillna('na')

    # Timestamps are parsed into datetime values.
    df_train[time_col] = pd.to_datetime(df_train[time_col])

In [5]:
# Put the sessions in `time1` order and rebuild a 0..n-1 index. The hold-out
# split further down takes the LAST rows of this frame as validation data, so
# it relies on this ordering.
df_train = (
    df_train
    .sort_values(by='time1')
    .reset_index(drop=True)
)

In [6]:
# Build one space-separated string of the ten site ids per session
# (site1 ... site10, in that order) and store it in a new `sites` column.
# `df` is only an alias: the column is added to `df_train` itself.
df = df_train
site_cols = ['site%d' % k for k in range(1, 11)]
df['sites'] = df[site_cols[0]].str.cat([df[col] for col in site_cols[1:]], sep=' ')

In [63]:
# TF-IDF over the per-session `sites` strings.
#   * token_pattern: every run of non-whitespace characters is a token. It is
#     written as a raw string because '\S' in a normal string literal is an
#     invalid escape sequence (a warning today, an error in future Pythons).
#   * stop_words: the 'na' placeholder used for missing sites. Passed as a
#     list because recent scikit-learn versions validate this parameter and
#     reject a set; older versions accept a list just the same.
#   * ngram_range=(1, 3): single sites plus runs of 2 and 3 consecutive sites.
#   * norm=None: rows are left unnormalised.
cv = TfidfVectorizer(
    token_pattern=r'\S+',
    min_df=10,
    max_df=0.5,
    stop_words=['na'],
    ngram_range=(1, 3),
    norm=None,
)

# NOTE: despite its name, `X_ohe` holds TF-IDF weights, not one-hot indicators.
# The name is kept because later cells read it.
X_ohe = cv.fit_transform(df_train.sites)

In [64]:
# Feature matrix and label vector shared by the cells below.
X, y = X_ohe, df_train['target'].values

In [65]:
# Hold out the last tenth of the rows for validation and train on the rest.
# No shuffling: the split is purely positional.
n = len(df_train) // 10

train_rows = slice(None, -n)
val_rows = slice(-n, None)

X_train, y_train = X[train_rows], y[train_rows]
X_val, y_val = X[val_rows], y[val_rows]

In [68]:
# Keep only the training rows whose label equals 1.
is_positive = (y_train == 1)
A = X_train[is_positive]

In [110]:
# Pairwise dot products: entry (i, j) is the dot product of validation row i
# with positive training row j.
score = X_val.dot(A.transpose())

In [111]:
# Reduce the pairwise matrix to one number per validation row: the sum of its
# dot products with all positive training rows, flattened to a 1-D array.
score = np.asarray(score.sum(axis=1)).ravel()

In [108]:
# Alternative per-row score: the LARGEST dot product between a validation row
# and any positive training row.
# The product is recomputed from `X_val` and `A` instead of reusing `score`.
# When the notebook runs top to bottom, the preceding cell has already
# collapsed `score` to a 1-D array, and `score.max(axis=1).toarray()` fails on it.
score = X_val.dot(A.T).max(axis=1).toarray()[:, 0]

In [109]:
# Validation ROC AUC of the similarity score (displayed as the cell output).
roc_auc_score(y_true=y_val, y_score=score)

0.83232318703534625

In [112]:
from sklearn.decomposition import TruncatedSVD

In [113]:
# Fit a 100-component truncated SVD on the full feature matrix (training and
# validation rows together; no labels are involved), then project both splits.
svd = TruncatedSVD(n_components=100, random_state=1).fit(X)

X_train_svd, X_val_svd = (svd.transform(part) for part in (X_train, X_val))

In [68]:
# Sweep the inverse regularisation strength C of an L2-penalised logistic
# regression. For each value: fit on the training split, score the validation
# split by ROC AUC and report the wall-clock time.
#
# `LogisticRegression` comes from `sklearn.linear_model`; it must be imported
# with the other scikit-learn imports or this cell raises NameError on a
# fresh kernel.
#
# NOTE: the variable is called `svm` although it holds a logistic regression.
# The name is kept because the submission cell reads `svm`. After the loop it
# refers to the model fitted with the LAST value of C (5), trained on the
# training split only.
for C in [0.01, 0.1, 0.5, 1, 5]:
    t0 = time()

    svm = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1)
    svm.fit(X_train, y_train)

    # decision_function gives signed margins; AUC only needs a ranking.
    y_pred = svm.decision_function(X_val)
    auc = roc_auc_score(y_val, y_pred)

    print('C=%s, took %.3fs, auc=%.3f' % (C, time() - t0, auc))

C=0.01, took 1.593s, auc=0.960
C=0.1, took 2.597s, auc=0.975
C=0.5, took 4.062s, auc=0.980
C=1, took 4.334s, auc=0.981
C=5, took 6.926s, auc=0.981


In [76]:
# Load the test sessions with the site columns as strings, then clean them in
# the same way as the training frame.
df_test = pd.read_csv('test_sessions.csv', dtype=dict.fromkeys(sites, 'str'))

for k in range(1, 11):
    site_col, time_col = 'site%d' % k, 'time%d' % k

    # Missing site ids -> the 'na' placeholder token.
    df_test[site_col] = df_test[site_col].fillna('na')

    # Timestamps -> datetime values.
    df_test[time_col] = pd.to_datetime(df_test[time_col])

In [77]:
# Hour of day and day of week taken from the `time1` timestamp.
first_time = df_test['time1'].dt
df_test['hour_start'] = first_time.hour
df_test['weekday'] = first_time.weekday

In [78]:
# Same "document" construction as for the training data: the ten site ids of
# each test session joined with single spaces, in site1 ... site10 order.
# `df` is rebound as an alias of `df_test`, so the column lands on `df_test`.
df = df_test
site_cols = ['site%d' % k for k in range(1, 11)]
df['sites'] = df[site_cols[0]].str.cat([df[col] for col in site_cols[1:]], sep=' ')

In [79]:
# Vectorise the test sessions with the vocabulary and IDF weights already
# fitted on the training data (transform only; no refitting).
X_test_ohe = cv.transform(df_test.sites)

# The classifier in this notebook is fitted on the TF-IDF matrix alone
# (`X = X_ohe`), so the test matrix must have exactly those columns.
# This cell used to stack extra one-hot time features produced by a `time_ohe`
# encoder. No cell in the notebook creates `time_ohe`, so that raised
# NameError on a fresh kernel, and the extra columns would not have matched
# the number of features the model was trained on.
# NOTE(review): if time features are wanted, they have to be built for the
# training matrix as well, before the model is fitted.
X_test = X_test_ohe

In [81]:
# Score the test sessions and assemble the two-column submission frame.
pred = svm.decision_function(X_test)
df_res = pd.DataFrame({'session_id': df_test.session_id, 'target': pred})

In [83]:
# Write the submission directly as a gzip-compressed CSV. The resulting file
# has the same name that `gzip benchmark11.csv` would produce.
# Doing it in pandas avoids depending on a `gzip` executable, and re-running
# the cell simply overwrites the previous file. The shell command would
# instead stop on an already existing `benchmark11.csv.gz`.
df_res.to_csv('benchmark11.csv.gz', index=False, compression='gzip')

- CV: 0.928, LB: 0.92081
- CV: 0.981, LB: 0.94803