In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

from time import time

In [9]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
# Names of the ten per-session site columns: site1 .. site10.
sites = ['site%s' % idx for idx in range(1, 11)]

# Read the training sessions, forcing every site column to be parsed as a string.
df_train = pd.read_csv('train_sessions.csv', dtype=dict.fromkeys(sites, 'str'))

In [5]:
# Clean the ten site/time column pairs of the training frame.
for idx in range(1, 11):
    site_col, time_col = 'site%d' % idx, 'time%d' % idx

    # Missing site values are replaced by the placeholder token 'na'.
    df_train[site_col] = df_train[site_col].fillna('na')
    # Timestamp columns are converted to pandas datetimes.
    df_train[time_col] = pd.to_datetime(df_train[time_col])

In [6]:
# Order the rows by the first timestamp of each session, then renumber the index
# from zero so positional slices follow that order.
df_train = df_train.sort_values(by='time1')
df_train = df_train.reset_index(drop=True)

In [7]:
# Build one space-separated string of the ten site tokens per row.
df = df_train  # alias: both names refer to the same frame, so df_train gains the column
session_text = df['site1']
for k in range(2, 11):
    # Append the next site column, separated by a single space.
    session_text = session_text + ' ' + df['site%d' % k]
df['sites'] = session_text

In [19]:
# Count-based bag of site tokens.
# - token_pattern is a raw string: '\S+' in a normal string literal is an invalid
#   escape sequence and triggers a warning on current Python versions.
# - stop_words is a list: recent scikit-learn releases validate this parameter and
#   reject a set.
# NOTE: `cv` and `X_ohe` are reassigned by later cells of this notebook.
cv = CountVectorizer(token_pattern=r'\S+', min_df=10, max_df=0.5, stop_words=['na'])
X_ohe = cv.fit_transform(df_train.sites)

In [24]:
# Count-based bag of site tokens, including 2- and 3-grams of consecutive tokens.
# - token_pattern is a raw string: '\S+' in a normal string literal is an invalid
#   escape sequence and triggers a warning on current Python versions.
# - stop_words is a list: recent scikit-learn releases validate this parameter and
#   reject a set.
# NOTE: `cv` and `X_ohe` are reassigned by a later cell of this notebook.
cv = CountVectorizer(
    token_pattern=r'\S+',
    min_df=10,
    max_df=0.5,
    stop_words=['na'],
    ngram_range=(1, 3),
)
X_ohe = cv.fit_transform(df_train.sites)

In [29]:
# TF-IDF weighted 1- to 3-grams of the per-session site tokens.
# - token_pattern is a raw string: '\S+' in a normal string literal is an invalid
#   escape sequence and triggers a warning on current Python versions.
# - stop_words is a list: recent scikit-learn releases validate this parameter and
#   reject a set.
# The fitted `cv` is reused further down to transform the test sessions.
cv = TfidfVectorizer(
    token_pattern=r'\S+',
    min_df=10,
    max_df=0.5,
    stop_words=['na'],
    ngram_range=(1, 3),
)
X_ohe = cv.fit_transform(df_train.sites)

In [62]:
# Derive two calendar features from the first timestamp of each training session.
train_start = df_train['time1'].dt
df_train['hour_start'] = train_start.hour
df_train['weekday'] = train_start.weekday

In [63]:
from sklearn.preprocessing import OneHotEncoder

In [64]:
# One-hot encode start hour and weekday as uint8 indicator columns.
# handle_unknown='ignore': this encoder is applied to the test sessions later in
# the notebook; an hour/weekday value absent from the training data then encodes
# as all zeros for that feature instead of raising during transform.
time_ohe = OneHotEncoder(dtype=np.uint8, handle_unknown='ignore')
X_time = time_ohe.fit_transform(df_train[['hour_start', 'weekday']])

In [66]:
# Target vector as a plain NumPy array.
y = df_train['target'].values

# Column-wise stack: site-text features, one-hot time features, raw start hour.
feature_blocks = [X_ohe, X_time, df_train[['hour_start']]]
X = sp.hstack(feature_blocks, format='csr')

In [67]:
# Hold out the last tenth of the rows as a validation set. df_train was sorted by
# `time1` earlier in the notebook, so these are the most recent sessions.
n = len(df_train) // 10

# Use an explicit split position rather than negative slicing: with n == 0
# (fewer than 10 rows) `X[:-n]` would be empty and `X[-n:]` would be everything.
split_at = len(df_train) - n

X_train = X[:split_at]
y_train = y[:split_at]

X_val = X[split_at:]
y_val = y[split_at:]

In [68]:
# Sweep the inverse regularisation strength C for an L2-penalised logistic
# regression and report validation ROC AUC plus wall-clock time per setting.
for C in [0.01, 0.1, 0.5, 1, 5]:
    t0 = time()

    # NOTE: despite the variable name, this is a logistic regression, not an SVM.
    # The solver is pinned to liblinear: the library default changed in later
    # scikit-learn releases, and `dual` only applies to liblinear.
    svm = LogisticRegression(penalty='l2', solver='liblinear', dual=False, C=C, random_state=1)
    svm.fit(X_train, y_train)

    # AUC only needs a ranking, so raw decision scores are sufficient.
    y_pred = svm.decision_function(X_val)
    auc = roc_auc_score(y_val, y_pred)

    print('C=%s, took %.3fs, auc=%.3f' % (C, time() - t0, auc))

C=0.01, took 1.593s, auc=0.960
C=0.1, took 2.597s, auc=0.975
C=0.5, took 4.062s, auc=0.980
C=1, took 4.334s, auc=0.981
C=5, took 6.926s, auc=0.981


In [87]:
# Sweep C for a simple blend: the sum of decision scores from an L2-penalised and
# an L1-penalised logistic regression, scored by validation ROC AUC.
for C in [0.01, 0.1, 0.5, 1, 5]:
    t0 = time()

    # The solver is pinned to liblinear on both models: the current default
    # solver (lbfgs) does not support the L1 penalty, so the L1 model would
    # otherwise raise.
    lr1 = LogisticRegression(penalty='l2', solver='liblinear', dual=False, C=C, random_state=1)
    lr1.fit(X_train, y_train)

    lr2 = LogisticRegression(penalty='l1', solver='liblinear', dual=False, C=C, random_state=1)
    lr2.fit(X_train, y_train)

    # Blend by adding the two models' raw decision scores.
    y_pred = lr1.decision_function(X_val) + lr2.decision_function(X_val)
    auc = roc_auc_score(y_val, y_pred)

    print('C=%s, took %.3fs, auc=%.3f' % (C, time() - t0, auc))

C=0.01, took 4.300s, auc=0.962
C=0.1, took 15.443s, auc=0.971


KeyboardInterrupt: 

In [69]:
# Refit on the full training matrix (train + validation rows) with C = 1.0.
C = 1.0
# NOTE: the variable is called `svm` but holds a logistic regression; the name is
# kept because the prediction cell further down reads it.
# The solver is pinned to liblinear so the fit does not depend on the library's
# default solver, which changed in later scikit-learn releases.
svm = LogisticRegression(penalty='l2', solver='liblinear', dual=False, C=C, random_state=1)
svm.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [76]:
# Read the test sessions with the site columns parsed as strings.
df_test = pd.read_csv('test_sessions.csv', dtype=dict.fromkeys(sites, 'str'))

# Apply the same per-column cleaning that was applied to the training frame.
for idx in range(1, 11):
    site_col, time_col = 'site%d' % idx, 'time%d' % idx

    # Missing site values are replaced by the placeholder token 'na'.
    df_test[site_col] = df_test[site_col].fillna('na')
    # Timestamp columns are converted to pandas datetimes.
    df_test[time_col] = pd.to_datetime(df_test[time_col])

In [77]:
# Derive the same two calendar features for the test sessions.
test_start = df_test['time1'].dt
df_test['hour_start'] = test_start.hour
df_test['weekday'] = test_start.weekday

In [78]:
# Build one space-separated string of the ten site tokens per test row.
df = df_test  # alias: both names refer to the same frame, so df_test gains the column
session_text = df['site1']
for k in range(2, 11):
    # Append the next site column, separated by a single space.
    session_text = session_text + ' ' + df['site%d' % k]
df['sites'] = session_text

In [79]:
# Reuse the vectorizer and encoder fitted on the training data; transform only.
test_site_text = df_test['sites']
test_time_cols = df_test[['hour_start', 'weekday']]

X_test_ohe = cv.transform(test_site_text)
X_test_time = time_ohe.transform(test_time_cols)

In [80]:
# Assemble the test matrix with the same column layout as the training matrix:
# site-text features, one-hot time features, raw start hour.
test_blocks = [X_test_ohe, X_test_time, df_test[['hour_start']]]
X_test = sp.hstack(test_blocks, format='csr')

In [81]:
# Score the test sessions with the fitted model (raw decision values).
pred = svm.decision_function(X_test)

# Two-column result table: session id and its score.
df_res = pd.DataFrame({'session_id': df_test.session_id, 'target': pred})

In [83]:
# Write the result table straight to a gzip-compressed CSV.
# This replaces the `!gzip` shell magic, which needs a gzip binary on the host and
# fails on re-run when benchmark11.csv.gz already exists. The resulting file on
# disk is still benchmark11.csv.gz.
df_res.to_csv('benchmark11.csv.gz', index=False, compression='gzip')

- CV: 0.928, LB: 0.92081
- CV: 0.981, LB: 0.94803