In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

from time import time

In [2]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [5]:
sites = ['site%s' % i for i in range(1, 11)]
times = ['time%d' % i for i in range(1, 11)]

df_train = pd.read_csv('train_sessions.csv', dtype={s: 'str' for s in sites})
df_test = pd.read_csv('test_sessions.csv', dtype={s: 'str' for s in sites})

In [11]:
df_train['src'] = 0
df_test['src'] = 1
df_all = pd.concat([df_train, df_test]).reset_index(drop=True)

In [12]:
for i in range(1, 11):
    s = 'site%d' % i
    df_all[s] = df_all[s].fillna('na')

    t = 'time%d' % i
    df_all[t] = pd.to_datetime(df_all[t])

In [13]:
df_all = df_all.sort_values(by='time1').reset_index(drop=True)

In [14]:
df = df_all
df['sites'] = df.site1 + ' ' + df.site2 + ' ' + df.site3 + ' ' + df.site4 + ' ' + df.site5 + ' ' + \
              df.site6 + ' ' + df.site7 + ' ' + df.site8 + ' ' + df.site9 + ' ' + df.site10  

In [15]:
cv = TfidfVectorizer(token_pattern='\S+', min_df=10, max_df=0.5, stop_words={'na'}, ngram_range=(1, 1))
cv.fit(df_all.sites)

In [16]:
df_all['month'] = 12 * (df_all.time1.dt.year - 2013) + df_all.time1.dt.month
df_all['hour_start'] = df_all.time1.dt.hour
df_all['weekday'] = df_all.time1.dt.weekday
df_all['weekend'] = df_all.time1.dt.weekday > 5
df_all['before_noon'] = df_all.hour_start < 12

In [17]:
def calc_hour_end(row):
    last_t = None

    for s, t in zip(sites, times):
        if row[s] != 'na':
            last_t = row[t]
        else:
            break
    
    return last_t.hour

In [22]:
df_all['hour_end'] = df_all.apply(calc_hour_end, axis=1)

In [18]:
df_all_T = df_all[times].T
df_all_T_diff = df_all_T.diff()
df_all_T_diff[df_all_T.isnull()] = np.nan

df_all['diff_min'] = df_all_T_diff.min(axis=0).dt.seconds.fillna(-1)
df_all['diff_std'] = df_all_T_diff.std(axis=0).dt.seconds.fillna(-1)
#df_train['diff_mean'] = df_train_T_diff.mean(axis=0).dt.seconds.fillna(-1)
#df_train['diff_max'] = df_train_T_diff.max(axis=0).dt.seconds.fillna(-1)

In [19]:
has_na = (df_all[sites] == 'na').any(axis=1)
df_all['uniq_sites'] = df_all[sites].nunique(axis=1) - has_na

In [47]:
df_train = df_all[df_all.src == 0].reset_index(drop=True)
df_test = df_all[df_all.src == 1].reset_index(drop=True)

In [30]:
X_ohe = cv.transform(df_train.sites)

In [65]:
time_ohe = OneHotEncoder(dtype=np.uint8)
X_time = time_ohe.fit_transform(df_train[['hour_start', 'hour_end', 'weekday', 'weekend',
                                          'before_noon', 'uniq_sites']])

In [66]:
num_scale = ['hour_start']
X_num = df_train[num_scale].values.astype('float')

scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)

num_nonscale = ['diff_std', 'diff_min']
X_num = np.hstack([X_num, df_train[num_nonscale].values.astype('float')])

In [67]:
y = df_train.target.values
X = sp.hstack([X_ohe, X_time, X_num], format='csr')

In [68]:
n = len(df_train) // 10

Xn = X[int(3*n):]
yn = y[int(3*n):]

i = 3
X_train = Xn[:-i*n]
y_train = yn[:-i*n]

X_test = X[-i*n:]
y_test = y[-i*n:]

In [69]:
C = 1.0

In [70]:
svm = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1, class_weight={0: 1, 1: 2})
svm.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight={0: 1, 1: 2}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=1,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [71]:
y_pred = svm.decision_function(X_test)
roc_auc_score(y_test, y_pred)

0.9725782351910055

In [35]:
y_pred = svm.decision_function(X_test)
roc_auc_score(y_test, y_pred)

0.9684707074939638

In [35]:
y_pred = svm.decision_function(X_test)
roc_auc_score(y_test, y_pred)

0.9684707074939638

In [72]:
svm = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1, class_weight={0: 1, 1: 3})
svm.fit(Xn, yn)

LogisticRegression(C=1.0, class_weight={0: 1, 1: 3}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=1,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

Test

In [73]:
X_test_ohe = cv.transform(df_test.sites)

In [74]:
X_test_time = time_ohe.transform(df_test[['hour_start', 'hour_end', 'weekday', 'weekend',
                                          'before_noon', 'uniq_sites']])

In [75]:
X_test_num = df_test[num_scale].values.astype('float')
X_test_num = scaler.transform(X_test_num)
X_test_num = np.hstack([X_test_num, df_test[num_nonscale].values.astype('float')])

In [76]:
X_test = sp.hstack([X_test_ohe, X_test_time, X_test_num], format='csr')

In [77]:
preds = svm.decision_function(X_test)

In [78]:
df_res = pd.DataFrame()
df_res['session_id'] = df_test.session_id
df_res['target'] = preds

In [79]:
df_res.to_csv('comb-3n-nom.csv', index=False)
!gzip comb-3n-nom.csv