In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

from time import time

In [2]:
from tqdm import tqdm_notebook as tqdm

In [3]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
import pickle 

# Load the pickled site lookup table used by get_visits() to translate the
# raw `site` column.
# NOTE(review): unpickling can execute arbitrary code — only open trusted files.
with open('./site_dic.pkl', mode='rb') as f:
    site_dict = pickle.loads(f.read())

In [28]:
def get_visits(file, min_count=5):
    """Compute the share of visits per site from a visit-log CSV.

    Parameters
    ----------
    file : str or file-like
        Anything ``pd.read_csv`` accepts; the CSV must contain a ``site`` column.
    min_count : int, default 5
        Only sites visited strictly more than ``min_count`` times are kept.

    Returns
    -------
    dict
        Maps ``str(site_dict[site])`` to that site's visit count divided by
        the total number of visits to sites present in ``site_dict``.
    """
    df_alice = pd.read_csv(file)

    # Translate raw site values through the global ``site_dict``. Unknown
    # sites are skipped here rather than mapped to None: a None in the column
    # upcasts integer ids to float and turns the key '123' into '123.0',
    # which then never matches a string site id.
    ids = pd.Series([site_dict[s] for s in df_alice.site if s in site_dict])

    cnt = ids.value_counts()
    total = cnt.sum()
    cnt = cnt[cnt > min_count] / total

    return {str(k): v for k, v in cnt.to_dict().items()}

# Per-site visit shares computed from Alice's own log, keyed by site id string.
alice_weight = get_visits(file='train/Alice_log.csv')

In [5]:
# Names of the site1..site10 and time1..time10 columns.
sites = ['site{}'.format(n) for n in range(1, 11)]
times = ['time{}'.format(n) for n in range(1, 11)]

In [13]:
import pickle

In [14]:
# Restore the training frame from its pickle.
# NOTE(review): unpickling can execute arbitrary code — only open trusted files.
with open('recreated.bin', mode='rb') as f:
    df_train = pickle.load(f)

# Join site1..site10 into one space-separated string per row; this is the
# text the vectorizer consumes.
df = df_train
joined = df['site1']
for col in sites[1:]:
    joined = joined + ' ' + df[col]
df['sites'] = joined

In [None]:
# TF-IDF over the space-separated `sites` strings, using 1- to 3-grams of
# whitespace-delimited tokens; the 'na' placeholder token is dropped.
# - The pattern is a raw string: '\S' in a plain literal is an invalid
#   escape sequence.
# - stop_words is a list: scikit-learn's parameter validation accepts only
#   'english', a list or None, so a set is rejected by recent versions.
cv = TfidfVectorizer(token_pattern=r'\S+', min_df=10, max_df=0.5, stop_words=['na'], ngram_range=(1, 3))
X_ohe = cv.fit_transform(df_train.sites)

In [19]:
# Hour of day and day of week taken from each row's `time1` timestamp.
first_ts = df_train['time1']
df_train['hour_start'] = first_ts.dt.hour
df_train['weekday'] = first_ts.dt.weekday

In [30]:
# Mean of the `alice_weight` values of the row's site columns. Sites missing
# from `alice_weight` map to NaN and are skipped by mean(); rows with no
# known site at all get 0.
# Series.map is applied per column because DataFrame.applymap is deprecated
# since pandas 2.1; mapping through the dict also always yields float columns.
df_train['alice_weight'] = (
    df_train[sites]
    .apply(lambda col: col.map(alice_weight))
    .mean(axis=1)
    .fillna(0)
)

In [20]:
from sklearn.preprocessing import OneHotEncoder

In [21]:
# One-hot encode start hour and weekday.
# handle_unknown='ignore': with the default ('error'), the later
# time_ohe.transform() on the test frame raises as soon as it meets an hour
# or weekday that never occurred in training; 'ignore' encodes such a value
# as an all-zero group instead.
time_ohe = OneHotEncoder(dtype=np.uint8, handle_unknown='ignore')
X_time = time_ohe.fit_transform(df_train[['hour_start', 'weekday']])

In [31]:
# Columns passed to the model as plain numeric features (scaled in a later cell).
fnum = ['hour_start', 'alice_weight']
X_num = df_train.loc[:, fnum].values

In [32]:
from sklearn.preprocessing import StandardScaler

In [33]:
# Fit the scaler on the raw numeric columns, not on `X_num` itself.
# `X_num = scaler.fit_transform(X_num)` was self-referential: re-running the
# cell refitted the scaler on already scaled data, so the later
# scaler.transform() of the test features no longer matched the training
# scaling. Reading from df_train makes the cell idempotent.
scaler = StandardScaler()
X_num = scaler.fit_transform(df_train[fnum].values)

In [50]:
# Target vector and the full design matrix: TF-IDF site n-grams, one-hot
# start hour / weekday, and the scaled numeric features, stacked column-wise.
y = df_train['target'].values
X_sparse = sp.hstack(
    [X_ohe, X_time, X_num],
    format='csr',
)
# Variant without the numeric block: sp.hstack([X_ohe, X_time], format='csr')

In [51]:
%%time
C = 0.7
svm = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1, class_weight={0: 1, 1: 2})
svm.fit(X_sparse, y)

CPU times: user 2min 22s, sys: 3.33 s, total: 2min 26s
Wall time: 37.1 s


Test set: build the same features and write the predictions

In [38]:
# Read the test sessions, forcing the site columns to be parsed as strings.
df_test = pd.read_csv('test_sessions.csv', dtype=dict.fromkeys(sites, 'str'))

# Per slot: missing sites become the 'na' token, timestamps become datetimes.
for site_col, time_col in zip(sites, times):
    df_test[site_col] = df_test[site_col].fillna('na')
    df_test[time_col] = pd.to_datetime(df_test[time_col])

In [39]:
# Same start-time features as for the training frame, from `time1`.
first_ts_test = df_test['time1']
df_test['hour_start'] = first_ts_test.dt.hour
df_test['weekday'] = first_ts_test.dt.weekday

In [40]:
# Mean of the `alice_weight` values of the row's site columns. Sites missing
# from `alice_weight` map to NaN and are skipped by mean(); rows with no
# known site at all get 0.
# Series.map is applied per column because DataFrame.applymap is deprecated
# since pandas 2.1; mapping through the dict also always yields float columns.
df_test['alice_weight'] = (
    df_test[sites]
    .apply(lambda col: col.map(alice_weight))
    .mean(axis=1)
    .fillna(0)
)

In [41]:
# Join site1..site10 into one space-separated string per test row.
df = df_test
joined = df['site1']
for col in sites[1:]:
    joined = joined + ' ' + df[col]
df['sites'] = joined

In [52]:
# Apply the vectorizer, encoder and scaler to the test frame
# (transform only, no refitting).
X_test_ohe = cv.transform(df_test['sites'])
X_test_time = time_ohe.transform(df_test.loc[:, ['hour_start', 'weekday']])
X_test_num = scaler.transform(df_test.loc[:, fnum].values)

In [54]:
# Stack the test feature blocks in the same column order as X_sparse.
X_test = sp.hstack(
    [X_test_ohe, X_test_time, X_test_num],
    format='csr',
)
# Variant without the numeric block: sp.hstack([X_test_ohe, X_test_time], format='csr')

In [55]:
# Result frame: session ids paired with the model's raw decision scores.
df_res = pd.DataFrame({
    'session_id': df_test['session_id'],
    'target': svm.decision_function(X_test),
})

In [56]:
# Write the result gzip-compressed in a single step. This produces the same
# lr-full-11.csv.gz as writing the CSV and shelling out to gzip, but needs no
# external gzip binary and simply overwrites the archive when the cell is
# re-run.
df_res.to_csv('lr-full-11.csv.gz', index=False, compression='gzip')