In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

from time import time

In [2]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [4]:
import pickle 

# Load the pickled `site_dict` lookup; get_visits() uses `site_dict.get` to
# translate the raw `site` column of a visit log.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load a site_dic.pkl that comes from a trusted source.
with open('./site_dic.pkl', 'rb') as f:
    site_dict = pickle.load(f)

In [5]:
def get_visits(file, site_ids=None, min_count=5):
    """Return each site's share of the visits recorded in one visit log.

    Parameters
    ----------
    file : str or file-like
        CSV readable by ``pd.read_csv``; it must contain a ``site`` column.
        Other columns (e.g. a timestamp) are ignored.
    site_ids : mapping, optional
        Lookup (anything exposing ``.get``) used to translate raw ``site``
        values. Defaults to the notebook-level ``site_dict``.
    min_count : int, optional
        Only sites visited strictly more than ``min_count`` times are kept.
        Defaults to 5, the threshold that was previously hard-coded.

    Returns
    -------
    dict
        ``{str(translated_site): visits / total}`` where ``total`` is the
        number of visits whose site could be translated. Sites at or below
        the threshold still count towards ``total``, so the returned shares
        need not sum to 1.
    """
    if site_ids is None:
        site_ids = site_dict

    visits = pd.read_csv(file)
    # Sites missing from the lookup translate to None/NaN, which
    # value_counts() skips.
    counts = visits['site'].map(site_ids.get).value_counts()
    total = counts.sum()
    shares = counts[counts > min_count] / total

    # Keys are stringified so they can be looked up with the string-typed
    # site columns of the session tables.
    return {str(site): share for site, share in shares.to_dict().items()}

# Per-site visit shares computed from the log at train/Alice_log.csv,
# keyed by the translated site value as a string.
# NOTE(review): the `alice_weight` columns derived from this dict are not part
# of any feature matrix assembled in the cells visible here — confirm whether
# the feature is still wanted.
alice_weight = get_visits('train/Alice_log.csv')

In [6]:
# Column names of the ten site / timestamp slots that make up a session row.
sites = ['site%s' % slot for slot in range(1, 11)]
times = ['time%d' % slot for slot in range(1, 11)]

# Read every site column as text so the values can later be joined into one
# whitespace-separated string per session.
site_dtypes = dict.fromkeys(sites, 'str')
df_train = pd.read_csv('train_sessions.csv', dtype=site_dtypes)

In [7]:
for i in range(1, 11):
    s, t = 'site%d' % i, 'time%d' % i
    # Empty site slots get the placeholder token 'na'.
    df_train[s] = df_train[s].fillna('na')
    # Timestamps are read as text; convert them to datetimes.
    df_train[t] = pd.to_datetime(df_train[t])

In [8]:
# Order sessions by their first timestamp and renumber the rows; the
# train/holdout split further down slices by position, so it relies on this
# chronological order.
df_train = df_train.sort_values(by='time1').reset_index(drop=True)

In [9]:
# `df` is only an alias: the new column is written into df_train itself.
df = df_train

# Build one space-separated string of the ten site slots per session
# (site1 ... site10, in that order) for the text vectorizer.
session_text = df[sites[0]]
for col in sites[1:]:
    session_text = session_text + ' ' + df[col]
df['sites'] = session_text

In [48]:
# Look up every site slot in the alice_weight dict (unknown sites -> missing),
# then average the found weights per session; sessions with none get 0.
train_site_weights = df_train[sites].applymap(alice_weight.get)
df_train['alice_weight'] = train_site_weights.mean(axis=1).fillna(0)

In [41]:
# TF-IDF over the per-session site strings: tokens are whitespace-separated
# site values, the 'na' placeholder is dropped, and 1- to 3-grams are kept if
# they occur in at least 10 sessions and at most half of them.
# The pattern is a raw string ('\S' is an invalid escape in a normal literal)
# and stop_words is a list, which recent scikit-learn releases require.
# NOTE(review): the vectorizer is fitted on all of df_train, including the
# rows later sliced off as the holdout set; `X_ohe` holds TF-IDF weights, not
# a one-hot encoding.
cv = TfidfVectorizer(token_pattern=r'\S+', min_df=10, max_df=0.5, stop_words=['na'], ngram_range=(1, 3))
X_ohe = cv.fit_transform(df_train.sites)

In [11]:
# Calendar features derived from the session start (`time1`).
start = df_train['time1'].dt

# Running month number counted from 2013 (January 2013 -> 1).
df_train['month'] = 12 * (start.year - 2013) + start.month
df_train['hour_start'] = start.hour
df_train['weekday'] = start.weekday
# NOTE(review): pandas weekdays run Monday=0 .. Sunday=6, so `> 5` is True on
# Sundays only; a Saturday+Sunday flag would need `>= 5`. The test-set cell
# uses the same expression, so change both together to keep the train and
# test features in sync.
df_train['weekend'] = start.weekday > 5
df_train['before_noon'] = df_train['hour_start'] < 12

In [12]:
def calc_hour_end(row, site_cols=None, time_cols=None):
    """Return the hour of the last visit recorded in a session row.

    Slots are scanned in order and the scan stops at the first slot whose
    site is the placeholder ``'na'``; the timestamp of the last slot seen
    before that point is used.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name, e.g. a row passed by
        ``DataFrame.apply(..., axis=1)``.
    site_cols, time_cols : sequence of str, optional
        Names of the site / timestamp slots, in visit order. Default to the
        notebook-level ``sites`` and ``times`` lists.

    Returns
    -------
    The ``.hour`` attribute of the last visit's timestamp.

    Raises
    ------
    ValueError
        If the very first slot is already ``'na'``, i.e. the session has no
        visit at all (previously this surfaced as an AttributeError on None).
    """
    site_cols = sites if site_cols is None else site_cols
    time_cols = times if time_cols is None else time_cols

    last_t = None
    for s, t in zip(site_cols, time_cols):
        if row[s] == 'na':
            # First empty slot: everything after it is empty as well.
            break
        last_t = row[t]

    if last_t is None:
        raise ValueError('session has no visited site; cannot compute hour_end')

    return last_t.hour

In [13]:
# Hour of the last visited slot of every training session; calc_hour_end is
# called once per row (axis=1).
df_train['hour_end'] = df_train.apply(calc_hour_end, axis=1)

In [14]:
# Transpose the timestamp columns so that rows are time1..time10 and each
# column is one session; diff() then yields the gap between consecutive slots.
df_train_T = df_train[times].T
df_train_T_diff = df_train_T.diff()
# Blank out the gaps of slots whose own timestamp is missing.
df_train_T_diff[df_train_T.isnull()] = np.nan

# Smallest gap and spread of the gaps per session; -1 marks sessions for which
# no value could be computed.
# NOTE(review): `.dt.seconds` is the seconds component of the timedelta, not
# `.dt.total_seconds()` — the two differ for gaps of a day or more and for
# negative gaps. The test-set cell uses the same accessor; change both
# together if this is revisited.
df_train['diff_min'] = df_train_T_diff.min(axis=0).dt.seconds.fillna(-1)
df_train['diff_std'] = df_train_T_diff.std(axis=0).dt.seconds.fillna(-1)
# Disabled features (mean / max gap), kept for reference:
#df_train['diff_mean'] = df_train_T_diff.mean(axis=0).dt.seconds.fillna(-1)
#df_train['diff_max'] = df_train_T_diff.max(axis=0).dt.seconds.fillna(-1)

In [15]:
train_sites = df_train[sites]

# True for sessions that contain at least one empty ('na') slot.
has_na = train_sites.eq('na').any(axis=1)

# nunique() counts the 'na' placeholder as a value of its own, so take it
# back out (the boolean acts as 0/1) when it is present.
df_train['uniq_sites'] = train_sites.nunique(axis=1) - has_na

In [85]:
# One-hot encode the small-cardinality session features; the fitted encoder is
# reused for the test sessions later on.
train_time_frame = df_train[['hour_start', 'hour_end', 'weekday', 'weekend',
                             'before_noon', 'uniq_sites']]
time_ohe = OneHotEncoder(dtype=np.uint8)
X_time = time_ohe.fit_transform(train_time_frame)

In [86]:
# Numeric features come in two groups: standardised ones and ones used as-is.
num_scale = ['hour_start']
num_nonscale = ['month', 'diff_std', 'diff_min']

# Fit the scaler on the training sessions; it is reused for the test sessions.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(df_train[num_scale].values.astype('float'))

# The remaining columns are appended unscaled, to the right of the scaled ones.
train_unscaled = df_train[num_nonscale].values.astype('float')
X_num = np.hstack([train_scaled, train_unscaled])

In [87]:
y = df_train.target.values
X = sp.hstack([X_ohe, X_time, X_num], format='csr')

In [88]:
# Positional split in tenths of the (time-sorted) training table.
n = len(df_train) // 10

# Drop the first two tenths; the rest is what the final model is fitted on.
Xn = X[2 * n:]
yn = y[2 * n:]

# Hold out the last `i` tenths for validation and train on what lies between.
i = 3
holdout_rows = i * n

X_train = Xn[:-holdout_rows]
y_train = yn[:-holdout_rows]

# NOTE(review): `X_test` is the validation slice here; a later cell rebinds
# the same name to the submission feature matrix.
X_test = X[-holdout_rows:]
y_test = y[-holdout_rows:]

In [89]:
# Value passed as `C` to both LogisticRegression fits below.
C = 1.0

In [90]:
# Despite its name, `svm` holds an L2-penalised logistic regression; the
# positive class is weighted 3x.
svm = LogisticRegression(
    C=C,
    penalty='l2',
    dual=False,
    class_weight={0: 1, 1: 3},
    random_state=1,
)
# Fit on the training slice only, so the holdout slice can be scored next.
svm.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight={0: 1, 1: 3}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=1,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [91]:
# Score the holdout slice with the raw decision values and show its ROC AUC.
# NOTE(review): this is only meaningful while `X_test` is the holdout slice —
# a later cell rebinds `X_test` to the submission matrix, so re-running this
# cell afterwards would pair it with the holdout `y_test`.
y_pred = svm.decision_function(X_test)
roc_auc_score(y_test, y_pred)

0.96908217093921034

In [46]:
# NOTE(review): verbatim repeat of the previous cell with an out-of-sequence
# execution count; its saved output presumably comes from an earlier run —
# candidate for removal.
y_pred = svm.decision_function(X_test)
roc_auc_score(y_test, y_pred)

0.96893067261338606

In [92]:
# Final model: same settings as the validation model, refitted on Xn / yn
# (training slice plus holdout slice). `svm` is a logistic regression.
svm = LogisticRegression(
    C=C,
    penalty='l2',
    dual=False,
    class_weight={0: 1, 1: 3},
    random_state=1,
)
svm.fit(Xn, yn)

LogisticRegression(C=1.0, class_weight={0: 1, 1: 3}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=1,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

Test: rebuild the same features for test_sessions.csv and write the submission

In [93]:
# Same loading and cleaning steps as for the training sessions.
test_site_dtypes = dict.fromkeys(sites, 'str')
df_test = pd.read_csv('test_sessions.csv', dtype=test_site_dtypes)

for i in range(1, 11):
    s, t = 'site%d' % i, 'time%d' % i
    # Empty site slots get the placeholder token 'na'.
    df_test[s] = df_test[s].fillna('na')
    # Timestamps are read as text; convert them to datetimes.
    df_test[t] = pd.to_datetime(df_test[t])

In [94]:
# Look up every site slot in the alice_weight dict (unknown sites -> missing),
# then average the found weights per session; sessions with none get 0.
test_site_weights = df_test[sites].applymap(alice_weight.get)
df_test['alice_weight'] = test_site_weights.mean(axis=1).fillna(0)

In [95]:
# Calendar features derived from the session start (`time1`), mirroring the
# training-set cell.
test_start = df_test['time1'].dt

# Running month number counted from 2013 (January 2013 -> 1).
df_test['month'] = 12 * (test_start.year - 2013) + test_start.month
df_test['hour_start'] = test_start.hour
df_test['weekday'] = test_start.weekday
# NOTE(review): pandas weekdays run Monday=0 .. Sunday=6, so `> 5` is True on
# Sundays only; a Saturday+Sunday flag would need `>= 5`. The training-set
# cell uses the same expression, so change both together to keep the train
# and test features in sync.
df_test['weekend'] = test_start.weekday > 5
df_test['before_noon'] = df_test['hour_start'] < 12

In [96]:
# Transpose the timestamp columns so that rows are time1..time10 and each
# column is one session; diff() then yields the gap between consecutive slots.
df_test_T = df_test[times].T
df_test_T_diff = df_test_T.diff()
# Blank out the gaps of slots whose own timestamp is missing.
df_test_T_diff[df_test_T.isnull()] = np.nan

# Smallest gap and spread of the gaps per session; -1 marks sessions for which
# no value could be computed.
# NOTE(review): `.dt.seconds` is the seconds component of the timedelta, not
# `.dt.total_seconds()`. The training-set cell uses the same accessor; change
# both together if this is revisited.
df_test['diff_min'] = df_test_T_diff.min(axis=0).dt.seconds.fillna(-1)
df_test['diff_std'] = df_test_T_diff.std(axis=0).dt.seconds.fillna(-1)
# Disabled features (mean / max gap), kept for reference:
#df_test['diff_mean'] = df_test_T_diff.mean(axis=0).dt.seconds.fillna(-1)
#df_test['diff_max'] = df_test_T_diff.max(axis=0).dt.seconds.fillna(-1)

In [97]:
# Hour of the last visited slot of every test session; calc_hour_end is
# called once per row (axis=1).
df_test['hour_end'] = df_test.apply(calc_hour_end, axis=1)

In [98]:
test_sites = df_test[sites]

# True for sessions that contain at least one empty ('na') slot.
has_na = test_sites.eq('na').any(axis=1)

# nunique() counts the 'na' placeholder as a value of its own, so take it
# back out (the boolean acts as 0/1) when it is present.
df_test['uniq_sites'] = test_sites.nunique(axis=1) - has_na

In [99]:
# `df` is only an alias: the new column is written into df_test itself.
df = df_test

# Build one space-separated string of the ten site slots per session
# (site1 ... site10, in that order) for the text vectorizer.
test_session_text = df[sites[0]]
for col in sites[1:]:
    test_session_text = test_session_text + ' ' + df[col]
df['sites'] = test_session_text

In [100]:
# Vectorise the test session strings with the vectorizer fitted on the
# training sessions (transform only, no refit).
X_test_ohe = cv.transform(df_test.sites)

In [101]:
# Encode the same columns, in the same order, with the encoder fitted on the
# training sessions.
test_time_frame = df_test[['hour_start', 'hour_end', 'weekday', 'weekend',
                           'before_noon', 'uniq_sites']]
X_test_time = time_ohe.transform(test_time_frame)

In [102]:
# Scaled group first (using the scaler fitted on the training sessions), then
# the unscaled group, matching the column layout of the training matrix.
test_scaled = scaler.transform(df_test[num_scale].values.astype('float'))
test_unscaled = df_test[num_nonscale].values.astype('float')
X_test_num = np.hstack([test_scaled, test_unscaled])

In [103]:
# Assemble the test feature matrix in the same block order as the training
# matrix (text, one-hot, numeric).
# NOTE(review): this rebinds `X_test`, which earlier held the validation
# slice; the holdout-scoring cells must not be re-run after this point.
X_test = sp.hstack([X_test_ohe, X_test_time, X_test_num], format='csr')

In [104]:
# Raw decision values (not probabilities) of the refitted model for every
# test session; these are written out as the submission target.
preds = svm.decision_function(X_test)

In [105]:
# Submission table: one row per test session with its decision value.
df_res = pd.DataFrame({
    'session_id': df_test.session_id,
    'target': preds,
})

In [106]:
# Write the submission gzip-compressed in one step. This produces the same
# wow-so-doge-2n-v5.csv.gz as writing the CSV and running `!gzip` on it, but
# it overwrites cleanly on re-run (gzip will not replace an existing .gz
# without -f) and does not need an external gzip binary.
df_res.to_csv('wow-so-doge-2n-v5.csv.gz', index=False, compression='gzip')