In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

from time import time

In [2]:
from tqdm import tqdm_notebook as tqdm

In [3]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
import pickle 

with open('./site_dic.pkl', 'rb') as f:
    site_dict = pickle.load(f)

In [5]:
sites = ['site%s' % i for i in range(1, 11)]
times = ['time%d' % i for i in range(1, 11)]

In [6]:
def roll(df, window_size, step=1):
    n = len(df)
    for i in range(0, n, step):
        yield df.iloc[i:i+window_size]

In [7]:
colnames = []
for i in range(1, 11):
    colnames.append('time%d' % i)
    colnames.append('site%d' % i)

In [8]:
def batch_to_line(batch):
    line = []

    for t, s in zip(batch.timestamp, batch.site):
        line.append(t)
        line.append(str(s))

    l = len(line) // 2
    if l < 10:
        need = 10 - l
        line.extend([None, 'na'] * need)

    return line

In [9]:
def read_file(file_name):
    df = pd.read_csv(file_name)
    df.site = df.site.apply(site_dict.get)
    df.timestamp = pd.to_datetime(df.timestamp)

    df['delta'] = df.timestamp - df.timestamp.shift()

    half_hour = 30 * 60
    session_change = df.delta.dt.seconds >= half_hour

    df['session_id'] = session_change.cumsum().astype(int)

    groups = df.groupby('session_id')

    lines = []
    for sid, group in groups:

        windows = roll(group, 10)
        for win in windows:
            line = batch_to_line(win)
            lines.append(line)

    df_res = pd.DataFrame(lines, columns=colnames)

    user = file_name.split('/')[-1][:-4].lower()
    df_res['user'] = user

    return df_res

In [10]:
from glob import glob

In [11]:
all_files = sorted(glob('train/*.csv') + glob('train/other_user_logs/*.csv'))


In [12]:
from concurrent.futures import ProcessPoolExecutor

In [13]:
with ProcessPoolExecutor() as pool:
    progress = tqdm(total=len(all_files))
   
    futures = []
 
    for file in all_files:
        future = pool.submit(read_file, file)
        future.add_done_callback(lambda x: progress.update())
        futures.append(future)
 
    results = []
    for f in futures:
        results.append(f.result())
 
    progress.close()




In [14]:
df_train = pd.concat(results)
df_train = df_train.reset_index(drop=True)

In [15]:
df = df_train
df['sites'] = df.site1 + ' ' + df.site2 + ' ' + df.site3 + ' ' + df.site4 + ' ' + df.site5 + ' ' + \
              df.site6 + ' ' + df.site7 + ' ' + df.site8 + ' ' + df.site9 + ' ' + df.site10  

In [17]:
df_train['target'] = (df_train.user == 'alice_log').astype('uint8')

In [46]:
with open('recreated.bin', 'wb') as f:
    pickle.dump(df_train, f)

In [19]:
df_train['hour_start'] = df_train.time1.dt.hour
df_train['weekday'] = df_train.time1.dt.weekday

In [48]:
sites = ['site%s' % i for i in range(1, 11)]
times = ['time%d' % i for i in range(1, 11)]

df_train_prep = pd.read_csv('train_sessions.csv', dtype={s: 'str' for s in sites})

for i in range(1, 11):
    s = 'site%d' % i
    df_train_prep[s] = df_train_prep[s].fillna('na')
    
    t = 'time%d' % i
    df_train_prep[t] = pd.to_datetime(df_train_prep[t])

df_train_prep = df_train_prep.sort_values(by='time1').reset_index(drop=True)

In [49]:
df = df_train_prep
df['sites'] = df.site1 + ' ' + df.site2 + ' ' + df.site3 + ' ' + df.site4 + ' ' + df.site5 + ' ' + \
              df.site6 + ' ' + df.site7 + ' ' + df.site8 + ' ' + df.site9 + ' ' + df.site10  

In [66]:
df_train_prep['hour_start'] = df_train_prep.time1.dt.hour
df_train_prep['weekday'] = df_train_prep.time1.dt.weekday

In [50]:
n = len(df_train_prep) // 10
val_ts = df_train_prep.iloc[-n].time1

In [73]:
df_val = df_train_prep[df_train_prep.time1 >= val_ts].reset_index(drop=True)

In [18]:
cv = TfidfVectorizer(token_pattern='\S+', min_df=10, max_df=0.5, stop_words={'na'}, ngram_range=(1, 3))
X_ohe = cv.fit_transform(df_train.sites)

In [75]:
X_ohe_val = cv.transform(df_val.sites)

In [20]:
from sklearn.preprocessing import OneHotEncoder

In [21]:
time_ohe = OneHotEncoder(dtype=np.uint8)
X_time = time_ohe.fit_transform(df_train[['hour_start', 'weekday']])

In [76]:
X_time_val = time_ohe.transform(df_val[['hour_start', 'weekday']])

In [60]:
train_mask = (df_train.time1 < val_ts).values

In [63]:
y = df_train.target.values
X_sparse = sp.hstack([X_ohe, X_time], format='csr')

In [64]:
y_train = y[train_mask]
X_sparse_train = X_sparse[train_mask]

In [81]:
y_val = df_val.target.values
X_sparse_val = sp.hstack([X_ohe_val, X_time_val], format='csr')

In [69]:
%%time
C = 0.5
svm = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1)
svm.fit(X_sparse_train, y_train)

CPU times: user 3min 31s, sys: 5.17 s, total: 3min 37s
Wall time: 59.3 s


In [82]:
y_pred = svm.decision_function(X_sparse_val)

In [83]:
roc_auc_score(y_val, y_pred)

0.98129684845103482

Test

In [27]:
df_test = pd.read_csv('test_sessions.csv', dtype={s: 'str' for s in sites})

for i in range(1, 11):
    s = 'site%d' % i
    df_test[s] = df_test[s].fillna('na')

    t = 'time%d' % i
    df_test[t] = pd.to_datetime(df_test[t])

In [28]:
df_test['hour_start'] = df_test.time1.dt.hour
df_test['weekday'] = df_test.time1.dt.weekday

In [29]:
df = df_test
df['sites'] = df.site1 + ' ' + df.site2 + ' ' + df.site3 + ' ' + df.site4 + ' ' + df.site5 + ' ' + \
              df.site6 + ' ' + df.site7 + ' ' + df.site8 + ' ' + df.site9 + ' ' + df.site10  

In [30]:
X_test_ohe = cv.transform(df_test.sites)
X_test_time = time_ohe.transform(df_test[['hour_start', 'weekday']])

In [31]:
X_test = sp.hstack([X_test_ohe, X_test_time], format='csr')

In [40]:
df_res = pd.DataFrame()
df_res['session_id'] = df_test.session_id
df_res['target'] = svm.decision_function(X_test)

In [41]:
df_res.to_csv('lr-full-03.csv', index=False)
!gzip lr-full-03.csv