In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

from time import time

In [2]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
from glob import glob
from datetime import datetime

In [4]:
import pickle 

# Load the pickled site lookup table. file_to_df indexes it with the site
# string parsed from each log line.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('./site_dic.pkl', 'rb') as f:
    site_dict = pickle.load(f)

In [5]:
def add_session(session, data, username, len_session=10):
    """Pad a session to its fixed width, label it and store it in ``data``.

    ``session`` is a flat list that is extended in place with ``'na'``/NaN
    pairs until it holds at least ``2 * len_session`` items. ``username`` is
    then appended to it and the very same list object is appended to ``data``.
    Sessions that are already long enough are only labelled, never truncated.
    """
    filler = ['na', float("NaN")]
    shortfall = 2 * len_session - len(session)

    if shortfall > 0:
        # Filler is always added as whole pairs, so an odd gap is rounded up.
        session.extend(filler * ((shortfall + 1) // 2))

    session.append(username)
    data.append(session)

def file_to_df(file_path, len_session=10, first_line_file="timestamp,site", shift_window=False,
               session_timeout=1800):
    """Split one user's ``timestamp,site`` log file into fixed-width session rows.

    Each data line is parsed as ``"%Y-%m-%d %H:%M:%S,<site>"``; the site is
    replaced by ``str(site_dict[site])`` using the module-level ``site_dict``.
    Visits are grouped into sessions: a session is closed as soon as the next
    visit is more than ``session_timeout`` seconds after the session's first
    visit, or the session already holds ``len_session`` visits. Closed sessions
    are padded and labelled by ``add_session``.

    Parameters
    ----------
    file_path : str
        Path of the log file. The file name without its extension is used as
        the value of the ``user`` column.
    len_session : int, default 10
        Maximum number of visits per session (and number of site/time column
        pairs in the result).
    first_line_file : str, default "timestamp,site"
        Header line to skip wherever it occurs.
    shift_window : iterable of int or False, default False
        Zero-based physical line indices to skip (used to build shifted
        copies of the sessions). Any falsy value means "skip nothing".
    session_timeout : int or float, default 1800
        Maximum age of a session in seconds, measured from its first visit.

    Returns
    -------
    pandas.DataFrame
        Columns ``site1, time1, ..., site<len_session>, time<len_session>, user``,
        one row per session.

    Raises
    ------
    ValueError
        If ``len_session`` is smaller than 1, or a data line does not split
        into exactly two comma-separated fields.
    KeyError
        If a site is missing from ``site_dict``.
    """
    import os  # local import so the function does not depend on another cell

    if len_session < 1:
        raise ValueError("len_session must be at least 1")

    # Separator-agnostic replacement for the former "/"-only, [:-4] slicing.
    username = os.path.splitext(os.path.basename(file_path))[0]

    # Context manager: the handle used to be left open.
    with open(file_path) as log_file:
        lines = log_file.readlines()

    skipped_rows = set(shift_window) if shift_window else set()

    data = []
    session = []
    start_session = None

    for i, raw_line in enumerate(lines):
        ln = raw_line.strip()
        # Blank lines used to crash the split below; the header is now matched
        # against the parameter instead of a hard-coded literal.
        if not ln or ln == first_line_file or i in skipped_rows:
            continue

        stamp, site = ln.split(",")
        site = str(site_dict[site])
        stamp = datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")

        if start_session is None:
            start_session = stamp

        timed_out = (stamp - start_session).total_seconds() > session_timeout
        # `session` holds two entries per visit, hence the factor 2. This used
        # to compare against a literal 10 regardless of len_session.
        if timed_out or len(session) >= 2 * len_session:
            add_session(session, data, username, len_session=len_session)
            start_session = stamp
            session = []

        session += [site, stamp]

    # Flush whatever is left. Doing it after the loop (instead of on the last
    # line index) keeps the final session even when the last line is skipped.
    if session:
        add_session(session, data, username, len_session=len_session)

    columns = []
    for slot in range(1, len_session + 1):
        columns += ["site" + str(slot), "time" + str(slot)]
    columns.append("user")

    return pd.DataFrame.from_records(data, columns=columns)

def get_full_df(shift_count=0):
    """Parse every training log into sessions and concatenate the results.

    Logs are collected from ``train/*.csv`` and ``train/other_user_logs/*.csv``
    and processed in sorted path order. For each ``offset`` in
    ``0..shift_count`` every file is parsed once with its first ``offset``
    lines (indices ``0..offset-1``) skipped. The frames are concatenated
    as-is, so the per-file indices are kept.
    """
    log_paths = glob('train/*.csv') + glob('train/other_user_logs/*.csv')
    log_paths.sort()

    per_file_frames = [
        file_to_df(log_path, shift_window=list(range(offset)))
        for offset in range(shift_count + 1)
        for log_path in log_paths
    ]

    return pd.concat(per_file_frames)

In [6]:
# Parse every log matched by get_full_df's glob patterns into one DataFrame.
# With the default shift_count=0 no lines are skipped, so there are no shifted copies.
df_full = get_full_df()

In [7]:
# Binary label stored as uint8: 1 where the `user` column equals 'Alice_log', 0 otherwise.
df_full['target'] = (df_full.user == 'Alice_log').astype('uint8')

In [8]:
# Order sessions by their first timestamp and rebuild the index, so that the
# positional tail slice taken further down holds the most recent sessions.
df_train = df_full.sort_values(by='time1').reset_index(drop=True)

In [9]:
# Names of the ten site / time column pairs that make up one session row.
sites = ['site{}'.format(slot) for slot in range(1, 11)]
times = ['time{}'.format(slot) for slot in range(1, 11)]

In [10]:
# Build one space-separated "document" per session from its ten site columns;
# this is the text the vectorizer is fitted on.
df = df_train
joined_sites = df['site1']
for slot in range(2, 11):
    joined_sites = joined_sites + ' ' + df['site%d' % slot]
df['sites'] = joined_sites

In [11]:
# Tf-idf over 1- to 3-grams of whitespace-separated site tokens. The 'na'
# padding token is removed as a stop word; terms must occur in at least 10
# sessions and in at most half of them.
# The token pattern is a raw string ('\S' is an invalid escape in a normal
# string literal) and stop_words is a list, the documented type for a custom
# stop-word collection.
cv = TfidfVectorizer(token_pattern=r'\S+', min_df=10, max_df=0.5, stop_words=['na'], ngram_range=(1, 3))
X_ohe = cv.fit_transform(df_train.sites)

In [12]:
# Session-level time features taken from the first visit's timestamp.
first_visit = df_train['time1'].dt
df_train['hour_start'] = first_visit.hour
df_train['weekday'] = first_visit.weekday

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
# One-hot encode the two time features as uint8 indicator columns.
# NOTE(review): the encoder is fitted on the training rows only; confirm that
# the rows it is later applied to contain no unseen hour/weekday values.
time_feature_cols = ['hour_start', 'weekday']
time_ohe = OneHotEncoder(dtype=np.uint8)
X_time = time_ohe.fit_transform(df_train[time_feature_cols])

In [15]:
# Binary target as a plain array, and the full feature matrix: text features
# and one-hot time features stacked column-wise into a single CSR matrix.
y = df_train.target.values
X_sparse = sp.hstack([X_ohe, X_time], format='csr') 

In [16]:
# Positional hold-out: the last tenth of the rows is kept for validation and
# everything before it is used for fitting.
n = len(df_train) // 10
train_rows, test_rows = slice(None, -n), slice(-n, None)

X_train, y_train = X_sparse[train_rows], y[train_rows]
X_test, y_test = X_sparse[test_rows], y[test_rows]

In [28]:
# Binary model on the `target` labels, scored by ROC AUC on the hold-out rows.
# The variable is called `svm`, but the estimator is a logistic regression.
C = 1
svm = LogisticRegression(C=C, penalty='l2', dual=False, random_state=1).fit(X_train, y_train)

y_pred_alice = svm.decision_function(X_test)
roc_auc_score(y_test, y_pred_alice)

0.98141339869281052

In [18]:
# User labels for every session, split at the same position as the features.
y_all = df_train['user'].values
y_all_train, y_all_test = y_all[:-n], y_all[-n:]

In [19]:
%%time

# Same estimator settings as the binary cell, but fitted on the user labels
# (y_all_train) instead of the binary target, with n_jobs=12 and verbose output.
# This rebinds `svm`, replacing whatever model the name held before.
# The recorded run of this cell took about 1h10 of wall time.
C = 1
svm = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1, n_jobs=12, verbose=True)
svm.fit(X_train, y_all_train)

# Disabled evaluation left over from the binary-target cell:
#y_pred = svm.decision_function(X_test)
#roc_auc_score(y_test, y_pred)

[LibLinear]CPU times: user 4h 34min 17s, sys: 6min 29s, total: 4h 40min 47s
Wall time: 1h 10min 14s


In [22]:
# Decision scores on the hold-out rows from whichever model `svm` is bound to.
# NOTE(review): the execution counters in this notebook are out of order;
# confirm that `svm` is the model fitted on the user labels at this point.
y_pred = svm.decision_function(X_test)

In [31]:
from sklearn.preprocessing import StandardScaler

In [32]:
# Standardise the decision scores column by column.
# presumably y_pred has one column per user class — TODO confirm
y_pred_s = StandardScaler().fit_transform(y_pred)

In [33]:
# StandardScaler requires a 2-D (n_samples, n_features) array, but the binary
# decision scores are 1-D. Scale them as a single column and flatten back so
# that y_pred_a stays a 1-D vector for the dot product with y_pred_s.
y_pred_a = StandardScaler().fit_transform(np.asarray(y_pred_alice).reshape(-1, 1)).ravel()



In [37]:
# Dot product of each standardised score column in y_pred_s with the
# standardised binary score vector y_pred_a (not divided by the row count).
# presumably this yields one similarity value per user class — TODO confirm shapes
sim = y_pred_s.T.dot(y_pred_a)

In [40]:
# Indices into `sim`, ordered from the smallest to the largest similarity value.
sim.argsort()

array([1138, 1196, 1442, ...,  323,  764,    0])

In [20]:
%%time

# Same estimator settings as the other user-label fit, but trained on all rows
# (X_sparse, y_all) rather than only the training part of the split.
# The recorded run of this cell took about 1h17 of wall time.
C = 1
svm_full = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1, n_jobs=12, verbose=True)
svm_full.fit(X_sparse, y_all)

# Disabled evaluation left over from the binary-target cell:
#y_pred = svm.decision_function(X_test)
#roc_auc_score(y_test, y_pred)

[LibLinear]CPU times: user 5h 3min 22s, sys: 7min 17s, total: 5h 10min 39s
Wall time: 1h 17min 43s


Full model

In [21]:
# Binary model refitted on all rows (X_sparse, y); rebinds `svm`, the name
# whose decision_function is called on the test features further down.
# NOTE(review): C is not assigned in this cell, so it is whatever an earlier
# cell left behind. The saved output of this cell reports C=1.5, while the
# visible cells assign C = 1 — confirm which value is intended.
svm = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1)
svm.fit(X_sparse, y)

LogisticRegression(C=1.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Test

In [22]:
# Read the test sessions with site ids kept as strings, then bring the columns
# into the same form as the training frame: 'na' for missing sites and
# parsed timestamps for the time columns.
df_test = pd.read_csv('test_sessions.csv', dtype=dict.fromkeys(sites, 'str'))

for slot in range(1, 11):
    site_col, time_col = 'site%d' % slot, 'time%d' % slot
    df_test[site_col] = df_test[site_col].fillna('na')
    df_test[time_col] = pd.to_datetime(df_test[time_col])

In [23]:
# Same time features as for the training frame, from the first visit's timestamp.
first_visit_test = df_test['time1'].dt
df_test['hour_start'] = first_visit_test.hour
df_test['weekday'] = first_visit_test.weekday

In [24]:
# Build the space-separated site "document" for every test session, in the
# same way as for the training frame. Note that `df` now aliases df_test.
df = df_test
joined_sites = df['site1']
for slot in range(2, 11):
    joined_sites = joined_sites + ' ' + df['site%d' % slot]
df['sites'] = joined_sites

In [25]:
# Apply the already-fitted vectorizer and time encoder to the test sessions
# (transform only, nothing is refitted here).
X_test_ohe = cv.transform(df_test.sites)

X_test_time = time_ohe.transform(df_test[['hour_start', 'weekday']])

In [26]:
# Stack the test features in the same column order as X_sparse.
# NOTE: this rebinds X_test, which earlier in the notebook held the hold-out
# slice of the training matrix.
X_test = sp.hstack([X_test_ohe, X_test_time], format='csr')

In [27]:
# Decision scores for the test sessions from whichever model `svm` is bound to.
# NOTE(review): `svm` is rebound several times in this notebook; confirm it is
# the binary model fitted on all rows when this cell runs.
preds = svm.decision_function(X_test)

In [28]:

# Submission frame: one decision score per test session id.
df_res = pd.DataFrame(
    {'session_id': df_test.session_id, 'target': preds},
    columns=['session_id', 'target'],
)

In [29]:
# Write the gzip-compressed submission directly from pandas. The end result is
# the same single file (doge4.csv.gz) that `gzip doge4.csv` left behind, but
# without needing an external gzip binary, and the cell can be re-run when the
# archive already exists.
df_res.to_csv('doge4.csv.gz', index=False, compression='gzip')