In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

from time import time

In [2]:
from tqdm import tqdm_notebook as tqdm

In [3]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
import pickle 

with open('./site_dic.pkl', 'rb') as f:
    site_dict = pickle.load(f)

In [5]:
sites = ['site%s' % i for i in range(1, 11)]
times = ['time%d' % i for i in range(1, 11)]

In [6]:
import pickle

In [7]:
with open('recreated.bin', 'rb') as f:
    df_train = pickle.load(f)

df = df_train
df['sites'] = df.site1 + ' ' + df.site2 + ' ' + df.site3 + ' ' + df.site4 + ' ' + df.site5 + ' ' + \
              df.site6 + ' ' + df.site7 + ' ' + df.site8 + ' ' + df.site9 + ' ' + df.site10  

In [8]:
cv = TfidfVectorizer(token_pattern='\S+', min_df=10, max_df=0.5, stop_words={'na'}, ngram_range=(1, 3))
X_ohe = cv.fit_transform(df_train.sites)

In [9]:
df_train['hour_start'] = df_train.time1.dt.hour
df_train['weekday'] = df_train.time1.dt.weekday

In [10]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
time_ohe = OneHotEncoder(dtype=np.uint8)
X_time = time_ohe.fit_transform(df_train[['hour_start', 'weekday']])

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
num_f = ['hour_start']
X_num = df_train[num_f].values

scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)



In [14]:
y = df_train.target.values
X_all_sp = sp.hstack([X_ohe, X_time, X_num], format='csr')
X_sparse = sp.hstack([X_ohe, X_time], format='csr')

In [15]:
from sklearn.decomposition import TruncatedSVD

In [16]:
from sklearn.model_selection import KFold

In [17]:
kf = KFold(n_splits=3, shuffle=True, random_state=1)
kf = list(kf.split(y))

In [26]:
%%time

C = 0.7

lr_preds = np.zeros_like(y)

for train, val in kf:
    svm = LogisticRegression(penalty='l2', dual=False, C=C, random_state=1, class_weight={0: 1, 1: 2})
    svm.fit(X_all_sp[train], y[train])

    lr_preds[val] = svm.decision_function(X_all_sp[val])
    auc = 1 - roc_auc_score(y[val], lr_preds[val])

    print(auc)

0.0796028273713
0.0880304237771
0.0789285060859
CPU times: user 9min 17s, sys: 15.3 s, total: 9min 32s
Wall time: 2min 26s


In [27]:
svm.decision_function(X_all_sp[val])

array([ 4.55015761,  3.43939876,  2.08715628, ..., -1.89919592,
       -2.20399955, -2.28901872])

In [36]:
1 - roc_auc_score(y[val], lr_preds[val])

0.92107149391410037

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
lr = LabelEncoder()
y_all = lr.fit_transform(df_train.user.values)

In [20]:
svd = TruncatedSVD(n_components=120, random_state=1)
X_svd = svd.fit_transform(X_sparse)

In [21]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam
from keras.regularizers import l1, l2
from keras.callbacks import EarlyStopping
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU, ELU, LeakyReLU
from keras.callbacks import EarlyStopping, Callback

Using Theano backend.


In [22]:
from sklearn.metrics import log_loss

In [53]:
class WatchlistCallback(Callback):
    def __init__(self, watchlist):
        super(Callback, self).__init__()
        self.X, self.y = watchlist

    def on_epoch_end(self, epoch, logs={}):
        y_pred = self.model.predict(self.X, verbose=0)

        print("epoch no %d" % (epoch), end=', ')
        auc = roc_auc_score(self.y == 0, y_pred[:, 0])
    
        print('logloss=%.4f, auc=%.4f' % (ll, auc))


In [24]:
input_dim = X_svd.shape[1]
output_dim = len(lr.classes_)

In [46]:
for train, val in kf:
    X_train = X_svd[train]
    y_train = y_all[train]
    X_val= X_svd[val]
    y_val = y_all[val]
    break

In [47]:
watchlist = WatchlistCallback(watchlist=(X_val, y_val))

In [30]:
from keras_tqdm import TQDMNotebookCallback

In [61]:
model = Sequential()

#model.add(Dense(input_dim=input_dim, units=350, kernel_initializer='glorot_uniform')) 
#model.add(PReLU())
#model.add(Dropout(0.2))

#model.add(BatchNormalization())

#model.add(Dense(units=output_dim, kernel_initializer='glorot_uniform')) 

model.add(Dense(input_dim=input_dim, units=output_dim, kernel_initializer='glorot_uniform')) 
model.add(Activation('sigmoid'))

model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=0.001))

In [62]:
epochs = 1000

In [63]:
model.fit(X_train, y_train, epochs=epochs, 
          batch_size=50000, callbacks=[watchlist, TQDMNotebookCallback()], verbose=0)

KeyboardInterrupt: 

In [44]:
model.fit(X_train, y_train, epochs=epochs, 
          batch_size=50000, callbacks=[watchlist, TQDMNotebookCallback()], verbose=0)

          100000/|/[loss: 7.324]   6%|| 100000/1608586 [00:31<02:51, 8793.04it/s]          50000/|/[loss: 7.282]   3%|| 50000/1608586 [00:16<02:49, 9209.24it/s]

INFO (theano.gof.compilelock): Refreshing lock /home/agrigorev/.theano/compiledir_Linux-4.2--generic-x86_64-with-debian-jessie-sid-x86_64-3.5.2-64/lock_dir/lock


1600000/|/[loss: 7.276]  99%|| 1600000/1608586 [03:03<00:00, 9850.67it/s]epoch no 0, 

ValueError: y_true and y_pred contain different number of classes 2, 1558. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1]

In [57]:
del p

In [58]:
import gc

In [60]:
gc.collect()

33050

Test

In [38]:
df_test = pd.read_csv('test_sessions.csv', dtype={s: 'str' for s in sites})

for i in range(1, 11):
    s = 'site%d' % i
    df_test[s] = df_test[s].fillna('na')

    t = 'time%d' % i
    df_test[t] = pd.to_datetime(df_test[t])

In [39]:
df_test['hour_start'] = df_test.time1.dt.hour
df_test['weekday'] = df_test.time1.dt.weekday

In [40]:
df_test['alice_weight'] = df_test[sites].applymap(alice_weight.get).mean(axis=1).fillna(0)

In [41]:
df = df_test
df['sites'] = df.site1 + ' ' + df.site2 + ' ' + df.site3 + ' ' + df.site4 + ' ' + df.site5 + ' ' + \
              df.site6 + ' ' + df.site7 + ' ' + df.site8 + ' ' + df.site9 + ' ' + df.site10  

In [52]:
X_test_ohe = cv.transform(df_test.sites)
X_test_time = time_ohe.transform(df_test[['hour_start', 'weekday']])
X_test_num = scaler.transform(df_test[fnum].values)

In [54]:
X_test = sp.hstack([X_test_ohe, X_test_time, X_test_num], format='csr')
#X_test = sp.hstack([X_test_ohe, X_test_time], format='csr')

In [55]:
df_res = pd.DataFrame()
df_res['session_id'] = df_test.session_id
df_res['target'] = svm.decision_function(X_test)

In [56]:
df_res.to_csv('lr-full-11.csv', index=False)
!gzip lr-full-11.csv