## Imports

In [135]:
import pandas as pd
import numpy as np
import pickle

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import hstack

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

## General functions

In [2]:
# Helper that writes predictions to a submission file.
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV file with a 1-based row id column.

    Parameters
    ----------
    predicted_labels : array-like
        Predictions, one per row. Lists, NumPy arrays and pandas Series are
        accepted; a Series is used by position, its own index is ignored.
    out_file : str or file-like
        Destination passed straight to ``DataFrame.to_csv``.
    target : str
        Name of the prediction column.
    index_label : str
        Header of the id column; ids run from 1 to the number of rows.
    """
    # Coerce to an ndarray first: plain lists have no `.shape`, and a Series
    # would otherwise be re-aligned against the new 1..n index by DataFrame().
    predictions = np.atleast_1d(predicted_labels)
    predicted_df = pd.DataFrame(predictions,
                                index=np.arange(1, predictions.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [3]:
def get_auc_lr_valid(X, y, C=1.0, ratio=0.9):
    """Fit logistic regression on the leading rows and score it on the rest.

    The first ``round(n_rows * ratio)`` rows of ``X`` / ``y`` are used for
    fitting; the remaining rows form the hold-out part. No shuffling is done,
    so the row order of the inputs decides which rows are held out.

    Parameters
    ----------
    X : 2-D array or sparse matrix supporting ``X[a:b, :]`` slicing
    y : sequence of labels sliceable with ``y[a:b]``
    C : float
        Passed to ``LogisticRegression`` as ``C``.
    ratio : float
        Share of rows used for fitting.

    Returns
    -------
    The value of ``roc_auc_score`` on the hold-out rows.
    """
    # Split point between the fitting part and the hold-out part.
    split_at = round(X.shape[0] * ratio)
    X_fit, y_fit = X[:split_at, :], y[:split_at]
    X_holdout, y_holdout = X[split_at:, :], y[split_at:]

    # Train the classifier on the leading rows only.
    classifier = LogisticRegression(C=C, n_jobs=-1)
    classifier.fit(X_fit, y_fit)

    # Second column of predict_proba is used as the score for the hold-out rows.
    holdout_scores = classifier.predict_proba(X_holdout)[:, 1]
    return roc_auc_score(y_holdout, holdout_scores)

## Load data

In [4]:
# Load the raw training and test session tables.
# The paths are relative, so they resolve against the kernel's working directory.
train_df = pd.read_csv("../../data/intruder/train_sessions.csv")
test_df = pd.read_csv("../../data/intruder/test_sessions.csv")

In [5]:
# Names of the ten per-session timestamp columns: time1 ... time10.
times = ['time%s' % i for i in range(1, 11)] 
# Parse every timestamp column to datetime, one column at a time.
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Put the training sessions in chronological order of their first timestamp;
# the positional 90/10 splits later in the notebook rely on this ordering.
train_df = train_df.sort_values(by='time1')
# NOTE(review): sorted_test_df is not referenced again in the visible cells;
# test_df itself keeps its original row order.
sorted_test_df = test_df.sort_values(by='time1')

In [6]:
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype(np.int32)
test_df[sites] = test_df[sites].fillna(0).astype(np.int32)

In [7]:
# Last five training rows; the frame is sorted by time1, so these are the latest sessions.
train_df.tail()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
12223,12224,50,2014-04-30 23:33:48,50,2014-04-30 23:33:49,48,2014-04-30 23:33:52,49,2014-04-30 23:33:52,48,...,2014-04-30 23:33:53,52,2014-04-30 23:33:54,49,2014-04-30 23:33:54,303,2014-04-30 23:33:57,304,2014-04-30 23:34:00,0
164437,164438,4207,2014-04-30 23:34:15,753,2014-04-30 23:34:16,753,2014-04-30 23:34:17,52,2014-04-30 23:34:18,50,...,2014-04-30 23:35:16,3346,2014-04-30 23:35:29,3359,2014-04-30 23:36:12,3346,2014-04-30 23:36:42,38,2014-04-30 23:37:13,0
12220,12221,52,2014-04-30 23:38:08,3346,2014-04-30 23:38:10,784,2014-04-30 23:38:13,784,2014-04-30 23:38:18,3346,...,2014-04-30 23:38:24,3324,2014-04-30 23:38:35,7330,2014-04-30 23:38:35,3594,2014-04-30 23:38:35,3329,2014-04-30 23:38:36,0
156967,156968,3328,2014-04-30 23:38:36,3324,2014-04-30 23:38:36,3599,2014-04-30 23:38:38,3413,2014-04-30 23:38:38,753,...,2014-04-30 23:38:40,3599,2014-04-30 23:38:40,3359,2014-04-30 23:39:07,3359,2014-04-30 23:39:08,3346,2014-04-30 23:39:53,0
204761,204762,222,2014-04-30 23:39:53,3346,2014-04-30 23:39:59,3346,2014-04-30 23:40:00,3359,2014-04-30 23:40:05,55,...,2014-04-30 23:40:05,3346,2014-04-30 23:40:05,0,NaT,0,NaT,0,NaT,0


In [8]:
# Separate the label column from the features. This runs after the sort by time1,
# so y_train_df is in the same row order as train_df.
y_train_df = train_df['target']
train_df = train_df.drop('target', axis=1)

In [10]:
# Last five test rows; test_df was not re-sorted, so this is the original file order.
test_df.tail()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
82792,82793,812,2014-10-02 18:20:09,1039,2014-10-02 18:20:09,676,2014-10-02 18:20:09,0,NaT,0,...,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
82793,82794,300,2014-05-26 14:16:40,302,2014-05-26 14:16:41,302,2014-05-26 14:16:44,300,2014-05-26 14:16:44,300,...,1222,2014-05-26 14:17:19,302,2014-05-26 14:17:19,1218,2014-05-26 14:17:19,1221,2014-05-26 14:17:19,1216,2014-05-26 14:17:19
82794,82795,29,2014-05-02 11:21:56,33,2014-05-02 11:21:56,35,2014-05-02 11:21:56,22,2014-05-02 11:22:03,37,...,6779,2014-05-02 11:22:03,30,2014-05-02 11:22:03,21,2014-05-02 11:22:04,23,2014-05-02 11:22:04,6780,2014-05-02 11:22:04
82795,82796,5828,2014-05-03 10:05:25,23,2014-05-03 10:05:27,21,2014-05-03 10:05:27,804,2014-05-03 10:05:27,21,...,3350,2014-05-03 10:05:37,23,2014-05-03 10:05:37,894,2014-05-03 10:05:38,21,2014-05-03 10:05:38,961,2014-05-03 10:05:38
82796,82797,21,2014-11-02 10:46:57,1098,2014-11-02 10:46:57,1098,2014-11-02 10:46:58,1098,2014-11-02 10:47:12,1098,...,1098,2014-11-02 10:47:14,1098,2014-11-02 10:47:15,1098,2014-11-02 10:47:18,1098,2014-11-02 10:47:42,1098,2014-11-02 10:47:47


In [10]:
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# when it comes from a trusted source.
with open(r"../../data/intruder/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# DataFrame view of the site dictionary: the dict values become the index and
# the dict keys become the 'site' column.
sites_dict = pd.DataFrame(list(site_dict.keys()), index=list(site_dict.values()), columns=['site'])
# Prints the number of dictionary entries (the label is Russian for "total sites:").
print(u'всего сайтов:', sites_dict.shape[0])
# Not the last expression of the cell, so this head() is not displayed.
sites_dict.head()

# Smallest index value in the dictionary frame.
print (sites_dict.index.min())

всего сайтов: 48371
1


## Convert sites to a sparse matrix and engineer time features

In [166]:
# TF-IDF vectorizer over rows of site ids. Each input row is already a sequence of
# ids, so the tokenizer only converts the elements to strings (padding id 0 is
# therefore a token too). ngram_range=(1, 3) builds 1-, 2- and 3-grams of
# consecutive ids; max_df=0.6 drops terms that occur in more than 60% of rows.
# NOTE(review): lowercase=False is presumably required because the rows are arrays,
# not strings; confirm before changing it.
full_sites_cv = TfidfVectorizer(tokenizer=lambda x: [str(el) for el in x], ngram_range=(1, 3), lowercase=False, sublinear_tf=True, max_df=0.6, norm='l2')

In [167]:
# Learn the n-gram vocabulary and IDF weights from the training sessions only.
train_df_sites = full_sites_cv.fit_transform(train_df[sites].values)

In [168]:
# Encode the test sessions with the vocabulary fitted on the training data (no refit).
test_df_sites = full_sites_cv.transform(test_df[sites].values)

In [169]:
# Text dump of the sparse matrix as (row, column) -> weight entries.
print (train_df_sites)

  (0, 848826)	0.181188614606
  (0, 836761)	0.176403588498
  (0, 0)	0.479982016485
  (0, 849960)	0.196025359334
  (0, 836762)	0.269644167853
  (0, 1)	0.471864946714
  (0, 849961)	0.290760064542
  (0, 836763)	0.273171629213
  (0, 2)	0.461210497406
  (1, 848826)	0.268072117002
  (1, 836761)	0.260992576814
  (1, 0)	0.380239322662
  (1, 849960)	0.290023372479
  (1, 836762)	0.235622662587
  (1, 1)	0.365234013337
  (1, 849961)	0.254074327388
  (1, 836763)	0.238705057598
  (1, 2)	0.344485723828
  (1, 840076)	0.199373367748
  (1, 850363)	0.332605898554
  (1, 840296)	0.227662220028
  (2, 1043409)	0.378119864476
  (2, 1045108)	0.149641318738
  (2, 1043228)	0.16400925607
  (2, 1044386)	0.114370946745
  :	:
  (253559, 951205)	0.23998157936
  (253559, 605065)	0.23998157936
  (253559, 655748)	0.23998157936
  (253560, 836761)	0.0824240884812
  (253560, 0)	0.152838212889
  (253560, 1)	0.126718396547
  (253560, 2)	0.0771912232282
  (253560, 341683)	0.105889334906
  (253560, 607600)	0.298565768873
  (253

In [170]:
# Time features for the training sessions, all derived from the first timestamp (time1).
times_train_engineered = pd.DataFrame(index=train_df.index)
# Year and month packed into one integer, e.g. 201301.
times_train_engineered['start_month'] = train_df['time1'].apply(lambda ts: 100 * ts.year + ts.month)
# Hour of the session start.
times_train_engineered['start_hour'] = train_df['time1'].apply(lambda ts: ts.hour)
# 1 when the start hour is 12 or earlier (so 12:xx still counts as morning), else 0.
times_train_engineered['morning'] = train_df['time1'].apply(lambda ts: int(ts.hour <= 12))
#times_train_engineered['session_length'] = train_df[times].apply(lambda x: (np.max(x) - np.min(x)) / np.timedelta64(1, 's'), axis=1)

In [171]:
# Only the weekend flag is active; the other candidate features are disabled experiments.
#times_train_engineered['lunch_time'] = train_df['time1'].apply(lambda ts: int(ts.hour >= 15 and ts.hour <= 16))
# 1 when dayofweek is 5 or 6 (Saturday/Sunday in pandas' Monday=0 numbering), else 0.
times_train_engineered['weekend'] = train_df['time1'].apply(lambda ts: int(ts.dayofweek == 5 or ts.dayofweek == 6))
#times_train_engineered['mon_tue'] = train_df['time1'].apply(lambda ts: int(ts.dayofweek == 0 or ts.dayofweek == 1))
#times_train_engineered['night'] = train_df['time1'].apply(lambda ts: int(ts.hour >= 22 or ts.hour <= 3))
#times_train_engineered['day'] = train_df['time1'].apply(lambda ts: int(ts.hour >= 13 and ts.hour <= 18))

In [110]:
# Quick look at the first engineered training rows.
times_train_engineered.head()

Unnamed: 0,start_month,start_hour,morning,weekend
21668,201301,8,1,1
54842,201301,8,1,1
77291,201301,8,1,1
114020,201301,8,1,1
146669,201301,8,1,1


In [172]:
# Same time features as for the training data, computed for the test sessions from time1.
times_test_engineered = pd.DataFrame(index=test_df.index)
# Year and month packed into one integer, e.g. 201410.
times_test_engineered['start_month'] = test_df['time1'].apply(lambda ts: 100 * ts.year + ts.month)
# Hour of the session start.
times_test_engineered['start_hour'] = test_df['time1'].apply(lambda ts: ts.hour)
# 1 when the start hour is 12 or earlier (so 12:xx still counts as morning), else 0.
times_test_engineered['morning'] = test_df['time1'].apply(lambda ts: int(ts.hour <= 12))
#times_test_engineered['session_length'] = test_df[times].apply(lambda x: (np.max(x) - np.min(x)) / np.timedelta64(1, 's'), axis=1)

In [173]:
# Must stay in sync with the training feature cell: only the weekend flag is active.
#times_test_engineered['lunch_time'] = test_df['time1'].apply(lambda ts: int(ts.hour >= 15 and ts.hour <= 16))
# 1 when dayofweek is 5 or 6 (Saturday/Sunday in pandas' Monday=0 numbering), else 0.
times_test_engineered['weekend'] = test_df['time1'].apply(lambda ts: int(ts.dayofweek == 5 or ts.dayofweek == 6))
#times_test_engineered['mon_tue'] = test_df['time1'].apply(lambda ts: int(ts.dayofweek == 0 or ts.dayofweek == 1))
#times_test_engineered['night'] = test_df['time1'].apply(lambda ts: int(ts.hour >= 22 or ts.hour <= 3))
#times_test_engineered['day'] = test_df['time1'].apply(lambda ts: int(ts.hour >= 13 and ts.hour <= 18))

In [174]:
# Quick look at the first engineered test rows.
times_test_engineered.head()

Unnamed: 0,start_month,start_hour,morning,weekend
0,201410,11,1,1
1,201407,11,1,0
2,201412,15,0,0
3,201411,10,1,0
4,201405,15,0,0


In [175]:
# Standardise the time features. The scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler()
times_train_engineered_scaled = scaler.fit_transform(times_train_engineered)
times_test_engineered_scaled = scaler.transform(times_test_engineered)

In [176]:
# Final design matrices: the scaled time features come first, followed by the
# TF-IDF site n-gram columns; the stacked result is converted to CSR.
full_train = csr_matrix(hstack([times_train_engineered_scaled, train_df_sites]))
full_test = csr_matrix(hstack([times_test_engineered_scaled, test_df_sites]))

In [43]:
# (rows, columns) of the assembled training matrix.
full_train.shape

(253561, 1060072)

## Logistic regression

In [177]:
# The sessions are time-ordered, so a random split would leak future data into
# training; hold out the most recent 10% of rows by position instead.
#X_train_sites, X_valid_sites, y_train, y_valid = train_test_split(full_train, y_train_df, test_size = 0.1)
idx = round(full_train.shape[0] * 0.9)
X_train = full_train[:idx, :]
X_valid = full_train[idx:, :]

# NOTE(review): relies on the integer slice of the Series being positional, not label-based.
y_train = y_train_df[:idx]
y_valid = y_train_df[idx:]

In [200]:
%%time

# Sweep the inverse regularisation strength C and collect one validation AUC per value.
# get_auc_lr_valid splits whatever it is given 90/10 by position, so only X_train is
# used here and X_valid stays untouched.
# NOTE(review): the recorded run of this cell was interrupted (KeyboardInterrupt),
# so `scores` may be incomplete.
#Cs = [1]
scores = []

Cs = np.linspace(6.5, 8, num=20)
for C in tqdm(Cs):
    scores.append(get_auc_lr_valid(X_train, y_train, C=C))

  0%|          | 0/20 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [212]:
# NOTE(review): C is pinned by hand; the argmax over the sweep is commented out,
# so the sweep result is not used here.
#C_best = Cs[np.argmax(scores)]
C_best = 4
# Fit on the training part only; X_valid / y_valid stay unseen for the checks that follow.
lr = LogisticRegression(C=C_best, n_jobs=-1).fit(X_train, y_train)
#br = BaggingClassifier(lr, n_estimators=20, max_samples=0.8, max_features=0.8).fit(X_train, y_train)

In [213]:
# ROC AUC of the logistic regression: first on the rows it was fitted on, then on the hold-out rows.
print (roc_auc_score(y_train, lr.predict_proba(X_train)[:,1]))
print (roc_auc_score(y_valid, lr.predict_proba(X_valid)[:,1]))

0.996345239352
0.967462163481


In [214]:
# Hard class predictions on the hold-out rows and their confusion matrix
# (rows are true classes, columns are predicted classes).
y_valid_predictions = lr.predict(X_valid)
confusion_matrix(y_valid, y_valid_predictions)

array([[25083,     9],
       [  252,    12]])

In [215]:
# Display the C value used for the logistic regression.
C_best

4

# XGBoost

In [24]:
import xgboost as xgb

In [28]:
def get_auc_xgb_valid(X, y, max_delta_step, min_child_weight, max_depth, gamma, ratio=0.9):
    """Fit an XGBoost classifier on the leading rows and score it on the rest.

    The first ``round(n_rows * ratio)`` rows of ``X`` / ``y`` are used for
    fitting; the remaining rows form the hold-out part. No shuffling is done.
    Requires the ``xgb`` module alias to be imported before the call.

    Parameters
    ----------
    X : 2-D array or sparse matrix supporting ``X[a:b, :]`` slicing
    y : sequence of labels sliceable with ``y[a:b]``
    max_delta_step, min_child_weight, max_depth, gamma
        Forwarded unchanged to ``xgb.XGBClassifier``.
    ratio : float
        Share of rows used for fitting.

    Returns
    -------
    The value of ``roc_auc_score`` on the hold-out rows.
    """
    # Split point between the fitting part and the hold-out part.
    split_at = round(X.shape[0] * ratio)

    # Objective and number of trees are fixed; the rest comes from the caller.
    model_params = dict(
        objective='binary:logistic',
        n_estimators=150,
        max_delta_step=max_delta_step,
        gamma=gamma,
        min_child_weight=min_child_weight,
        max_depth=max_depth,
    )
    model = xgb.XGBClassifier(**model_params)
    model.fit(X[:split_at, :], y[:split_at], eval_metric='auc')

    # Second column of predict_proba is used as the score for the hold-out rows.
    holdout_scores = model.predict_proba(X[split_at:, :])[:, 1]
    return roc_auc_score(y[split_at:], holdout_scores)

In [None]:
%%time

# Exhaustive grid over max_delta_step (3..6), min_child_weight (1, 3, 5) and
# max_depth (3, 5, 7, 9) with gamma fixed at 0: 48 fits in total.
# get_auc_xgb_valid splits X_train 90/10 by position again, so X_valid is not used here.
scores = []

for max_delta_step in range (3, 7, 1):
    for min_child_weight in range(1, 6, 2):
        for max_depth in range(3, 10, 2):
            new_sc = get_auc_xgb_valid(X_train, y_train, max_delta_step, min_child_weight, max_depth, 0)
                
            # Log every combination as soon as it finishes.
            print ("delta: %d, min_child_weight: %d, max_depth: %d, gamma: %5.3f --> score: %2.16f" % \
                  (max_delta_step, min_child_weight, max_depth, 0, new_sc))                
                
            scores.append(new_sc)

delta: 3, min_child_weight: 1, max_depth: 3, gamma: 0.000 --> score: 0.9603634872515853
delta: 3, min_child_weight: 1, max_depth: 5, gamma: 0.000 --> score: 0.9609752793521080
delta: 3, min_child_weight: 1, max_depth: 7, gamma: 0.000 --> score: 0.9594595287260371
delta: 3, min_child_weight: 1, max_depth: 9, gamma: 0.000 --> score: 0.9612619539270391
delta: 3, min_child_weight: 3, max_depth: 3, gamma: 0.000 --> score: 0.9596967766501179
delta: 3, min_child_weight: 3, max_depth: 5, gamma: 0.000 --> score: 0.9601756659783548
delta: 3, min_child_weight: 3, max_depth: 7, gamma: 0.000 --> score: 0.9584977227128275
delta: 3, min_child_weight: 3, max_depth: 9, gamma: 0.000 --> score: 0.9584523234187134
delta: 3, min_child_weight: 5, max_depth: 3, gamma: 0.000 --> score: 0.9569614691797372
delta: 3, min_child_weight: 5, max_depth: 5, gamma: 0.000 --> score: 0.9598315100391019
delta: 3, min_child_weight: 5, max_depth: 7, gamma: 0.000 --> score: 0.9607808678587643
delta: 3, min_child_weight: 5, m

In [401]:
# Refit XGBoost on the training part with hand-picked hyper-parameters.
# NOTE(review): eval_metric is passed to fit() without an eval_set, so it presumably
# has no effect; confirm against the installed xgboost version.
xgbc_best = xgb.XGBClassifier(objective='binary:logistic', n_estimators=150, \
    max_delta_step=5, gamma=0, min_child_weight=1, \
    max_depth=5).fit(X_train, y_train, eval_metric='auc')

In [402]:
# ROC AUC of the XGBoost model: first on the rows it was fitted on, then on the hold-out rows.
print (roc_auc_score(y_train, xgbc_best.predict_proba(X_train)[:, 1]))
print (roc_auc_score(y_valid, xgbc_best.predict_proba(X_valid)[:, 1]))

0.992597896517
0.987138693245


In [403]:
# Hard class predictions of the XGBoost model on the hold-out rows. This rebinds
# y_valid_predictions, which the analysis cells below read.
y_valid_predictions = xgbc_best.predict(X_valid)
confusion_matrix(y_valid, y_valid_predictions)

array([[25053,    39],
       [  185,    79]])

# Analyze results

In [336]:
def get_insights(indcs, df):
    """Print a short profile of the sessions at positions ``indcs`` of ``df``.

    Printed sections: the ten most frequent site ids with their names, the most
    common start weekdays and start hours, and summary statistics of the
    session length in seconds.

    Relies on the notebook globals ``sites`` and ``times`` (column name lists)
    and ``sites_dict`` (site names indexed by site id).

    Parameters
    ----------
    indcs : array-like of int
        Positional row numbers into ``df`` (used with ``iloc``).
    df : pandas.DataFrame
        Session table containing the ``sites`` and ``times`` columns.

    Returns
    -------
    None; everything is printed.
    """
    # Explicit copy so the helper columns are never written into the caller's frame.
    act_df = df.iloc[indcs].copy()
    act_df['dow'] = act_df['time1'].apply(lambda ts: int(ts.dayofweek))
    act_df['hour'] = act_df['time1'].apply(lambda ts: int(ts.hour))
    # First and last timestamp of each session; the difference is its length.
    act_df['min'] = act_df[times].min(axis=1)
    act_df['max'] = act_df[times].max(axis=1)
    act_df['seconds'] = (act_df['max'] - act_df['min']) / np.timedelta64(1, 's')

    # top sites (value_counts already returns counts in descending order)
    top_sites = pd.Series(act_df[sites].values.flatten()).value_counts().head(10)
    # `.ix` was removed in pandas 1.0. reindex keeps the old behaviour of returning
    # NaN for ids missing from the dictionary (e.g. the padding id 0), whereas
    # `.loc` would raise a KeyError.
    print(sites_dict.reindex(top_sites.index))

    # day of week and start time
    print("\ndow, start time")
    print(pd.Series(act_df['dow'].values).value_counts().head())
    print()
    print(pd.Series(act_df['hour'].values).value_counts().head())

    # distribution of session length in seconds
    print("\nsession length")
    print(pd.Series(act_df['seconds'].values).describe())

In [337]:
# true negatives (taking target == 1 as the positive class): true value 0, predicted 0.
# np.where yields positions inside the validation slice; adding idx shifts them to positions in train_df.
get_insights(np.where((y_valid == y_valid_predictions) & (y_valid == 0))[0] + idx, train_df)

                       site
0                       NaN
782          annotathon.org
21            www.google.fr
23           www.google.com
786        www.phylogeny.fr
22          apis.google.com
167            www.bing.com
52      clients1.google.com
29         www.facebook.com
302  ent.univ-bpclermont.fr

dow, start time
4    7084
3    5543
2    4471
0    4272
1    2939
dtype: int64

14    3365
13    2935
11    2769
10    2753
15    2351
dtype: int64

session length
count    24828.000000
mean       151.112293
std        310.956469
min          0.000000
25%          6.000000
50%         31.000000
75%        131.000000
max       1800.000000
dtype: float64


In [338]:
# true positives (taking target == 1 as the positive class): true value 1, predicted 1
get_insights(np.where((y_valid == y_valid_predictions) & (y_valid == 1))[0] + idx, train_df)

                                     site
77                           i1.ytimg.com
76                        www.youtube.com
75                            s.ytimg.com
78                          yt3.ggpht.com
80                          s.youtube.com
617                         gg.google.com
22                        apis.google.com
81   r4---sn-gxo5uxg-jqbe.googlevideo.com
82   r2---sn-gxo5uxg-jqbe.googlevideo.com
881  r3---sn-gxo5uxg-jqbe.googlevideo.com

dow, start time
0    68
1    39
dtype: int64

16    58
17    39
18     8
15     2
dtype: int64

session length
count     107.000000
mean       64.130841
std       165.581484
min         1.000000
25%         6.000000
50%        12.000000
75%        59.000000
max      1522.000000
dtype: float64


In [347]:
# false positives: true value 0, predicted 1
get_insights(np.where((y_valid != y_valid_predictions) & (y_valid == 0))[0] + idx, train_df)

                                      site
77                            i1.ytimg.com
76                         www.youtube.com
617                          gg.google.com
80                           s.youtube.com
81    r4---sn-gxo5uxg-jqbe.googlevideo.com
879   r1---sn-gxo5uxg-jqbe.googlevideo.com
82    r2---sn-gxo5uxg-jqbe.googlevideo.com
1307              www.youtube-nocookie.com
881   r3---sn-gxo5uxg-jqbe.googlevideo.com
78                           yt3.ggpht.com

dow, start time
1    9
dtype: int64

17    5
16    4
dtype: int64

session length
count      9.000000
mean     179.777778
std      147.420129
min        3.000000
25%       55.000000
50%      184.000000
75%      267.000000
max      461.000000
dtype: float64


In [346]:
# false negatives: true value 1, predicted 0
get_insights(np.where((y_valid != y_valid_predictions) & (y_valid == 1))[0] + idx, train_df)

                                       site
77                             i1.ytimg.com
76                          www.youtube.com
75                              s.ytimg.com
37                              twitter.com
22                          apis.google.com
29                         www.facebook.com
78                            yt3.ggpht.com
80                            s.youtube.com
881    r3---sn-gxo5uxg-jqbe.googlevideo.com
25383               www.cjn.justice.gouv.fr

dow, start time
0    156
1     84
dtype: int64

17    114
16    106
18     14
15      6
dtype: int64

session length
count     240.000000
mean       42.725000
std       124.057448
min         0.000000
25%         5.000000
50%        11.500000
75%        40.000000
max      1522.000000
dtype: float64


## Prediction

In [216]:
# Final logistic regression: same C, refitted on all training rows (training part plus hold-out).
lr_best = LogisticRegression(C=C_best).fit(full_train, y_train_df)

In [154]:
# `br` is only created in a commented-out line earlier in the notebook, so on a
# fresh kernel `br.fit(...)` raises NameError. Build the bagging ensemble here
# with the settings from that disabled experiment, then fit it on all training rows.
br_best = BaggingClassifier(LogisticRegression(C=C_best, n_jobs=-1),
                            n_estimators=20, max_samples=0.8,
                            max_features=0.8).fit(full_train, y_train_df)

In [404]:
# Final XGBoost model fitted on all training rows. This rebinds xgbc_best, so the
# model that was evaluated on the hold-out rows is no longer available afterwards.
xgbc_best = xgb.XGBClassifier(objective='binary:logistic', n_estimators=150, \
    max_delta_step=5, gamma=0, min_child_weight=1, max_depth=5).fit(full_train, y_train_df, eval_metric='auc')

In [217]:
# Score the test sessions with the logistic regression (the XGBoost variant is disabled)
# and write the second predict_proba column as the submission.
# write_to_submission_file numbers the rows 1..n, so this presumably relies on
# full_test being in the original test file order; verify before reordering test_df.
y_pred = lr_best.predict_proba(full_test)[:, 1]
#y_pred = xgbc_best.predict_proba(full_test)[:, 1]
write_to_submission_file(y_pred, 'baseline1.csv')