In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
# os.environ['CUDA_VISIBLE_DEVICES'] = "3"

import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, TimeDistributed
from keras.layers.normalization import BatchNormalization
from tqdm import tqdm_notebook, tqdm
from IPython.display import clear_output, display
from keras import backend as K

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

from scipy.fftpack import ifftn, fftn

from scipy import stats
from glob import glob
from multiprocessing import Pool
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import StratifiedKFold
import xgboost

In [3]:
# Load the label table and tag every row as belonging to the training set
# (the `train` column is used later to select rows for cross-validation).
data = pd.read_csv('train_labels.csv').assign(train=1)

In [8]:
# Reducers applied column-wise to every series file; `fftn` is handled
# specially inside calc_feats (length-`n_size` transform, real part only).
functions = [np.max, np.min, np.std, np.mean, np.median, fftn]
n_size = 5


def calc_feats(filename):
    """Build the flat feature vector for one headerless CSV series file.

    For each entry of the module-level ``functions`` list, in order:
    * ``fftn`` -> FFT along axis 0 with ``shape=(n_size,)``; the real part of
      the result is flattened row-major and appended;
    * anything else -> ``func(table, axis=0)``, i.e. one value per column.

    Returns a plain Python list with all pieces concatenated.
    """
    table = pd.read_csv(filename, header=None)

    def _features_for(reducer):
        # The FFT entry yields n_size values per column instead of one.
        if reducer is fftn:
            spectrum = fftn(table, shape=(n_size,), axes=(0,))
            return list(np.real(spectrum).ravel())
        return list(reducer(table, axis=0))

    return [value for reducer in functions for value in _features_for(reducer)]

# Full path of every training series listed in the labels table.
files = ['/mnt/kaspersky/data_kasp/train/' + f for f in data.SeriesId]

# Extract features in parallel. The pool is torn down in `finally` so worker
# processes never outlive a failure, and any exception (including the
# ValueError that used to be printed and swallowed) now surfaces directly
# instead of leaving X as None and failing later with an unrelated error.
pool = Pool(16)
try:
    X = pd.DataFrame(pool.map(calc_feats, files))
finally:
    pool.terminate()

# Rows can have different lengths, in which case the DataFrame constructor
# pads with NaN; -999 is used as the missing-value sentinel.
X = X.fillna(-999)

# Keep the label rows aligned with the feature rows (same order as `files`).
y = data.copy()[:len(X)]
print(X.shape)

(500, 560)


In [9]:
# ---------------------------------------------------------------------------
# 5-fold stratified CV comparing XGBoost against ExtraTrees on the same folds.
# Out-of-fold predictions end up in pred_train1 (XGB) and pred_train2 (ET);
# per-fold AUCs are collected in sc / sc2.
# ---------------------------------------------------------------------------

# XGBoost booster configuration: binary classification scored by AUC.
param = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 7,
    'booster': 'gbtree',
    'eta': 0.005,
    'colsample_bytree': 0.3,
    'nthread': 8,
    'min_child_weight': 1,
}

# NOTE(review): `numround` is not consumed anywhere in this cell (training is
# capped by `max_boost_rounds` below); kept in case another cell reads it.
numround = 15001
max_boost_rounds = 100000
# Early-stopping patience grows as the learning rate shrinks (70 for eta=0.005).
early_stop_rounds = int(5 / param['eta'] ** 0.5)
# Number of differently-seeded models averaged per fold.
n_seeds = 1

# Loop-invariant views of the feature matrix and the target.
features = X.values
labels = y['Attack'].values

skf = StratifiedKFold(5, shuffle=True, random_state=0)
sc, sc2 = [], []
pred_train1 = np.zeros(len(y))
pred_train2 = np.zeros(len(y))

# NOTE(review): the split is computed on the train == 1 subset but the
# resulting positions index X / y directly; that only lines up while every
# row of `y` has train == 1 -- confirm if unlabeled rows are ever added.
train_rows = y[y['train'] == 1]
for itr, ite in skf.split(train_rows['SeriesId'].values, train_rows['Attack'].values):

    ypred, ypred2 = [], []
    for seed in range(1, n_seeds + 1):
        param['seed'] = seed

        Xdatatrain = xgboost.DMatrix(data=features[itr], label=labels[itr])
        Xdataval = xgboost.DMatrix(data=features[ite], label=labels[ite])

        plst = list(param.items())
        watchlist = [(Xdatatrain, 'train'), (Xdataval, 'eval')]
        bst = xgboost.train(plst, Xdatatrain, max_boost_rounds, evals=watchlist,
                            verbose_eval=1000,
                            early_stopping_rounds=early_stop_rounds)

        # Score with the trees up to the best iteration only. Without the
        # limit, predict() also uses the `early_stop_rounds` extra trees that
        # were built after the validation AUC stopped improving.
        # NOTE(review): newer xgboost releases replace `ntree_limit` with
        # `iteration_range`; adjust if the library is upgraded.
        # The best iteration is still chosen on this same fold, so the
        # reported AUC remains somewhat optimistic.
        ypred.append(bst.predict(Xdataval, ntree_limit=bst.best_ntree_limit))

        # Seed the forest as well so the ET scores are reproducible run to run.
        clf = ExtraTreesClassifier(n_estimators=1000, n_jobs=16, criterion='entropy',
                                   max_features=0.1, min_samples_split=10,
                                   random_state=seed)
        clf.fit(features[itr], labels[itr])
        ypred2.append(clf.predict_proba(features[ite])[:, 1])

    # Average over seeds, then store as out-of-fold predictions.
    ypred = sum(ypred) / len(ypred)
    ypred2 = sum(ypred2) / len(ypred2)
    pred_train1[ite] = ypred
    pred_train2[ite] = ypred2

    sc.append(auc(labels[ite], ypred))
    sc2.append(auc(labels[ite], ypred2))

print('XGB: {:.3f} +- {:.3f}'.format(np.mean(sc), np.std(sc)))
print('ET: {:.3f} +- {:.3f}'.format(np.mean(sc2), np.std(sc2)))

[0]	train-auc:0.993136	eval-auc:0.798584
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 70 rounds.
Stopping. Best iteration:
[119]	train-auc:1	eval-auc:0.853282

[0]	train-auc:0.960586	eval-auc:0.834191
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 70 rounds.
Stopping. Best iteration:
[4]	train-auc:0.998043	eval-auc:0.908408

[0]	train-auc:0.982572	eval-auc:0.779923
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 70 rounds.
Stopping. Best iteration:
[195]	train-auc:1	eval-auc:0.871729

[0]	train-auc:0.982384	eval-auc:0.826684
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 70 rounds.
Stopping. Best iteration:
[123]	train-auc:1	eval-auc:0.9142

[0]	train-auc:0.9