In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
os.environ['CUDA_VISIBLE_DEVICES'] = "3"

import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, TimeDistributed
from keras.layers.normalization import BatchNormalization
from tqdm import tqdm_notebook, tqdm
from IPython.display import clear_output, display
from keras import backend as K

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

from scipy import stats
from scipy.stats.mstats import gmean
from glob import glob
from multiprocessing import Pool
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import StratifiedKFold
import xgboost

# Read data

In [211]:
data = pd.read_csv('./train_labels.csv')
data['train'] = 1

In [114]:
def get_percentile_features(data, axis=0):
    top_value = np.percentile(data, 10, axis=0)
    data_slice = data[data >= top_value]
    return list(np.std(data_slice, axis=0))

In [214]:
%%time
functions = [np.max, np.min, np.std, np.mean, np.median, 
#              lambda x, axis: np.percentile(x, 95, axis=0),  lambda x, axis: np.percentile(x, 5, axis=0),  
#              lambda x, axis: np.percentile(x, 25, axis=0),  lambda x, axis: np.percentile(x, 75, axis=0),
#              lambda x, axis: x.values.argmax(axis=0),  lambda x, axis: x.values.argmin(axis=0), 
#             get_percentile_features
            ]

def calc_feats(filename):
    #data = np.load(filename)
    data = pd.read_csv(filename, header=None).drop([0], axis=1)
    v = []
    for func in functions:
        v.extend(list(func(data, axis=0)))
    
    derivatives = (data - data.drop(0).reset_index().drop("index", axis=1)).dropna()
    for func in functions:
        v.extend(list(func(derivatives, axis=0)))

#     data = data - data.ewm(alpha=0.1).mean().shift() 
#     for func in functions:
#         v.extend(list(func(data, axis=0)))
    return v

files = ['/mnt/kaspersky/data_kasp/train/'+f for f in data.SeriesId][:]

pool = Pool(16)
X = None
try: X = pd.DataFrame(pool.map(calc_feats, files))
except ValueError as error: print(error)
pool.terminate()
X.fillna(-999, inplace=True)
y = data.copy()[:len(X)]
print X.shape

(500, 550)
CPU times: user 424 ms, sys: 476 ms, total: 900 ms
Wall time: 1min 35s


# XGB

In [213]:
param = {}
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'auc'
param['max_depth'] = 7
# param['booster'] = 'dart'
param['booster'] = 'gblinear'
param['eta'] = 0.01
# param['subsample'] = 0.9
# param["scale_pos_weight"] = 0.5
param['colsample_bytree'] = 0.1
param['nthread'] = 8
param['alpha'] = 0.00
param['lambda_bias'] = 0.1
# param['lambda'] = 0
# param['min_child_weight'] = 5

# if param['booster'] == 'gblinear': param['eta'] *= 1e-4
numround = 15001
# scores, , []

skf = StratifiedKFold(5, shuffle=True, random_state=0)
sc,sc2,sc_mean = [],[],[]
pred_train1 = np.zeros(len(y))
pred_train2 = np.zeros(len(y))

xgbs = []
ets = []

for itr, ite in skf.split(y[y['train'] == 1]['SeriesId'].values, y[y['train'] == 1]['Attack'].values):

    ypred,ypred2 = [],[]
    for i in range(1):
        param['seed'] = i+1

        #xgboost
        Xdatatrain = xgboost.DMatrix(data=X.iloc[itr].values,
                                     label=y.iloc[itr]['Attack'].values)
        Xdataval = xgboost.DMatrix(data=X.iloc[ite].values,
                                    label=y.iloc[ite]['Attack'].values)

        plst = list(param.items())
        watchlist = [(Xdatatrain, 'train'), (Xdataval, 'eval')]
        bst = xgboost.train(plst, Xdatatrain, 100000, evals=watchlist, verbose_eval=1000, 
                            early_stopping_rounds=int(5 / param['eta'] ** 0.5))

        ypred.append(bst.predict(Xdataval))
        xgbs.append(bst)
        
        # extra trees
        clf = ExtraTreesClassifier(n_estimators=1000, n_jobs=16, criterion='entropy',
                                  max_features=0.15, min_samples_split=5)
        clf.fit(X.iloc[itr].values,y.iloc[itr]['Attack'].values)
        ypred2.append(clf.predict_proba(X.iloc[ite].values)[:,1])
        ets.append(clf)
        
    ypred = sum(ypred) / len(ypred)
    ypred2 = sum(ypred2) / len(ypred2)
    pred_train1[ite] = ypred
    pred_train2[ite] = ypred2
    
    sc.append(auc(y.iloc[ite]['Attack'].values, ypred))
    sc2.append(auc(y.iloc[ite]['Attack'].values, ypred2))
    sc_mean.append(auc(y.iloc[ite]['Attack'].values, (ypred2 + ypred) / 2))
#     break

print('XGB: {:.3f} +- {:.3f}'.format(np.mean(sc), np.std(sc)))
print('ET: {:.3f} +- {:.3f}'.format(np.mean(sc2), np.std(sc2)))
print('MEAN: {:.3f} +- {:.3f}'.format(np.mean(sc_mean), np.std(sc_mean)))

[0]	train-auc:0.833119	eval-auc:0.657229
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[356]	train-auc:0.97603	eval-auc:0.940369

[0]	train-auc:0.841431	eval-auc:0.78121
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[89]	train-auc:0.959352	eval-auc:0.885886

[0]	train-auc:0.837731	eval-auc:0.766624
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[108]	train-auc:0.96021	eval-auc:0.937795

[0]	train-auc:0.830009	eval-auc:0.763192
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[223]	train-auc:0.977102	eval-auc:0.9300

##### top score (yet)
    XGB: 0.923 +- 0.021
    ET: 0.938 +- 0.017
    MEAN: 0.957 +- 0.009
    
    XGB: 0.921 +- 0.021
    ET: 0.937 +- 0.017
    MEAN: 0.948 +- 0.017

# PLOTTING

In [106]:
# fnames = [item for sublist in [["%d_max" % i, "%d_min" % i, "%d_std" % i, "%d_mean" % i, "%d_median" % i, 
# #                                 "%d_argmax" % i, "%d_argmin"
#                                ] for i in range(56)] for item in sublist]
# fnames = fnames + [item + "_der" for item in fnames]
# print (len(fnames))

In [104]:
# ###### plot 
# import operator
# importance = xgbs[0].get_fscore()
# importance = sorted(importance.items(), key=operator.itemgetter(1))

# df = pd.DataFrame(importance, columns=['feature', 'fscore'])
# df.feature = [fnames[int(item[1:])] for item in df.feature]
# df['fscore'] = df['fscore'] / df['fscore'].sum()
# df = df.loc[:50]
# plt.figure()
# df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(13,30))
# plt.title('XGBoost Feature Importance')
# plt.xlabel('relative importance')
# plt.show()

In [None]:
# plt.figure(figsize=(40,40))
# sns.heatmap(X.corr())

In [None]:
# sns.regplot(pred_train1, pred_train2)

# XGB PREDICT

In [155]:
# predictions name

In [203]:
subm = pd.read_csv('./sample_submission.csv')

In [204]:
%%time
files = ['/mnt/kaspersky/data_kasp/test/'+f for f in subm.SeriesId][:]

pool = Pool(16)
X_test = None
try: X_test = pd.DataFrame(pool.map(calc_feats, files))
except ValueError as error: print(error)
pool.terminate()
X_test.fillna(-999, inplace=True)
y_test = subm.copy()[:len(X_test)]
print X_test.shape
X_test.columns = ["f%d" % item for item in X_test.columns]

(500, 770)
CPU times: user 828 ms, sys: 384 ms, total: 1.21 s
Wall time: 3min 32s


# XGB predictions

In [205]:
pred_name = "best_4_knn_2"

In [206]:
Xdatatrain = xgboost.DMatrix(data=X_test)

In [207]:
preds = []
for model in xgbs:
    preds.append(model.predict(Xdatatrain))
mean_preds = sum(preds) / len(preds)
mean_preds = mean_preds

In [208]:
# !mkdir submissions
subm.Attack = mean_preds
subm.Attack = subm.Attack.map(lambda x: np.round(x, 3))
subm.to_csv("./submissions/xgb_%s.csv" % pred_name, index=False)

# Extra trees predictions

In [209]:
preds_et = []
for model in ets:
    preds_et.append(model.predict_proba(X_test))
mean_preds_et = sum(preds_et) / len(preds_et)
# mean_preds_et = gmean(preds_et)
mean_preds_et = mean_preds_et[:, 1]

In [210]:
subm.Attack = mean_preds_et
subm.Attack = subm.Attack.map(lambda x: np.round(x, 3))
subm.to_csv("./submissions/et_%s.csv" % pred_name, index=False)