In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
os.environ['CUDA_VISIBLE_DEVICES'] = "3"

import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, TimeDistributed
from keras.layers.normalization import BatchNormalization
from tqdm import tqdm_notebook, tqdm
from IPython.display import clear_output, display
from keras import backend as K

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from scipy import stats
from scipy.stats.mstats import gmean
from glob import glob
from multiprocessing import Pool
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import StratifiedKFold
import xgboost

# Read data

In [3]:
# Training labels, with a constant `train` flag (1) added to every row.
data = pd.read_csv('./train_labels.csv').assign(train=1)

In [4]:
%%time
# Column-wise reducers applied to every series and to its step differences.
# Tried earlier and currently disabled: 95/75/25/5th percentiles,
# argmax/argmin positions and a `get_percentile_features` helper.
functions = [
    np.max,
    np.min,
    np.std,
    np.mean,
    np.median,
]


def calc_feats(filename):
    """Build a flat feature vector for one series file.

    The file is read as a header-less CSV and its first column (label 0) is
    discarded. Every reducer in `functions` is applied column-wise to the
    remaining columns, and then to the row-to-row differences of those columns.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    list
        ``2 * len(functions) * n_columns`` values: all reducers over the raw
        columns first, then all reducers over the differences.
    """
    series = pd.read_csv(filename, header=None).drop([0], axis=1)
    # Row i minus row i + 1; the unmatched final row turns into NaN and is dropped.
    steps = (series - series.shift(-1)).dropna()
    return [
        value
        for frame in (series, steps)
        for func in functions
        for value in func(frame, axis=0)
    ]

# One training file per labelled series, in the same order as `data`.
files = ['/mnt/kaspersky/data_kasp/train/' + f for f in data.SeriesId]

# Extract features in parallel. The pool is torn down in `finally` so worker
# processes are not leaked when a worker raises, and the original exception
# propagates instead of being printed and resurfacing later as an
# AttributeError on `X = None`.
pool = Pool(16)
try:
    X = pd.DataFrame(pool.map(calc_feats, files))
finally:
    pool.terminate()

# Ragged feature vectors leave NaN holes in the frame; fill them with a sentinel.
X = X.fillna(-999)
# Label rows aligned positionally with the rows of `X`.
y = data.copy()[:len(X)]
print(X.shape)

(500, 550)
CPU times: user 284 ms, sys: 92 ms, total: 376 ms
Wall time: 1min 17s


In [5]:
# Keep an unscaled copy of the feature frame: the scaling cell rebinds `X`,
# and fitting from `x_copy` keeps that cell safe to re-run.
x_copy = X.copy()

In [13]:
# Standardise the training features. The fitted `scaler` is reused further
# down to transform the test features.
scaler = StandardScaler().fit(x_copy)
X = scaler.transform(x_copy)

# KNN

In [7]:
skf = StratifiedKFold(5, shuffle=True, random_state=0)
# `sc` collects the per-fold AUC. `sc2`, `sc_mean` and `pred_train2` are not
# used in this cell; they are kept only because they are notebook-level names.
sc, sc2, sc_mean = [], [], []
pred_train1 = np.zeros(len(y))   # out-of-fold KNN probabilities, aligned with `y`
pred_train2 = np.zeros(len(y))

knns = []                        # one fitted model per fold, reused for test prediction

# Positions (in `y`, and therefore in `X`) of the labelled training rows.
# Folds are built over these positions and mapped back before indexing, so the
# split stays correct even if some rows of `y` do not have `train == 1`.
train_pos = np.flatnonzero((y['train'] == 1).values)
train_labels = y['Attack'].values[train_pos]

for fold_itr, fold_ite in skf.split(train_pos, train_labels):
    itr, ite = train_pos[fold_itr], train_pos[fold_ite]

    # knn
    # With the 500 rows / 5 folds of the recorded run every training fold has
    # 400 rows, so 400 neighbours means all training rows vote (weighted by
    # distance). The cap keeps the cell working on smaller data sets.
    clf = KNeighborsClassifier(n_neighbors=min(400, len(itr)), weights='distance',
                               n_jobs=20, p=20)
    clf.fit(X[itr], y.iloc[itr]['Attack'])
    knns.append(clf)

    # Probability of the positive class for the held-out rows of this fold.
    ypred = clf.predict_proba(X[ite])[:, 1]
    pred_train1[ite] = ypred

    sc.append(auc(y.iloc[ite]['Attack'].values, ypred))

print('KNN: {:.3f} +- {:.3f}'.format(np.mean(sc), np.std(sc)))

KNN: 0.898 +- 0.022


In [8]:
# Out-of-fold predictions stored on disk for the XGB and ET models, flattened to 1-D.
xgbpred, extpred = [
    pd.read_csv(oof_path).values.ravel()
    for oof_path in ('submissions/xgb_out_of_fold.csv', 'submissions/et_out_of_fold.csv')
]
# Unweighted mean of the three models' out-of-fold predictions.
oof_blend = (xgbpred + pred_train1 + extpred) / 3.0
auc(y.Attack.values, oof_blend)

0.95797511797511803

In [None]:
# `ypred` is left over from the cross-validation loop, so this shows the
# held-out probabilities of the last fold only.
plt.plot(ypred)

##### Top scores (so far)
    XGB: 0.923 +- 0.021
    ET: 0.938 +- 0.017
    MEAN: 0.957 +- 0.009
    KNN: 0.898 +- 0.022
    BEST MIX: 0.95797511797511803

# PLOTTING

In [106]:
# fnames = [item for sublist in [["%d_max" % i, "%d_min" % i, "%d_std" % i, "%d_mean" % i, "%d_median" % i, 
# #                                 "%d_argmax" % i, "%d_argmin"
#                                ] for i in range(56)] for item in sublist]
# fnames = fnames + [item + "_der" for item in fnames]
# print (len(fnames))

In [104]:
# ###### plot 
# import operator
# importance = xgbs[0].get_fscore()
# importance = sorted(importance.items(), key=operator.itemgetter(1))

# df = pd.DataFrame(importance, columns=['feature', 'fscore'])
# df.feature = [fnames[int(item[1:])] for item in df.feature]
# df['fscore'] = df['fscore'] / df['fscore'].sum()
# df = df.loc[:50]
# plt.figure()
# df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(13,30))
# plt.title('XGBoost Feature Importance')
# plt.xlabel('relative importance')
# plt.show()

In [None]:
# plt.figure(figsize=(40,40))
# sns.heatmap(X.corr())

In [None]:
# sns.regplot(pred_train1, pred_train2)

# PREDICT ON TEST SET

In [155]:
# predictions name
# pred_name = ""

In [9]:
# Submission template; its `SeriesId` column lists the test files to score.
subm = pd.read_csv('./sample_submission.csv')

In [10]:
%%time
# One test file per row of the submission template, in template order.
files = ['/mnt/kaspersky/data_kasp/test/' + f for f in subm.SeriesId]

# Extract features in parallel. The pool is torn down in `finally` so worker
# processes are not leaked when a worker raises, and the original exception
# propagates instead of being printed and resurfacing later as an
# AttributeError on `X_test = None`.
pool = Pool(16)
try:
    X_test = pd.DataFrame(pool.map(calc_feats, files))
finally:
    pool.terminate()

# Same NaN sentinel as used for the training features.
X_test = X_test.fillna(-999)
# Template rows aligned positionally with the rows of `X_test`.
y_test = subm.copy()[:len(X_test)]
print(X_test.shape)

(500, 550)
CPU times: user 292 ms, sys: 104 ms, total: 396 ms
Wall time: 1min 19s


In [11]:
# Unscaled copy of the test features, so the scaling cell can rebind `X_test`
# and still be re-run.
x_test_copy = X_test.copy()

# KNN

In [14]:
# Apply the scaler that was fitted on the training features; it is not refitted here.
X_test = scaler.transform(x_test_copy)

In [15]:
# Despite the `_et` suffix, these are the class probabilities of the fold
# models stored in `knns`, one (n_rows, n_classes) array per model.
preds_et = [model.predict_proba(X_test) for model in knns]
# Average over the fold models and keep the column of the positive class.
mean_preds_et = (sum(preds_et) / len(preds_et))[:, 1]

In [18]:
# Store the averaged probabilities, rounded to three decimals, and write the submission.
subm['Attack'] = np.round(mean_preds_et, 3)
subm.to_csv("./submissions/knn_best.csv", index=False)

# knn with et xgb

In [45]:
# `Attack` column of three previously written submission files.
knn_preds, xgb_preds, et_preds = [
    pd.read_csv(subm_path)["Attack"]
    for subm_path in (
        "./submissions/knn_best.csv",
        "./submissions/xgb_best_4_knn.csv",
        "./submissions/et_window_250_lstm.csv",
    )
]

In [46]:
# First ten rows side by side: KNN, XGB, ET and their unweighted mean.
mean_of_three = (knn_preds + et_preds + xgb_preds) * 1. / 3
preview_rows = zip(knn_preds[:10], xgb_preds[:10], et_preds[:10], mean_of_three[:10])
pd.DataFrame(list(preview_rows))

Unnamed: 0,0,1,2,3
0,0.37,0.856,0.628,0.618
1,0.239,0.055,0.001,0.098333
2,0.247,0.007,0.003,0.085667
3,0.051,0.126,0.18,0.119
4,0.054,0.277,0.205,0.178667
5,0.263,0.783,0.81,0.618667
6,0.263,0.248,0.358,0.289667
7,0.179,0.031,0.002,0.070667
8,0.295,0.0,0.002,0.099
9,0.371,1.0,0.857,0.742667


In [33]:
# subm.Attack = (knn_preds + et_preds + xgb_preds) * 1./3
# subm.Attack = 1-subm.Attack.map(lambda x: np.round(x, 3))
# subm.to_csv("./submissions/knn_xgb_et_2.csv", index=False)


# RANKS

In [47]:
from scipy.stats import rankdata

In [48]:
# Replace each model's scores by their ranks so the blend ignores score scale.
knn_ranks, xgb_ranks, et_ranks = [
    rankdata(model_preds) for model_preds in (knn_preds, xgb_preds, et_preds)
]

In [49]:
# Sum of the three rank vectors scaled into (0, 1]: each rank is at most n, so
# the sum is divided by 3 * n. The divisor is derived from the data instead of
# the former literal 1500 (3 models x 500 rows), which was only right for
# exactly 500 test rows.
final_preds = (knn_ranks + xgb_ranks + et_ranks) / (3.0 * len(knn_ranks))

In [50]:
# Store the rank blend, rounded to three decimals, and write the final submission.
subm['Attack'] = np.round(final_preds, 3)
subm.to_csv("./submissions/knn_xgb_et_RANKS_FINAL_002.csv", index=False)