In [78]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
os.environ['CUDA_VISIBLE_DEVICES'] = "2"

import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, TimeDistributed
from keras.layers.normalization import BatchNormalization
from tqdm import tqdm_notebook, tqdm
from IPython.display import clear_output, display
from keras import backend as K

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, MinMaxScaler
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

from scipy import stats
from glob import glob
from multiprocessing import Pool
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import StratifiedKFold
import xgboost

In [79]:
# Load the label table and mark every row with train == 1 in a single chained step.
data = pd.read_csv('train_labels.csv').assign(train=1)

In [80]:
# Read a single training series only to learn its (rows, columns) shape.
# NOTE(review): this read uses pandas' default header handling, while the bulk
# load of the same directory passes header=None -- confirm the row count here
# is the intended one.
_first_series = pd.read_csv('/mnt/kaspersky/data_kasp/train/0_train.csv')
Xshape = _first_series.shape
del _first_series  # only the shape is kept; release the frame itself
Xshape

(95999, 56)

In [68]:
# Window geometry: sequences per batch, time steps per sequence, and how many
# steps into the future the target window is shifted.
batch_size = 64
max_len = 256
steps_ahead = 10

# Sequence-to-sequence regressor. Input and output both carry Xshape[1] - 1
# values per time step; the network emits one prediction for every time step
# (return_sequences=True followed by per-step dense layers).
model = Sequential([
    LSTM(
        16,
        batch_input_shape=(batch_size, max_len, Xshape[1] - 1),
        return_sequences=True,
    ),
    TimeDistributed(Dense(16, activation='relu')),
    TimeDistributed(Dense(Xshape[1] - 1, activation='linear')),
])
model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mse'])

In [12]:
# Full path of every series named in the label table, in label-table order.
files = ['/mnt/kaspersky/data_kasp/train/' + series_id for series_id in data.SeriesId]
# Parse every series up front (read without a header row) so that batches can
# later be cut from memory instead of from disk.
files_read = [pd.read_csv(path, header=None) for path in files]

In [67]:
def batch_generator(kind, max_len=max_len, batch_size=batch_size, steps_ahead=steps_ahead):
    """Yield (X, y) batches of min-max scaled windows cut from one random series.

    One series is drawn at random from the requested fold of ``files_read``
    (indices ``itr`` for 'train', ``ite`` for 'test'); its column 0 is dropped
    and the remaining columns are scaled with a ``MinMaxScaler`` fitted on that
    whole series. ``batch_size`` random start offsets are drawn once; every
    subsequent batch advances all offsets by ``max_len`` (wrapping around), so
    row k of consecutive batches is a continuation of the same stream.

    Parameters
    ----------
    kind : str
        'train' or 'test' -- selects which fold the series is drawn from.
    max_len : int
        Number of time steps in every window.
    batch_size : int
        Number of windows per batch.
    steps_ahead : int
        Offset (>= 1) between an input window and its target window.

    Yields
    ------
    (X, y) : tuple of np.ndarray
        Two arrays stacked along a new first axis from the per-offset windows;
        ``y`` is the same window as ``X`` shifted ``steps_ahead`` rows forward.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'train' nor 'test'.
    """
    assert steps_ahead >= 1

    if kind == 'train':
        fold_ids = itr
    elif kind == 'test':
        fold_ids = ite
    else:
        # The original `else: ValueError` only evaluated the name and fell through.
        raise ValueError("kind must be 'train' or 'test', got %r" % (kind,))

    data = files_read[fold_ids[np.random.randint(0, len(fold_ids))]].loc[:, 1:]

    # MinMaxScaler rescales each column independently of the rows passed in, so
    # transforming the whole series once and slicing afterwards gives the same
    # numbers as transforming every window separately.
    scaled = MinMaxScaler().fit_transform(data.values)

    # Number of admissible window start offsets (also the wrap-around modulus).
    n_starts = len(scaled) - max_len - steps_ahead - 1

    # At most 100 batches are drawn from a single series.
    n_batches = min(len(scaled) // (batch_size * max_len), 100)
    if n_batches <= 0:
        return

    ids = np.arange(n_starts)
    np.random.shuffle(ids)
    ids = ids[:batch_size]

    for _ in range(n_batches):
        X = np.stack([scaled[start:start + max_len, :] for start in ids])
        y = np.stack([scaled[start + steps_ahead:start + max_len + steps_ahead, :]
                      for start in ids])

        # Slide every stream forward by one window for the next batch.
        ids = (ids + max_len) % n_starts

        yield X, y

    # Falling off the end finishes the generator. An explicit
    # `raise StopIteration` here is converted to RuntimeError by PEP 479.

In [21]:
# Path of every series named in the label table, in label-table order.
files = ['/mnt/kaspersky/data_kasp/train/' + series_id for series_id in data.SeriesId]

y = data
skf = StratifiedKFold(5, shuffle=True, random_state=0)

# Only the first of the five folds is used: `itr` receives its training
# indices and `ite` its held-out indices, stratified on the Attack column.
train_rows = y[y['train'] == 1]
itr, ite = next(skf.split(train_rows['SeriesId'].values, train_rows['Attack'].values))

In [32]:
# Series paths belonging to each side of the split selected above.
files_train = [files[idx] for idx in itr]
files_test = [files[idx] for idx in ite]

In [71]:
# loss-curve plotter
def plot_losses(d, alpha=0.1):
    """Replace the cell output with a log-scale plot of smoothed loss curves.

    Parameters
    ----------
    d : iterable of (loss, name) pairs
        ``loss`` is turned into a DataFrame and its column 0 is plotted;
        ``name`` becomes the legend label of that curve.
    alpha : float
        Smoothing factor handed to the exponentially weighted mean.
    """
    clear_output()
    plt.figure(figsize=(15, 6))

    for history, label in d:
        first_column = pd.DataFrame(history)[0]
        smoothed = first_column.ewm(alpha=alpha).mean()
        smoothed.plot(label=label)

    plt.yscale('log')
    plt.legend()
    plt.show()

In [None]:
# Training driver: every epoch draws one fresh train series and one fresh test
# series; every 10th training batch is logged together with one test batch and
# the smoothed curves are redrawn.
nb_epoch = 500
i = 0  # running count of training batches across all epochs
delimeters = []  # one (len(train_losses), len(test_losses)) mark per finished epoch
train_losses = []
test_losses, test_losses2 = [], []
for n in range(len(delimeters), nb_epoch):
    test_gen = batch_generator('test', max_len, batch_size)
    train_gen = batch_generator('train', max_len, batch_size)

    for batch_train, batch_y in train_gen:
        i += 1
        loss = model.train_on_batch(batch_train, batch_y)
        if i % 10 != 0:
            continue

        # Pull the next held-out batch. If the current test series is used up,
        # switch to a new one rather than letting StopIteration abort training.
        test_batch = next(test_gen, None)
        if test_batch is None:
            test_gen = batch_generator('test', max_len, batch_size)
            test_batch = next(test_gen, None)
        if test_batch is None:
            # The new test series produced no batch either; skip this log point
            # so both histories stay the same length.
            continue

        batch_test, batch_test_y = test_batch
        test_loss = model.test_on_batch(batch_test, batch_test_y)

        # Record both values together so the two curves stay aligned.
        train_losses.append(loss)
        test_losses.append(test_loss)

        plot_losses([(train_losses, 'train'), (test_losses, 'test')], 0.03)

    delimeters.append((len(train_losses), len(test_losses)))

In [40]:
batch_train.shape, batch_y.shape

((64, 256, 55), (64, 256, 55))