In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
def load_origin_data():
    """Read the two raw URL lists from the ``data`` directory.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(black_url, white_url)``: the lines of ``data/anomal_url.txt`` and
        ``data/normal_traing_url.txt`` respectively, each with leading and
        trailing whitespace removed. Blank lines are kept as empty strings.
    """
    def read_stripped(path):
        # One entry per line of the file, in file order.
        with open(path) as handle:
            return [line.strip() for line in handle]

    anomalous = read_stripped('data/anomal_url.txt')
    normal = read_stripped('data/normal_traing_url.txt')
    return anomalous, normal

In [3]:
# Read both URL lists from disk (the file paths are hard-coded in load_origin_data).
black_url, white_url = load_origin_data()

In [4]:
# Number of entries in each list, in (black, white) order.
len(black_url), len(white_url)

(25065, 36000)

In [5]:
# Inspect one entry of the list read from data/anomal_url.txt.
black_url[1]

'http://localhost:8080/tienda1/publico/anadir.jsp'

In [6]:
# Inspect one entry of the list read from data/normal_traing_url.txt.
white_url[1]

'http://localhost:8080/tienda1/publico/anadir.jsp?id=3&nombre=Vino+Rioja&precio=100&cantidad=55&B1=A%F1adir+al+carrito'

In [10]:
def load_data(black_url, white_url, maxlen = 200, random_state = None):
    """Encode URLs character by character and split them into train/test sets.

    Parameters
    ----------
    black_url : list of str
        URLs that receive label 1.
    white_url : list of str
        URLs that receive label 0.
    maxlen : int, optional
        Sequence length passed to ``pad_sequences``. Defaults to 200, the
        value that used to be hard-coded; it must match the input length
        the model is built with.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps
        the original unseeded behaviour; pass an integer to get the same
        split on every run.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test, word_index]`` where
        ``word_index`` is the fitted tokenizer's character-to-index mapping.
    """
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from sklearn.model_selection import train_test_split

    # The vocabulary is fitted on every URL from both classes.
    texts = black_url + white_url
    tokenizer = Tokenizer(char_level = True)
    tokenizer.fit_on_texts(texts)
    word_index = tokenizer.word_index

    # Samples are encoded white-first, so the labels below are all the 0s
    # followed by all the 1s to stay aligned with the rows of X.
    X = tokenizer.texts_to_sequences(white_url + black_url)
    X = pad_sequences(X, maxlen = maxlen)
    y = np.array([0] * len(white_url) + [1] * len(black_url))

    # train_test_split returns a list, so list concatenation appends word_index.
    return train_test_split(X, y, random_state = random_state) + [word_index]

In [11]:
# Encode the URLs and split them into train/test arrays; word_index is the
# tokenizer's character -> integer id mapping.
X_train, X_test, y_train, y_test, word_index = load_data(black_url, white_url)

In [12]:
# Shapes of the four arrays produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((45798, 200), (15267, 200), (45798,), (15267,))

In [13]:
# Number of entries in the tokenizer's character vocabulary.
len(word_index)

48

In [22]:
def build_model(word_num, embedding_dim = 100, input_length = 200):
    """Build and compile the BiLSTM + Conv1D binary classifier.

    Parameters
    ----------
    word_num : int
        Size of the embedding vocabulary (number of rows in the embedding
        matrix).
    embedding_dim : int, optional
        Width of each embedding vector. Defaults to 100, the value that
        used to be hard-coded.
    input_length : int, optional
        Length of the input sequences. Defaults to 200, the value that used
        to be hard-coded; it must match the padded length of the data.

    Returns
    -------
    keras.models.Sequential
        Model compiled with binary cross-entropy, the Adam optimizer and
        the custom ``precision``, ``recall`` and ``f1`` metrics.
    """
    from keras.models import Sequential
    from keras.layers import Activation, BatchNormalization
    from keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten
    # Both classes are exported from keras.layers; the keras.layers.embeddings
    # and keras.layers.wrappers module paths are internal to old releases.
    from keras.layers import Embedding, Bidirectional
    from keras import backend as K

    model = Sequential()

    # Sequence feature extractor: embedding -> BiLSTM -> 1D convolution.
    model.add(Embedding(word_num, embedding_dim, input_length = input_length))
    model.add(Bidirectional(LSTM(64, return_sequences = True, dropout = 0.3, recurrent_dropout = 0.3)))
    # NOTE(review): Conv1D already applies relu, so the Activation('relu')
    # after BatchNormalization is a second ReLU. Kept unchanged so the
    # architecture (and any saved weights) stays the same.
    model.add(Conv1D(filters = 128, kernel_size = 3, padding = 'valid', activation = 'relu'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(MaxPooling1D(4))
    model.add(Flatten())

    # Classification head ending in a single sigmoid unit.
    model.add(Dense(64))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(BatchNormalization())
    model.add(Activation('sigmoid'))

    # The metric function names are kept as-is because they are the
    # functions handed to compile() below.
    def precision(y_true, y_pred):
        """Rounded true positives divided by rounded predicted positives."""
        true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        pred_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
        return true_pos / (pred_pos + K.epsilon())

    def recall(y_true, y_pred):
        """Rounded true positives divided by rounded actual positives."""
        true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        all_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
        return true_pos / (all_pos + K.epsilon())

    def f1(y_true, y_pred):
        """Harmonic mean of ``precision`` and ``recall``."""
        precision_score = precision(y_true, y_pred)
        recall_score = recall(y_true, y_pred)
        # The epsilon guards the 0 / 0 case: with no true positives both
        # scores are 0 and the unguarded division produced NaN.
        return 2 * (precision_score * recall_score) / (precision_score + recall_score + K.epsilon())

    model.compile(loss = 'binary_crossentropy', optimizer = 'adam',
                  metrics = [precision, recall, f1])
    return model

In [23]:
# Vocabulary size passed to the embedding is len(word_index) + 1.
# NOTE(review): presumably the extra slot is for index 0, which the padded
# sequences use as filler — confirm against the Tokenizer/pad_sequences docs.
model = build_model(len(word_index) + 1)

In [24]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 200, 100)          4900      
_________________________________________________________________
bidirectional_3 (Bidirection (None, 200, 128)          84480     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 198, 128)          49280     
_________________________________________________________________
batch_normalization_4 (Batch (None, 198, 128)          512       
_________________________________________________________________
activation_4 (Activation)    (None, 198, 128)          0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 49, 128)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 6272)             

In [25]:
from keras.utils import plot_model

In [29]:
#plot_model(model, to_file = 'blstm-cnn.png', show_shapes = True)

In [30]:
def train(model, X_train, y_train, epochs = 3, batch_size = 64, validation_split = 0.2, patience = 10):
    """Fit ``model`` with early stopping, checkpointing and TensorBoard logging.

    Parameters
    ----------
    model
        Compiled Keras model exposing ``fit``.
    X_train, y_train
        Training inputs and labels, forwarded unchanged to ``model.fit``.
    epochs : int, optional
        Number of training epochs (default 3).
    batch_size : int, optional
        Mini-batch size (default 64).
    validation_split : float, optional
        Fraction of the training data held out for validation (default 0.2).
    patience : int, optional
        ``EarlyStopping`` patience on ``val_loss`` (default 10).
        NOTE: with the defaults, patience exceeds ``epochs``, so early
        stopping can never trigger; raise ``epochs`` or lower ``patience``
        to make it effective.

    Returns
    -------
    The object returned by ``model.fit`` (the training history), which was
    previously discarded.

    Side effects
    ------------
    Passes ``'model-blstm-cnn.h5'`` to ``ModelCheckpoint`` (best weights
    only) and ``'tflog-blstm-cnn'`` to ``TensorBoard`` as output locations.
    """
    from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
    early_stopping = EarlyStopping(monitor = 'val_loss', patience = patience)
    model_checkpoint = ModelCheckpoint('model-blstm-cnn.h5', save_best_only = True, save_weights_only = True)
    tensor_board = TensorBoard('tflog-blstm-cnn', write_graph = True, write_images = True)

    return model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size,
                     validation_split = validation_split, shuffle = True,
                     callbacks = [early_stopping, model_checkpoint, tensor_board])

In [None]:
# Fit the model on the training split; train() holds out part of it for validation.
train(model, X_train, y_train)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 36638 samples, validate on 9160 samples
Epoch 1/3