In [73]:
MAX_SEQUENCE_LENGTH = 200 # upper bound on sequence length: each query is padded/truncated to 200 tokens (characters, since the tokenizer is char-level)
EMBEDDING_DIM = 100 # size of each embedding vector (100-d)

In [74]:
# Load the two query corpora, one query per line, dropping only the line
# terminator so any other whitespace inside a query is preserved.
# The files are opened with context managers so the handles are closed as soon
# as reading finishes instead of being left to the garbage collector.
with open('data/goodqueries.txt') as good_file:
    good = [line.strip('\n') for line in good_file]
with open('data/badqueries.txt') as bad_file:
    bad = [line.strip('\n') for line in bad_file]
print('good len:', len(good))
print('bad len:', len(bad))

good len: 1294531
bad len: 48126


In [75]:
# Assemble the raw samples and their labels: good queries are labelled 0,
# bad queries are labelled 1.
data = []
labels = []

length = len(bad)
scale = 3
# Only take part of the good data: at most `scale` good queries per bad query.
good_subset = good[:length * scale]
data.extend(good_subset)
data.extend(bad)
# Count the 0-labels from the slice actually taken; the slice may be shorter
# than length * scale when `good` has fewer entries, and a hard-coded count
# would then silently misalign samples and labels.
labels.extend([0] * len(good_subset))
labels.extend([1] * length)
assert len(data) == len(labels), 'samples and labels must stay aligned'
print('data:', len(data))
print(data[0], data[-1])

data: 192504
/103886/ <svg onload=location='//p0.al'>


In [76]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Keep a handle on the raw query strings: `data` is rebound to the padded
# integer matrix at the end of this cell.
# NOTE(review): because `data` is overwritten, re-running this cell without
# re-running the previous one would feed the padded matrix to the tokenizer.
texts = data

# Character-level tokenizer: the vocabulary is built from the individual
# characters that occur in the queries.
tokenizer = Tokenizer(char_level = True)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# Encode every query as a sequence of character indices, then pad/truncate
# each sequence to MAX_SEQUENCE_LENGTH.
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(maxlen = MAX_SEQUENCE_LENGTH, sequences = sequences)

In [77]:
# Vocabulary size, then the shape of the padded input matrix, one per line.
print(len(word_index), data.shape, sep = '\n')

147
(192504, 200)


In [78]:
# Feature matrix (padded character indices) and label vector.
X = data
y = np.array(labels)

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing.
# random_state pins the shuffle so the split (and every number reported from it)
# is reproducible across kernel restarts; stratify keeps the good/bad ratio
# identical in both partitions, which matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y)

In [79]:
# Display the shapes of the training and test feature matrices as the cell output.
X_train.shape, X_test.shape

((154003, 200), (38501, 200))

In [80]:
from keras.models import Sequential
from keras.layers import Activation, BatchNormalization
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import Bidirectional

In [81]:
QA_EMBED_SIZE = 64 # number of units of the LSTM wrapped by the Bidirectional layer
DROPOUT_RATE = 0.3 # used for both the input dropout and the recurrent dropout of the LSTM

In [82]:
# Character-level bidirectional LSTM binary classifier, declared as a single
# ordered list of layers.
# NOTE(review): the second BatchNormalization sits between the 1-unit output
# Dense and the sigmoid, so the output logit is batch-normalised — confirm this
# is intended.
model = Sequential([
    # Vocabulary size is len(word_index) + 1: index 0 is left free for the
    # padding value written by pad_sequences.
    Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length = MAX_SEQUENCE_LENGTH),
    # return_sequences = False: one vector per query is passed on.
    Bidirectional(LSTM(QA_EMBED_SIZE,
                       return_sequences = False,
                       dropout = DROPOUT_RATE,
                       recurrent_dropout = DROPOUT_RATE)),
    # Hidden block: Dense -> BatchNorm -> ReLU.
    Dense(64),
    BatchNormalization(),
    Activation('relu'),
    # Output block: a single unit squashed to (0, 1) by the sigmoid.
    Dense(1),
    BatchNormalization(),
    Activation("sigmoid"),
])

In [83]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 200, 100)          14800     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               84480     
_________________________________________________________________
dense_5 (Dense)              (None, 64)                8256      
_________________________________________________________________
batch_normalization_5 (Batch (None, 64)                256       
_________________________________________________________________
activation_5 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 65        
_________________________________________________________________
batch_normalization_6 (Batch (None, 1)                

In [84]:
from keras.utils import plot_model
from IPython import display

In [86]:
#plot_model(model, to_file = 'blstm.png', show_shapes = True)

In [94]:
from keras import backend as K

def precision(y_true, y_pred):
    """Precision of the rounded predictions against the true labels.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are
    clipped to [0, 1] and rounded, so a prediction counts as positive once it
    rounds to 1.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        true positives / (predicted positives + K.epsilon()); the epsilon
        keeps the ratio finite when nothing is predicted positive.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def recall(y_true, y_pred):
    """Recall of the rounded predictions against the true labels.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are
    clipped to [0, 1] and rounded before being summed.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        true positives / (actual positives + K.epsilon()); the epsilon keeps
        the ratio finite when there are no positive labels.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def f1(y_true, y_pred):
    """F1 score: harmonic mean of ``precision`` and ``recall``.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        2 * precision * recall / (precision + recall + K.epsilon()).
    """
    precision_score = precision(y_true, y_pred)
    recall_score = recall(y_true, y_pred)
    # K.epsilon() guards the denominator the same way precision() and recall()
    # guard theirs: without it, zero precision and zero recall give 0 / 0.
    return 2 * ((precision_score * recall_score) /
                (precision_score + recall_score + K.epsilon()))

In [95]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard

In [96]:
# Training callbacks.
# NOTE(review): patience = 10 is larger than the epochs = 3 passed to
# model.fit, so early stopping cannot trigger with the current settings.
early_stopping = EarlyStopping(patience = 10, monitor = 'val_loss')
# Save weights only (not the full model), and only when a new best is reached.
model_checkpoint = ModelCheckpoint(
    'model-blstm.h5',
    save_weights_only = True,
    save_best_only = True,
)
# TensorBoard logs are written to the 'tflog-blstm' directory.
tensor_board = TensorBoard(
    'tflog-blstm',
    write_images = True,
    write_graph = True,
)

In [97]:
# Binary cross-entropy loss with the Adam optimizer; the custom precision /
# recall / f1 functions are reported alongside the loss during training.
model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = [precision, recall, f1],
)

In [None]:
# Train for 3 epochs in mini-batches of 64, holding out 30% of the training
# set for validation; the callbacks handle early stopping, checkpointing and
# TensorBoard logging.
model.fit(
    X_train, y_train,
    batch_size = 64,
    epochs = 3,
    shuffle = True,
    validation_split = 0.3,
    callbacks = [early_stopping, model_checkpoint, tensor_board],
)

Train on 107802 samples, validate on 46201 samples


Epoch 1/3
  1792/107802 [..............................] - ETA: 46:10 - loss: 0.5441 - precision: 0.5306 - recall: 0.8912 - f1: 0.6556