In [1]:
import numpy as np
# Toy sentiment corpus: ten short feedback phrases, five positive followed by
# five negative.
# NOTE(review): 'Excetllent!' looks like a misspelling of 'Excellent!'; the
# literal is kept verbatim.
documents = [
    # positive feedback
    'Well done!', 'Good work', 'Great effort', 'nice work', 'Excetllent!',
    # negative feedback
    'Weak', 'Poor effort!', 'not good', 'poor work', 'Could have done better.',
]

# Class labels aligned with `documents` by position: 1 = positive, 0 = negative.
labels = np.array([1] * 5 + [0] * 5)

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
# Build the vocabulary from the corpus. In the printed matrix below the word
# indices start at 1 and 0 is only used as the padding value.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(documents)
# +1 so that the padding index 0 also gets its own row in the embedding table.
vocab_size = len(tokenizer.word_index) + 1
# Turn every document into its list of word indices.
encodeDocuments = tokenizer.texts_to_sequences(documents)
# Pad every sequence to the length of the longest encoded document instead of
# a hard-coded 4, so that a longer document added to the corpus is not
# silently truncated by pad_sequences. For the current corpus this is still 4.
max_length = max(len(sequence) for sequence in encodeDocuments)
# padding='post' appends the zeros after the real tokens.
paddedDocuments = pad_sequences(encodeDocuments, maxlen=max_length, padding='post')
print(paddedDocuments)
# Training set: the first 8 documents (5 positive, 3 negative).
x_train = paddedDocuments[:8]
y_train = labels[:8]
# Test set: the last 2 documents.
# NOTE: both held-out documents are negative (label 0), so the test set
# contains a single class.
x_test = paddedDocuments[-2:]
y_test = labels[-2:]

[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]


In [4]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers.core import Flatten
from keras.layers.core import Dropout

In [5]:
# Assemble the network as an ordered stack of layers.
model = Sequential([
    # Trainable embedding: every token id in [0, vocab_size) is mapped to a
    # 100-dimensional vector; inputs are sequences of max_length ids.
    Embedding(vocab_size, 100, input_length=max_length, trainable=True),
    # 1-D convolution with 100 filters, each spanning 2 consecutive tokens and
    # moving 1 token at a time. padding="same" pads the sequence so the output
    # keeps the input length (see the summary: (None, 4, 100)).
    Conv1D(filters=100, kernel_size=2, strides=1, padding="same", activation='relu'),
    # Max-pool over windows of 2 positions, halving the sequence length.
    MaxPooling1D(pool_size=2),
    # Flatten to a single vector per document so it can feed the dense layer.
    Flatten(),
    # Single sigmoid unit for the binary output.
    Dense(1, activation='sigmoid'),
])
model.summary()



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            1500      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 4, 100)            20100     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2, 100)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total params: 21,801
Trainable params: 21,801
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Configure training: binary cross-entropy loss for the single sigmoid output,
# RMSprop as the optimizer, and accuracy reported alongside the loss.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [7]:
 # 训练
# Fit on the training split only. batch_size (10) is larger than the 8
# training documents, so each epoch processes them in one batch.
# verbose=2 prints one summary line per epoch.
train_history = model.fit(
    x=x_train,
    y=y_train,
    epochs=100,
    batch_size=10,
    verbose=2,
)

Epoch 1/100
 - 0s - loss: 0.6891 - acc: 0.6250
Epoch 2/100
 - 0s - loss: 0.6584 - acc: 0.6250
Epoch 3/100
 - 0s - loss: 0.6384 - acc: 0.6250
Epoch 4/100
 - 0s - loss: 0.6221 - acc: 0.6250
Epoch 5/100
 - 0s - loss: 0.6063 - acc: 0.6250
Epoch 6/100
 - 0s - loss: 0.5914 - acc: 0.6250
Epoch 7/100
 - 0s - loss: 0.5767 - acc: 0.7500
Epoch 8/100
 - 0s - loss: 0.5627 - acc: 0.7500
Epoch 9/100
 - 0s - loss: 0.5482 - acc: 0.7500
Epoch 10/100
 - 0s - loss: 0.5333 - acc: 0.7500
Epoch 11/100
 - 0s - loss: 0.5182 - acc: 0.7500
Epoch 12/100
 - 0s - loss: 0.5026 - acc: 0.7500
Epoch 13/100
 - 0s - loss: 0.4866 - acc: 0.8750
Epoch 14/100
 - 0s - loss: 0.4708 - acc: 1.0000
Epoch 15/100
 - 0s - loss: 0.4545 - acc: 1.0000
Epoch 16/100
 - 0s - loss: 0.4386 - acc: 1.0000
Epoch 17/100
 - 0s - loss: 0.4227 - acc: 1.0000
Epoch 18/100
 - 0s - loss: 0.4066 - acc: 1.0000
Epoch 19/100
 - 0s - loss: 0.3900 - acc: 1.0000
Epoch 20/100
 - 0s - loss: 0.3738 - acc: 1.0000
Epoch 21/100
 - 0s - loss: 0.3578 - acc: 1.0000
E

注意：由于数据量太少，这个示例在训练时没有单独划分验证集，下面的评估也是在全部 10 条文档（其中包含训练用的前 8 条）上进行的。如果只在未参与训练的样本上评估，则准确率为 0。这也说明利用卷积神经网络抽取特征，需要有足够的训练数据才可以。

In [8]:
# Evaluate on the full corpus. This figure includes the 8 documents the model
# was trained on, so it is not a measure of generalisation.
loss, accuracy = model.evaluate(paddedDocuments, labels, verbose=0)
print('Accuracy: %f' % (accuracy * 100))

# Also report accuracy on the 2 held-out documents, which the model never saw
# during training; x_test / y_test were otherwise created but never used.
test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=0)
print('Held-out accuracy: %f' % (test_accuracy * 100))

Accuracy: 80.000001
