# 使用GloVe预训练词向量模型的情感分类

In [1]:
import numpy as np
# Toy corpus: ten short feedback phrases, five positive followed by five negative.
documents = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',  # spelled correctly so the token can be matched against the GloVe vocabulary
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]

# Class label for each document, in the same order (1 for the first five, 0 for the last five).
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

# Guard against the two parallel collections drifting out of sync when edited.
assert len(documents) == len(labels)

## texts_to_sequences索引单词

由于本例使用的是预训练词向量模型作为词嵌入层参数矩阵，因此使用Tokenizer实例来索引单词，以便后边查找词嵌入模型得到词嵌入向量，并构建嵌入层矩阵。

In [2]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [3]:
# Create the tokenizer and let it build its word index from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(documents)

# Replace every document by the list of integer indices of its words.
encodeDocuments = tokenizer.texts_to_sequences(documents)

# Vocabulary size is the number of indexed words plus one extra slot
# (index 0 is the value used for padding in the padded output further down).
vocab_size = 1 + len(tokenizer.word_index)

# Show the encoded corpus.
print(encodeDocuments)

[[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]


## 固定句子长度

In [4]:
from keras.preprocessing.sequence import pad_sequences

In [5]:
# Use the length of the longest encoded document as the fixed sequence length
# (4 for the corpus above). Deriving it from the data means a longer document
# added later is not silently truncated by pad_sequences.
max_length = max(len(sequence) for sequence in encodeDocuments)
# Right-pad every sequence with zeros up to max_length.
paddedDocuments = pad_sequences(encodeDocuments, maxlen=max_length, padding='post')
print(paddedDocuments)

[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]


## 加载GloVe模型

In [6]:
# Load the pre-trained GloVe vectors into an in-memory dict: word -> float32 vector.
inMemoryGlove = dict()
# The context manager closes the file even if parsing a line fails, and the
# explicit encoding keeps the read independent of the platform's default locale
# (the GloVe text files are distributed as UTF-8).
with open('../data/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line holds a word followed by its whitespace-separated coefficients.
        values = line.split()
        word = values[0]
        coefficients = np.asarray(values[1:], dtype='float32')
        inMemoryGlove[word] = coefficients
# Number of word vectors loaded.
print(len(inMemoryGlove))

400000


## 查询GloVe模型构建嵌入层参数矩阵

In [7]:
# Embedding weight matrix: one 100-dimensional row per vocabulary index,
# initialised to zeros.
trainingToEmbeddings = np.zeros((vocab_size, 100))
# Copy the GloVe vector of every indexed word into its row; words that are
# missing from the GloVe dict keep their all-zero row.
for word, i in tokenizer.word_index.items():
    if word in inMemoryGlove:
        trainingToEmbeddings[i] = inMemoryGlove[word]

## 构造网络模型并编译

In [8]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

In [9]:
# Embedding layer initialised from the GloVe-based matrix and kept frozen.
embedding_layer = Embedding(
    vocab_size,
    100,  # output dimension matches the 100-d GloVe vectors that were loaded
    weights=[trainingToEmbeddings],  # start from the prepared embedding matrix
    input_length=max_length,
    trainable=False,  # do not update the embedding weights during training
)
# Stack: embedding -> flatten -> single sigmoid unit for the binary label.
model = Sequential([
    embedding_layer,
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            1500      
_________________________________________________________________
flatten_1 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 401       
Total params: 1,901
Trainable params: 401
Non-trainable params: 1,500
_________________________________________________________________


## 训练

In [10]:
# Train for 100 epochs, holding out 20% of the samples for validation
# (the captured log reports 8 training and 2 validation samples).
model.fit(x=paddedDocuments, y=labels, validation_split=0.2, epochs=100, verbose=1)

Train on 8 samples, validate on 2 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch

<keras.callbacks.History at 0x135115c18>

## 评估

注意：训练时通过 `validation_split=0.2` 留出了最后 2 个样本作为验证集，但由于数据量太少，这里的评估仍然在全部 10 个样本（包含训练样本）上进行，因此得到的准确率不能代表模型的泛化能力。如果只在划分出的验证集上评估，则验证集的准确率为0。

In [11]:
# Evaluate on the full padded corpus (the same data used for fitting) without progress output.
loss, accuracy = model.evaluate(x=paddedDocuments, y=labels, verbose=0)
# Report the accuracy as a percentage.
accuracy_percent = accuracy * 100
print('Accuracy: %f' % accuracy_percent)

Accuracy: 89.999998
