In [1]:
import keras

Using TensorFlow backend.


利用Embedding层学习词嵌入

In [2]:
# Instantiate an Embedding layer
from keras.layers import Embedding

# An Embedding layer takes at least two arguments: the number of possible tokens
# (1000 here, i.e. the highest index plus one) and the embedding dimension (64 here).
embedding_layer = Embedding(input_dim=1000, output_dim=64)

In [4]:
# Load the IMDB data and prepare it for use with an Embedding layer
from keras.datasets import imdb
from keras import preprocessing

max_features = 10000    # number of words used as features
maxlen = 20             # cut each text off after `maxlen` words

# Load the data as lists of integers
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Turn the integer lists into 2D integer tensors of shape (samples, maxlen);
# train and test go through exactly the same padding call.
x_train, x_test = (
    preprocessing.sequence.pad_sequences(split, maxlen=maxlen)
    for split in (x_train, x_test)
)

In [5]:
# Use an Embedding layer followed by a classifier on the IMDB data
from keras.models import Sequential
from keras.layers import Flatten, Dense

model = Sequential()
# Give the Embedding layer its maximum input length so that its output can be
# flattened afterwards; the layer's activations have shape (samples, maxlen, 8).
# The vocabulary size comes from `max_features` (the value passed to
# imdb.load_data) rather than a repeated literal 10000, so the layer always
# covers every word index present in the data.
model.add(Embedding(max_features,8,input_length=maxlen))
model.add(Flatten())  # flatten the 3D embedding tensor into a 2D tensor of shape (samples, maxlen*8)
model.add(Dense(1,activation='sigmoid'))   # single sigmoid unit as the classifier
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
model.summary()

# Train for 10 epochs, holding out 20% of the training samples for validation
history = model.fit(x_train,y_train,epochs=10,batch_size=32,validation_split=0.2)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


使用预训练的词嵌入



从原始文本到词嵌入

下载IMDB数据的原始文本，并将其解压到当前工作目录下的 aclImdb 文件夹中（下面的代码会读取 aclImdb/train 下的 neg 和 pos 两个子目录）

In [9]:
# Read the raw IMDB training reviews and build parallel lists of texts and labels
import os
imdb_dir = 'aclImdb'
train_dir = os.path.join(imdb_dir,'train')

labels = []   # 0 for a review read from 'neg', 1 for a review read from 'pos'
texts = []    # raw review text, in the same order as `labels`

for label_type in ['neg','pos']:
    dir_name = os.path.join(train_dir,label_type)
    for fname in os.listdir(dir_name):
        # Only the .txt files in each directory are treated as reviews
        if fname.endswith('.txt'):
            # `with` closes the file even if reading or decoding raises,
            # so a bad file cannot leak an open handle.
            with open(os.path.join(dir_name,fname),encoding='utf-8') as f:
                texts.append(f.read())
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)