In [1]:
%config Completer.use_jedi = True

In [2]:
from keras.datasets import imdb
from keras import models
from keras import layers
from keras import optimizers
from keras import losses
from keras import metrics
import numpy as np

In [3]:
# 将训练数据解码为字符串
def decode_review(index):
    word_index = imdb.get_word_index() # e.g "hello" : 123
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) # e.g 123 : "hello"
    decode_review = " ".join([reverse_word_index.get(i - 3, '?') for i in train_data[index]])
    print(decode_review)

In [4]:
# 将整数序列训练数据编码为二进制矩阵
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        # e.g results[5, [4, 5, 6]] = 1
        # results[5][4] and results[5][5] and results[5][6] are set as 1
        # other elements remain 0
        results[i, sequence] = 1.
    return results

In [5]:
# data: 二维列表 每行是一条评论 行的内容是单词编号组成的列表
# label: 一维列表 元素由0和1构成 表示对应的评论情感为积极还是消极
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
# train (n, 10000): 把原本每一行编码为10000维向量 每个维度的值为0或1 表示编号为该维度的单词是否存
# label [1. 0. 1. 0. ......]: 由0和1构成的numpy数组 表示该索引的评论情感 
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
y_train = np.asarray(train_labels).astype("float32") 
y_test = np.asarray(test_labels).astype("float32")

In [6]:
# 构建神经网络
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, activation='relu')) # input shape 自动兼容上层的 output
model.add(layers.Dense(1, activation='sigmoid')) # 最后一层只有一个神经元 输出一个标量


In [7]:
# 编译模型 配置优化器 损失函数 评估模型的指标
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

In [8]:
# 留出验证集
x_val = x_train[:10000]
partial_x_train = x_train[10000:]
y_val = y_train[:10000]
partial_y_train = y_train[10000:]

In [9]:
# 训练模型
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [10]:
# 显示训练数据
history_dict = history.history
print("Val acc:", history_dict['val_acc'])
print("Acc:", history_dict['acc'])
print("Val loss:", history_dict['val_loss'])
print("Loss:", history_dict['loss'])

Val acc: [0.8568000197410583, 0.8780999779701233, 0.8878999948501587, 0.8895000219345093, 0.8881000280380249, 0.8880000114440918, 0.8827000260353088, 0.8834999799728394, 0.8812999725341797, 0.8788999915122986, 0.8776999711990356, 0.8795999884605408, 0.8765000104904175, 0.8772000074386597, 0.875, 0.8752999901771545, 0.8738999962806702, 0.871999979019165, 0.8708000183105469, 0.8702999949455261]
Acc: [0.7152000069618225, 0.8881999850273132, 0.9143333435058594, 0.9295333623886108, 0.9397333264350891, 0.9522666931152344, 0.9580666422843933, 0.9630666375160217, 0.9708666801452637, 0.9765999913215637, 0.9796666502952576, 0.9846000075340271, 0.9865333437919617, 0.9905999898910522, 0.9919333457946777, 0.9940666556358337, 0.9946666955947876, 0.9959333539009094, 0.9976666569709778, 0.9979333281517029]
Val loss: [0.4653315842151642, 0.3489483892917633, 0.30017557740211487, 0.2830134630203247, 0.2770485281944275, 0.27936699986457825, 0.2957247793674469, 0.2942790985107422, 0.31204092502593994, 0.32

In [11]:
# 绘制训练损失和验证损失 省略
# 训练数据中发现模型过拟合

In [13]:
# 在第三轮之后停止训练来避免过拟合（其他方法在后面介绍）
model.fit(x_train, y_train, epochs=4, batch_size=512)
results = model.evaluate(x_test, y_test)
print("Stopped at 4th epochs.")
print("Loss:", results[0])
print("Accuracy:", results[1])

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Stopped at 4th epochs.
Loss: 0.4927731156349182
Accuracy: 0.8611199855804443
