# 训练深度神经网络

## Batch normalization批量标准化

In [19]:
from utils import (reset_tf_graph, show_tf_graph)
import tensorflow as tf
import numpy as np

In [20]:
# 加载mnist数据
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = X_train.astype(np.float32).reshape(-1, 28*28) / 255.0
X_test = X_test.astype(np.float32).reshape(-1, 28*28) / 255.0
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)
X_valid, X_train = X_train[:5000], X_train[5000:]
y_valid, y_train = y_train[:5000], y_train[5000:]

In [21]:
from functools import partial

reset_tf_graph()

n_inputs = 28*28 # minst里的图像分辨率是28*28
n_hidden1 = 300 # 第一个隐藏层的神经元数量
n_hidden2 = 100
n_outputs = 10 # 分类10个数字

batch_norm_momentum = 0.9 # 标准化的偏移量

learning_rate = 0.01

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
training = tf.placeholder_with_default(False, shape=(), name='training')

with tf.name_scope("dnn"):
    he_init = tf.variance_scaling_initializer() # 使用HE初始化权重

    my_batch_norm_layer = partial( # 类似做函数的柯里化curring, 填入部分函数值生成新的函数
            tf.layers.batch_normalization,
            training=training,
            momentum=batch_norm_momentum)

    my_dense_layer = partial(
            tf.layers.dense,
            kernel_initializer=he_init)

    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    bn1 = tf.nn.elu(my_batch_norm_layer(hidden1)) # 标准化后再执行激活函数ELU - 指数线性单元
    hidden2 = my_dense_layer(bn1, n_hidden2, name="hidden2")
    bn2 = tf.nn.elu(my_batch_norm_layer(hidden2))
    logits_before_bn = my_dense_layer(bn2, n_outputs, name="outputs")
    logits = my_batch_norm_layer(logits_before_bn)

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
init = tf.global_variables_initializer()
saver = tf.train.Saver()

Instructions for updating:
Use tf.cast instead.


In [22]:
n_epochs = 20
batch_size = 200

# 定义一个从训练集中随机挑选50个实例的方法
def batch_generator(X, y, size):
    rnd_idx = np.random.permutation(len(X)) # len(X)是矩阵X的第0维的长度, 生成0..len(X)的随机数
    n_batches = len(X) // batch_size # //是整除, mod
    for batch_idx in np.array_split(rnd_idx, n_batches):
        X_batch, y_batch = X[batch_idx], y[batch_idx]
        yield X_batch, y_batch # yield定义了生成器generator, 可以用next(generator)来调用, 也可以用for循环调用

extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in batch_generator(X_train, y_train, batch_size):
            sess.run([training_op, extra_update_ops],
                     feed_dict={training: True, X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./chpt11/my_model_final.ckpt")
    
file_writer = tf.summary.FileWriter("./chpt11/graph", tf.get_default_graph())

0 Validation accuracy: 0.8952
1 Validation accuracy: 0.9202
2 Validation accuracy: 0.9318
3 Validation accuracy: 0.9422
4 Validation accuracy: 0.9468
5 Validation accuracy: 0.954
6 Validation accuracy: 0.9568
7 Validation accuracy: 0.96
8 Validation accuracy: 0.962
9 Validation accuracy: 0.9638
10 Validation accuracy: 0.9662
11 Validation accuracy: 0.9682
12 Validation accuracy: 0.9672
13 Validation accuracy: 0.9696
14 Validation accuracy: 0.9706
15 Validation accuracy: 0.9704
16 Validation accuracy: 0.9718
17 Validation accuracy: 0.9726
18 Validation accuracy: 0.9738
19 Validation accuracy: 0.9742


In [23]:
[v.name for v in tf.global_variables()]

['hidden1/kernel:0',
 'hidden1/bias:0',
 'batch_normalization/gamma:0',
 'batch_normalization/beta:0',
 'batch_normalization/moving_mean:0',
 'batch_normalization/moving_variance:0',
 'hidden2/kernel:0',
 'hidden2/bias:0',
 'batch_normalization_1/gamma:0',
 'batch_normalization_1/beta:0',
 'batch_normalization_1/moving_mean:0',
 'batch_normalization_1/moving_variance:0',
 'outputs/kernel:0',
 'outputs/bias:0',
 'batch_normalization_2/gamma:0',
 'batch_normalization_2/beta:0',
 'batch_normalization_2/moving_mean:0',
 'batch_normalization_2/moving_variance:0']

## MAX Norm 最大范数约束正则化

In [39]:
from utils import (reset_tf_graph, show_tf_graph)
import tensorflow as tf
import numpy as np

# 加载mnist数据
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = X_train.astype(np.float32).reshape(-1, 28*28) / 255.0
X_test = X_test.astype(np.float32).reshape(-1, 28*28) / 255.0
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)
X_valid, X_train = X_train[:5000], X_train[5000:]
y_valid, y_train = y_train[:5000], y_train[5000:]

In [40]:
reset_tf_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10

learning_rate = 0.01
momentum = 0.9

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

def max_norm_regularizer(threshold, axes=1, name="max_norm",
                         collection="max_norm"):
    def max_norm(weights):
        clipped = tf.clip_by_norm(weights, clip_norm=threshold, axes=axes)
        clip_weights = tf.assign(weights, clipped, name=name)
        tf.add_to_collection(collection, clip_weights)
        return None # there is no regularization loss term
    return max_norm

max_norm_reg = max_norm_regularizer(threshold=1.0)

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1", kernel_regularizer=max_norm_reg)
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2", kernel_regularizer=max_norm_reg)
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
    training_op = optimizer.minimize(loss)    

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

In [41]:
# 定义权重裁剪操作

threshold = 1.0
weights = tf.get_default_graph().get_tensor_by_name("hidden1/kernel:0")
clipped_weights = tf.clip_by_norm(weights, clip_norm=threshold, axes=1)
clip_weights = tf.assign(weights, clipped_weights)

weights2 = tf.get_default_graph().get_tensor_by_name("hidden2/kernel:0")
clipped_weights2 = tf.clip_by_norm(weights2, clip_norm=threshold, axes=1)
clip_weights2 = tf.assign(weights2, clipped_weights2)

In [42]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 20
batch_size = 50

In [43]:
# 定义一个从训练集中随机挑选50个实例的方法
def batch_generator(X, y, size):
    rnd_idx = np.random.permutation(len(X)) # len(X)是矩阵X的第0维的长度, 生成0..len(X)的随机数
    n_batches = len(X) // batch_size # //是整除, mod
    for batch_idx in np.array_split(rnd_idx, n_batches):
        X_batch, y_batch = X[batch_idx], y[batch_idx]
        yield X_batch, y_batch # yield定义了生成器generator, 可以用next(generator)来调用, 也可以用for循环调用

with tf.Session() as sess:                                              
    init.run()                                                          
    for epoch in range(n_epochs):                                       
        for X_batch, y_batch in batch_generator(X_train, y_train, batch_size): 
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
            clip_weights.eval()
            clip_weights2.eval()                                        
        acc_valid = accuracy.eval(feed_dict={X: X_valid, y: y_valid})   
        print(epoch, "Validation accuracy:", acc_valid)                 

    save_path = saver.save(sess, "./chpt11/model/max_norm.ckpt")
    
file_writer = tf.summary.FileWriter("./chpt11/graph/max_norm", tf.get_default_graph())

0 Validation accuracy: 0.9562
1 Validation accuracy: 0.9702
2 Validation accuracy: 0.9728
3 Validation accuracy: 0.9754
4 Validation accuracy: 0.9748
5 Validation accuracy: 0.9776
6 Validation accuracy: 0.9798
7 Validation accuracy: 0.9814
8 Validation accuracy: 0.9816
9 Validation accuracy: 0.9818
10 Validation accuracy: 0.982
11 Validation accuracy: 0.9802
12 Validation accuracy: 0.9808
13 Validation accuracy: 0.9808
14 Validation accuracy: 0.982
15 Validation accuracy: 0.9814
16 Validation accuracy: 0.9814
17 Validation accuracy: 0.983
18 Validation accuracy: 0.982
19 Validation accuracy: 0.9822


## 习题

### 8.深度学习。

i.  建立一个 DNN,有五个隐藏层,每层 100 个神经元,使用 He 初始化和 ELU 激活函数。

ii.  使用 Adam 优化和提前停止,请尝试在 MNIST 上进行训练,但只能使用数字 0 到4,因为我们将在下一个练习中在数字 5 到 9 上进行迁移学习。 您需要一个包含五个神经元的 softmax 输出层,并且一如既往地确保定期保存检查点,并保存最终模型,以便稍后再使用它。

iii.  使用交叉验证调整超参数,并查看你能达到什么准确度。

iv.  现在尝试添加批量标准化并比较学习曲线:它是否比以前收敛得更快? 它是否会产生更好的模型? v.  模型是否过拟合训练集? 尝试将 dropout 添加到每一层,然后重试。 它有帮助吗?

In [74]:
from utils import (reset_tf_graph, show_tf_graph)
import tensorflow as tf
import numpy as np
from functools import partial

In [75]:
# 加载mnist数据
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = X_train.astype(np.float32).reshape(-1, 28*28) / 255.0
X_test = X_test.astype(np.float32).reshape(-1, 28*28) / 255.0
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)
X_valid, X_train = X_train[:5000], X_train[5000:]
y_valid, y_train = y_train[:5000], y_train[5000:]

In [76]:
# 取数据集中0~4的部分

idx_04 = y_train <= 4
X_train_04 = X_train[idx_04]
print(len(X_train_04))
y_train_04 = y_train[idx_04]
print(len(y_train_04))
X_valid_04, X_train_04 = X_train_04[:3000], X_train_04[3000:]
y_valid_04, y_train_04 = y_train_04[:3000], y_train_04[3000:]

28038
28038


#### 开始构建模型

In [77]:
n_inputs = 28*28
n_outputs = 5
n_hidden_neurons = 500
n_hidden_layers = 5

reset_tf_graph()

# 定义输入参数
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

# 定义层
he_init = tf.initializers.variance_scaling()
my_hidden_layer = partial(tf.layers.dense, 
                          activation=tf.nn.elu,
                          kernel_initializer=he_init)

def make_dnn(inputs):
    with tf.name_scope("dnn") as scope:
        for n in range(n_hidden_layers):
            inputs = tf.layers.dense(inputs, n_hidden_neurons, 
                                     activation=tf.nn.elu, kernel_initializer=he_init,
                                     name="hidden%d" % (n + 1))
        return inputs

dnn_outputs = make_dnn(X)
logits = tf.layers.dense(dnn_outputs, n_outputs, 
                         kernel_initializer=he_init,
                        name="logits")
y_probs = tf.nn.softmax(logits, name="y_probs")

# 定义交叉熵损失函数
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

# 定义训练操作
learning_rate = 0.01
with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

In [None]:
n_epochs = 1000
batch_size = 20
n_epochs_before_stop = 20

init = tf.global_variables_initializer()
saver = tf.train.Saver()

best_loss = np.infty

# 定义一个从训练集中随机挑选50个实例的方法
def batch_generator(X, y, size):
    rnd_idx = np.random.permutation(len(X)) # len(X)是矩阵X的第0维的长度, 生成0..len(X)的随机数
    n_batches = len(X) // batch_size # //是整除, mod
    for batch_idx in np.array_split(rnd_idx, n_batches):
        X_batch, y_batch = X[batch_idx], y[batch_idx]
        yield X_batch, y_batch # yield定义了生成器generator, 可以用next(generator)来调用, 也可以用for循环调用
        
with tf.Session() as sess:
    init.run()
    n_epochs_after_best = 0
    for epoch in range(n_epochs):
        for X_batch, y_batch in batch_generator(X_train_04, y_train_04, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        
        the_loss, acc_valid = sess.run([loss, accuracy], feed_dict={X: X_valid_04, y: y_valid_04})
        print(epoch, "loss: ", the_loss)
        print(epoch, "Validation accuracy:", acc_valid)
        
        if the_loss < best_loss:
            best_loss = the_loss
            n_epochs_after_best = 0
            save_path = saver.save(sess, "./chpt11/model/ex8_1.ckpt")
        else:
            n_epochs_after_best += 1
            if n_epochs_after_best > n_epochs_before_stop:
                print("Early stopping! ")
                break
        print("epoch: {}, validation loss: {:.6f}, best loss: {:.6f}, accuracy: {:.2f}%".format(
            epoch, the_loss, best_loss, acc_valid * 100))
    


0 loss:  2.3302152
0 Validation accuracy: 0.19533333
epoch: 0, validation loss: 2.330215, best loss: 2.330215, accuracy: 19.53%
1 loss:  2.3580432
1 Validation accuracy: 0.17666666
epoch: 1, validation loss: 2.358043, best loss: 2.330215, accuracy: 17.67%
2 loss:  1.7067422
2 Validation accuracy: 0.20833333
epoch: 2, validation loss: 1.706742, best loss: 1.706742, accuracy: 20.83%
3 loss:  1.9650798
3 Validation accuracy: 0.20133333
epoch: 3, validation loss: 1.965080, best loss: 1.706742, accuracy: 20.13%
4 loss:  2.0793557
4 Validation accuracy: 0.20133333
epoch: 4, validation loss: 2.079356, best loss: 1.706742, accuracy: 20.13%
