In [127]:
import tensorflow as tf
import keras
import datetime
import matplotlib
import itertools
matplotlib.use("Agg")  #这个设置可以使matplotlib保存.png图到磁盘
import matplotlib.pyplot as plt
from functools import partial
import numpy as np
import pandas as pd
import os
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


In [128]:
#jupyter 中开启该选项，否则不执行
%matplotlib inline 


配置超参数

In [129]:
batch_size=64
epochs=5
regularizer=1e-3
total_train_samples=60000
total_test_samples=10000
lr_decay_epochs=1
output_folder="./model_output"
#用来保存模型以及我们需要的所有东西
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
save_format="hdf5" #或saved_model
if save_format=="hdf5":
    save_path_models=os.path.join(output_folder,"hdf5_models")
    if not os.path.exists(save_path_models):
        os.makedirs(save_path_models)
    model_save_path=os.path.join(save_path_models,"ckpt_epoch{epoch:02d}_val_acc{val_accuracy:.2f}.hdf5")
    
elif save_format=="saved_model":
    save_path_models=os.path.join(output_folder,"saved_models")
    if not os.path.exists(save_path_models):
        os.makedirs(save_path_models)
    model_save_path=os.path.join(save_path_models,"ckpt_epoch{epoch:02d}_val_acc{val_accuracy:.2f}.ckpt")
#用来保存日志
log_dir= os.path.join(output_folder,'logs_{}'.format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S")))
if not os.path.exists(log_dir):
    os.makedirs(log_dir)


选择指定显卡及自动调用显存

In [130]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')#列出所有可见显卡
print("All the available GPUs:\n",physical_devices)
if physical_devices:
    gpu=physical_devices[0]#显示第一块显卡
    tf.config.experimental.set_memory_growth(gpu, True)#根据需要自动增长显存
    tf.config.experimental.set_visible_devices(gpu, 'GPU')#只选择第一块


All the available GPUs:
 []


准备数据

In [131]:
fashion_mnist=keras.datasets.fashion_mnist
(train_x,train_y),(test_x,test_y)=fashion_mnist.load_data()

train_x,test_x = train_x[...,np.newaxis]/255.0,test_x[...,np.newaxis]/255.0
total_train_sample = train_x.shape[0]
total_test_sample=test_x.shape[0]
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']


使用tf.data来准备训练集和测试集

In [132]:
train_ds = tf.data.Dataset.from_tensor_slices((train_x,train_y))
test_ds = tf.data.Dataset.from_tensor_slices((test_x,test_y))
 
train_ds=train_ds.shuffle(buffer_size=batch_size*10).batch(batch_size).prefetch(buffer_size = tf.data.experimental.AUTOTUNE).repeat()
test_ds = test_ds.batch(batch_size).prefetch(buffer_size = tf.data.experimental.AUTOTUNE)#不加repeat，执行一次就行


准备模型定义

In [133]:
l2 = keras.regularizers.l2(regularizer)#定义模型正则化方法
ini = keras.initializers.he_normal()#定义参数初始化方法
conv2d = partial(keras.layers.Conv2D,activation='relu',padding='same',kernel_regularizer=l2,bias_regularizer=l2)
fc = partial(keras.layers.Dense,activation='relu',kernel_regularizer=l2,bias_regularizer=l2)
maxpool=keras.layers.MaxPooling2D
dropout=keras.layers.Dropout



In [134]:
x_input = keras.layers.Input(shape=(28,28,1),name='input_node')
x = conv2d(128,(5,5))(x_input)
x = maxpool((2,2))(x)
x = conv2d(256,(5,5))(x)
x = maxpool((2,2))(x)
x = keras.layers.Flatten()(x)
x = fc(128)(x)
x_output=fc(10,activation=None,name='output_node')(x)
model = keras.models.Model(inputs=x_input,outputs=x_output)                



打印模型结构

In [135]:
print("The model architure:\n")
print(model.summary())


The model architure:



None


画出模型结构图并保存

In [136]:
keras.utils.plot_model(model,to_file=os.path.join(log_dir,'model.png'),show_shapes=True,show_layer_names=True)


You must install pydot (`pip install pydot`) for `plot_model` to work.


定义优化算法和损失函数

In [137]:
#学习率变化设置，使用指数衰减
train_steps_per_epoch=int(total_train_samples//batch_size)
initial_learning_rate=0.01
# lr_schedule = keras.optimizers.schedules.ExponentialDecay(initial_learning_rate,
#                                                              decay_steps=1*train_steps_per_epoch,
#                                                             decay_rate=0.96,
#                                                             staircase=True)#initial_learning_rate*0.96**(step/decay_steps)
#优化算法
optimizer = keras.optimizers.SGD(learning_rate=initial_learning_rate,momentum=0.95)
# optimizer = keras.optimizers.SGD(learning_rate=lr_schedule,momentum=0.95)
#损失函数
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
#评价指标
# metrics=[keras.metrics.SparseCategoricalAccuracy(),loss]
metrics=['accuracy','sparse_categorical_crossentropy']
#上式第二个参数会返回交叉熵的结果，用loss减去该值就会得到正则化的值（于model.losses和相等），这两种定义方式都可以，下边的会显示名称短一些




编译模型

In [138]:
model.compile(optimizer=optimizer,loss=loss,metrics=metrics)


# 定义callbacks
1、生成ckpt每个epoch训练完保存一个模型

In [139]:
#模型保存格式默认是saved_model,可以自己定义更改原有类来保存hdf5
ckpt = keras.callbacks.ModelCheckpoint(model_save_path.replace('.hdf5', '.keras'), monitor='val_accuracy', verbose=1, save_best_only=False, save_weights_only=False, save_freq='epoch', mode='auto')


monitor可选的内容与model.fit history的keys相同，如“loss",“val_accuracy”,“val_loss”,使用这前可以查看一下有那些；save_best_only的True或False作用显然；save_weights_only的True或False指是否只保存模型参数；verbose=1时会打印日志；另外save_path有两种格式，h5或saved model,如save_path='model.h5’将自动保存h5模型（无论save_weights_only是True还是False),save_path=‘model’,将自动保存savedmodel,这时save_weights_only是True或False情况下格式略有不同。

2、当模型验证集精度长时间不再提升时停止训练

In [140]:
#当模型训练不符合我们要求时停止训练，连续5个epoch验证集精度没有提高0.001%停
earlystop=keras.callbacks.EarlyStopping(monitor='val_accuracy',min_delta = 0.0001,patience=5,verbose=True)


In [141]:
#3、自定义学习率按需衰减，并把整个学习率变化过程保存
class LearningRateExponentialDecay:

    def __init__(self, initial_learning_rate, decay_epochs, decay_rate):
        self.initial_learning_rate = initial_learning_rate
        self.decay_epochs = decay_epochs
        self.decay_rate = decay_rate

    def __call__(self, epoch):
        dtype = type(self.initial_learning_rate)
        decay_epochs = np.array(self.decay_epochs).astype(dtype)
        decay_rate = np.array(self.decay_rate).astype(dtype)
        epoch = np.array(epoch).astype(dtype)
        p = epoch / decay_epochs
        lr = self.initial_learning_rate * np.power(decay_rate, p)
        return lr


lr_schedule = LearningRateExponentialDecay(initial_learning_rate, lr_decay_epochs, 0.96)
lr = keras.callbacks.LearningRateScheduler(lr_schedule, verbose=1)
#使用tensorboard
#定义当loss出现nan或inf时停止训练的callback
terminate = keras.callbacks.TerminateOnNaN()

#模型损失长时间不除时大程度降低学习率
# 这个策略通常不于学习率衰减schedule同时使用，或者使用时要合理
#降低学习率（要比学习率自动周期变化有更大变化和更长时间监控）
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                              factor=0.1,
                                              patience=3,
                                              verbose=1,
                                              min_delta=0.0001,
                                              min_lr=0)
# 模型损失长时间不除时大程度降低学习率
# 这个策略通常不于学习率衰减schedule同时使用，或者使用时要合理

#降低学习率（要比学习率自动周期变化有更大变化和更长时间监控）
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                              factor=0.1,
                                              patience=3,
                                              verbose=1,
                                              min_delta=0.0001,
                                              min_lr=0)
#保存训练过程中大数标量指标，与tensorboard同一个文件
csv_logger = keras.callbacks.CSVLogger(os.path.join(log_dir, 'logs.log'), separator=',')

#还要加入tensorboard的使用,这种方法记录的内容有限
tensorboard = keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,  #对参数和激活做直方图，一定要有测试集
    write_graph=True,  #模型结构图
    write_images=True,  #把模型参数做为图片形式存到
    update_freq='epoch',  #epoch,batch,整数，太频的话会减慢速度
    profile_batch=2,  #记录模型性能
    embeddings_freq=1,
    embeddings_metadata=None  #这个还不太清楚如何使用
)
#各个参数的作用请参看文档，需要正确使用


In [142]:
import itertools
import os
import io
file_writer_cm = tf.summary.create_file_writer(log_dir, filename_suffix='cm')


def plot_to_image(figure, logd_ir, epoch):
    """Converts the matplotlib plot specified by 'figure' to a PNG image and
    returns it. The supplied figure is closed and inaccessible after this call."""
    # Save the plot to a PNG in memory.
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    fig = figure
    fig.savefig(os.path.join(log_dir, 'during_train_confusion_epoch_{}.png'.format(epoch)))
    # Closing the figure prevents it from being displayed directly inside
    # the notebook.
    plt.close(figure)
    buf.seek(0)
    # Convert PNG buffer to TF image
    image = tf.image.decode_png(buf.getvalue(), channels=4)
    # Add the batch dimension
    image = tf.expand_dims(image, 0)
    return image


def plot_confusion_matrix(cm, class_names):
    """
    Returns a matplotlib figure containing the plotted confusion matrix.

    Args:
    cm (array, shape = [n, n]): a confusion matrix of integer classes
    class_names (array, shape = [n]): String names of the integer classes
    """
    figure = plt.figure(figsize=(8, 8))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title("Confusion matrix")
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)

    # Normalize the confusion matrix.
    cm = np.around(cm.astype('float') / cm.sum(axis=1)[:, np.newaxis], decimals=2)

    # Use white text if squares are dark; otherwise black.
    threshold = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        color = "white" if cm[i, j] > threshold[i] else "black"
        plt.text(j, i, cm[i, j], horizontalalignment="center", color=color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return figure


def log_confusion_matrix(epoch, logs):
    # Use the model to predict the values from the validation dataset.
    test_pred_raw = model.predict(test_x)
    test_pred = np.argmax(test_pred_raw, axis=1)

    # Calculate the confusion matrix.
    cm = confusion_matrix(test_y, test_pred)
    # Log the confusion matrix as an image summary.
    figure = plot_confusion_matrix(cm, class_names=class_names)
    cm_image = plot_to_image(figure, log_dir, epoch)

    # Log the confusion matrix as an image summary.
    with file_writer_cm.as_default():
        tf.summary.image("Confusion Matrix", cm_image, step=epoch)


# Define the per-epoch callback.
cm_callback = keras.callbacks.LambdaCallback(on_epoch_end=log_confusion_matrix)


In [144]:
callbacks = [ckpt, earlystop, lr, tensorboard, terminate, reduce_lr, csv_logger]  #, cm_callback
# H=model.fit(train_ds,epochs=epochs,steps_per_epoch=len(train_x)//batch_size,validation_data=test_ds)
train_steps_per_epoch = np.floor(total_train_samples / batch_size).astype(np.int32)
test_steps_per_epoch = np.ceil(total_test_sample / batch_size).astype(np.int32)
H = model.fit(train_ds,
              epochs=epochs,
              steps_per_epoch=train_steps_per_epoch,
              validation_data=test_ds,
              validation_steps=test_steps_per_epoch,
              callbacks=callbacks,
              verbose=1)
#使用tf.data无法获取训练和测试总数，所以指定训练步数合使训练稳定



Epoch 1: LearningRateScheduler setting learning rate to 0.01.
Epoch 1/5


[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 269ms/step - accuracy: 0.8754 - loss: 0.5998 - sparse_categorical_crossentropy: 1.2982
Epoch 1: saving model to ./model_output\hdf5_models\ckpt_epoch01_val_acc0.88.keras
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 284ms/step - accuracy: 0.8754 - loss: 0.5998 - sparse_categorical_crossentropy: 1.2981 - val_accuracy: 0.8815 - val_loss: 0.5146 - val_sparse_categorical_crossentropy: 1.2672 - learning_rate: 0.0100

Epoch 2: LearningRateScheduler setting learning rate to 0.0096.
Epoch 2/5
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 256ms/step - accuracy: 0.8915 - loss: 0.4708 - sparse_categorical_crossentropy: 1.2244
Epoch 2: saving model to ./model_output\hdf5_models\ckpt_epoch02_val_acc0.00.keras


  self.gen.throw(typ, value, traceback)


[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 257ms/step - accuracy: 0.8915 - loss: 0.4708 - sparse_categorical_crossentropy: 1.2244 - val_accuracy: 0.0000e+00 - val_loss: 0.0000e+00 - val_sparse_categorical_crossentropy: 0.0000e+00 - learning_rate: 0.0096

Epoch 3: LearningRateScheduler setting learning rate to 0.009216.
Epoch 3/5
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 255ms/step - accuracy: 0.9004 - loss: 0.4109 - sparse_categorical_crossentropy: 1.1877
Epoch 3: saving model to ./model_output\hdf5_models\ckpt_epoch03_val_acc0.89.keras
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 270ms/step - accuracy: 0.9004 - loss: 0.4109 - sparse_categorical_crossentropy: 1.1877 - val_accuracy: 0.8939 - val_loss: 0.4308 - val_sparse_categorical_crossentropy: 1.1488 - learning_rate: 0.0092

Epoch 4: LearningRateScheduler setting learning rate to 0.008847359999999999.
Epoch 4/5
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

KeyboardInterrupt: 

开始训练模型

In [None]:
# H=model.fit(train_ds,epochs=epochs,steps_per_epoch=len(train_x)//batch_size,validation_data=test_ds)
train_steps_per_epoch=np.floor(total_train_samples/batch_size).astype(np.int32)
test_steps_per_epoch = np.ceil(total_test_sample/batch_size).astype(np.int32)
H=model.fit(train_ds,epochs=epochs,
            steps_per_epoch=train_steps_per_epoch,
            validation_data=test_ds,
            validation_steps=test_steps_per_epoch,
            callbacks=callbacks,verbose=1)
#使用tf.data无法获取训练和测试总数，所以指定训练步数合使训练稳定



In [None]:
#画学习率变化曲线并保存到log中
def plot(lrs,title="Learning Rate Schedule"):
    #计算学习率随epoch的变化值
    epochs=np.arange(len(lrs))
    plt.figure()
    plt.plot(epochs,lrs)
    plt.xticks(epochs)
    plt.scatter(epochs,lrs)
    plt.title(title)
    plt.xlabel("Epoch #")
    plt.ylabel("Learning Rate")
plot(H.history['lr'])
plt.savefig(os.path.join(log_dir,'learning_rate.png'))

画模型训练及验证loss和acc曲线

In [None]:
N = H.epochs
plt.figure()
plt.plot(N, H.history['loss'], label='train_loss')
plt.scatter(N, H.history['loss'])
plt.plot(N, H.history['val_loss'], label='val_loss')
plt.scatter(N, H.history['val_loss'])
plt.plot(N, H.history['accuracy'], label='train_acc')
plt.scatter(N, H.history['accuracy'])
plt.plot(N, H.history['val_accuracy'], label='val_acc')
plt.scatter(N, H.history['val_accuracy'])
plt.title('Training Loss and Accuracy on Our_dataset')
plt.xlabel('Epoch #')
plt.ylabel('Loss/Accuracy')
plt.legend()
plt.savefig(os.path.join(log_dir, 'training.png'))


In [None]:
# 保存模型结构及配置参数
model_json = model.to_json()
with open(os.path.join(log_dir,'model_json.json'),'w') as json_file:
    json_file.write(model_json)


In [None]:
#对模型在测试集上进行评估
metrics=model.evaluate(test_ds,verbose=1)
print("val_loss:",metrics[0],"val_accuracy:",metrics[1])


In [None]:
predictions=model.predict(test_ds,verbose=1)


In [None]:
def print_metrics(labels, predictions,target_names,save=False,save_path=None):
    # 计算confusion result
    preds=np.argmax(predictions,axis=-1)
    confusion_result = confusion_matrix(labels, preds)
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1500)
    confusion_result = pd.DataFrame(confusion_result, index = target_names, columns = target_names)
    # classification report
    report = classification_report(labels, preds, target_names = target_names, digits = 4)
    result_report = 'Confuse_matrix:\n{}\n\nClassification_report:\n{} \n'.format(confusion_result, report)
    print(result_report)
    if save:   

        savepath = os.path.join(save_path,"predicted_result.txt")

        print('the result saved in %s' % savepath)#如果savepath相同的话,会把所有结果保存到同一个文件中

        with open(savepath, 'w') as f:
            f.write(result_report)



In [None]:
print_metrics(test_y,predictions,class_names,True,log_dir)


In [None]:
%%bash
tree model_output
