In [1]:
import numpy as np

# 整体结构
- 除了之前的全连接层`Affine`，新出现了卷积层(Convolution)和池化层(Pooling)
- `Affine - ReLU` 变成了 `Conv - ReLU - (Pooling)`


- 全连接层忽略了数据的形状，因此无法利用与形状相关的信息
    - 如高、宽、通道的 3 维图像数据，输入全连接层时，会被处理成一列的形式
    - 而图像数据的形状中含有重要的空间信息，如空间相邻的像素为相似的值，相距较远的像素没有什么关联，`RGB`各个通道之间有关联，三维形状中可能隐藏有本质模式
- `CNN` 的卷积层可以保持数据的形状不变。将卷积层的输入数据称为“输入特征图(`feature map`)”，输出数据称为“输出特征图”。
- 卷积层识别出输入中的模式(如图像中的横向或纵向线条)，随着卷积层的叠加，可以逐渐识别出输入的高阶模式(如图像中的特定形状)

- 卷积运算：滤波器(核) - 填充 - 步幅；
    - 滤波器(特征探测器)沿着输入数据扫描，将滤波器元素和对应的输入数据累加求和
   
    - 输入大小$(H,W)$，滤波器大小$(FH,FW)$，填充为$P$，步幅为$S$，则输出为$OH=\frac{H+2P-FH}{S}+1,\ OW=\frac{W+2P-FW}{S}+1$（不能整除时，下文的实现向下取整）
               
    - 多通道的输入数据，单个滤波器的通道数要与输入通道相同，每个通道的滤波器和对应通道的数据乘积累加后再求和，输入大小$(C,H,W)$，滤波器大小$(C,FH,FW)$，输出$(1,OH,OW)$；即输出是通道数为 1 的特征图
                     
    - 多个滤波器时，输出则为多通道的特征图；输入大小$(C,H,W)$，滤波器大小$(FN,C,FH,FW)$，滤波后输出$(FN,OH,OW)$，再加上偏置$(FN,1,1)$，最终输出$(FN,OH,OW)$
    
    
    
    
- 池化运算，池化是缩小高、长方向上空间的运算，通道数不发生变化
    - 池化层对输入数据进行采样，减小计算量、参数数量及内存使用，抑制过拟合的风险
    - 池化层只是从目标区域中取最大值或平均值，因此没有要学习的参数
    
    - 一般来说，池化窗口的大小会和步幅设定成相同的值
    - 输入数据发生微小偏差时，池化返回相同的结果。因此池化对微小的位置变化具有鲁棒性

# 卷积层的实现

## 输入数据预处理
- 可以对`Numpy`格式输入数据使用多重`for`循环实现卷积运算，运行较慢；利用`Numpy`中的矩阵乘法实现，输入数据需要预处理
![](../images/im2col.png)
- `(A)`为3维的输入数据$(C,H,W)$
- `(B)`将滤波器依次扫过的区域提取出来，$(C,FH,FW,OH,OW)$
- `(C)`将每个小块的数据$(C,FH,FW)$展平成$C\times FH\times FW$，整个数据转换成二维$(OH\times OW,C\times FH\times FW)$

- 将滤波器$(FN,C,FH,FW)$纵向展开为$(C\times FH\times FW, FN)$；与上面展开后的输入矩阵做矩阵乘法，即可得到$(OH\times OW, FN)$的输出


In [8]:
def im2col(input_data, filter_h, filter_w, stride=1, pad=0):
    """Unroll every filter-sized window of a 4-D batch into one row of a 2-D array.

    Parameters
    ----------
    input_data : 4-D array shaped (batch, channels, height, width)
    filter_h : window (filter) height
    filter_w : window (filter) width
    stride : step between neighbouring windows
    pad : number of cells added on each side of the two spatial axes
          (``np.pad`` in 'constant' mode, i.e. zero fill)

    Returns
    -------
    col : 2-D array shaped
          (batch * out_h * out_w, channels * filter_h * filter_w)
    """
    batch, channels, height, width = input_data.shape
    out_h = (height + 2 * pad - filter_h) // stride + 1
    out_w = (width + 2 * pad - filter_w) // stride + 1

    # Only the two spatial axes are padded; batch and channel axes are left alone.
    spatial_pad = (pad, pad)
    padded = np.pad(input_data, [(0, 0), (0, 0), spatial_pad, spatial_pad],
                    'constant')
    patches = np.zeros((batch, channels, filter_h, filter_w, out_h, out_w))

    row_span = stride * out_h
    col_span = stride * out_w
    for dy in range(filter_h):
        rows = slice(dy, dy + row_span, stride)
        for dx in range(filter_w):
            cols = slice(dx, dx + col_span, stride)
            # For a fixed offset (dy, dx) inside the window, gather that cell
            # from every window position at once.
            patches[:, :, dy, dx, :, :] = padded[:, :, rows, cols]

    # Reorder to (batch, out_h, out_w, channels, filter_h, filter_w) so that each
    # output position becomes one flattened row.
    patches = patches.transpose(0, 4, 5, 1, 2, 3)
    return patches.reshape(batch * out_h * out_w, -1)


def col2im(col, input_shape, filter_h, filter_w, stride=1, pad=0):
    """Scatter-add the rows of an unrolled 2-D array back into a 4-D image batch.

    Parameters
    ----------
    col : 2-D array that can be reshaped to
          (batch, out_h, out_w, channels, filter_h, filter_w)
    input_shape : target shape (batch, channels, height, width)
    filter_h : window (filter) height
    filter_w : window (filter) width
    stride : step between neighbouring windows
    pad : padding that was applied on each side of the spatial axes

    Returns
    -------
    img : array of shape ``input_shape``; cells covered by several windows hold
          the sum of all contributions, and the padded border is cropped away.
    """
    batch, channels, height, width = input_shape
    out_h = (height + 2 * pad - filter_h) // stride + 1
    out_w = (width + 2 * pad - filter_w) // stride + 1

    patches = col.reshape(batch, out_h, out_w, channels, filter_h, filter_w)
    patches = patches.transpose(0, 3, 4, 5, 1, 2)

    # The canvas is allocated with `stride - 1` extra rows/columns beyond the
    # padded size so the strided slices below always fit.
    canvas = np.zeros((batch, channels,
                       height + 2 * pad + stride - 1,
                       width + 2 * pad + stride - 1))

    row_span = stride * out_h
    col_span = stride * out_w
    for dy in range(filter_h):
        rows = slice(dy, dy + row_span, stride)
        for dx in range(filter_w):
            cols = slice(dx, dx + col_span, stride)
            canvas[:, :, rows, cols] += patches[:, :, dy, dx, :, :]

    # Drop the padding so the result matches `input_shape`.
    return canvas[:, :, pad:height + pad, pad:width + pad]

In [12]:
# Demo: unroll a random single-channel 5x5 image with a 3x3 window and stride 2.
x = np.random.rand(1, 1, 5, 5)
print(x)
x = im2col(x, filter_h=3, filter_w=3, stride=2, pad=0)
print(x)

[[[[0.38201987 0.97114608 0.35504192 0.75298388 0.50798056]
   [0.31585327 0.62507244 0.92727894 0.65885742 0.93872894]
   [0.6375136  0.32163867 0.78459359 0.88176927 0.49661809]
   [0.90116761 0.82551255 0.10971107 0.98956697 0.67861124]
   [0.12791343 0.94203226 0.38407891 0.04116853 0.09322415]]]]
[[0.38201987 0.97114608 0.35504192 0.31585327 0.62507244 0.92727894
  0.6375136  0.32163867 0.78459359]
 [0.35504192 0.75298388 0.50798056 0.92727894 0.65885742 0.93872894
  0.78459359 0.88176927 0.49661809]
 [0.6375136  0.32163867 0.78459359 0.90116761 0.82551255 0.10971107
  0.12791343 0.94203226 0.38407891]
 [0.78459359 0.88176927 0.49661809 0.10971107 0.98956697 0.67861124
  0.38407891 0.04116853 0.09322415]]


## 反向传播卷积层实现

In [13]:
class Convolution:
    """Convolution layer computed as a matrix product over im2col-unrolled input.

    Parameters
    ----------
    W : filter weights, unpacked as (FN, C, FH, FW)
    b : bias added to the (N * out_h * out_w, FN) product
    stride : step between neighbouring windows
    pad : padding forwarded to ``im2col`` / ``col2im``
    """

    def __init__(self, W, b, stride=1, pad=0):
        self.W, self.b = W, b
        self.stride, self.pad = stride, pad

        # Values cached by forward() for use in backward().
        self.x = None
        self.col = None
        self.col_W = None

        # Gradients filled in by backward().
        self.dW = None
        self.db = None

    def forward(self, x):
        """Convolve ``x`` (N, C, H, W) and return an (N, FN, out_h, out_w) array."""
        filter_num, _, filter_h, filter_w = self.W.shape
        batch, _, height, width = x.shape
        out_h = 1 + int((height + 2 * self.pad - filter_h) / self.stride)
        out_w = 1 + int((width + 2 * self.pad - filter_w) / self.stride)

        unrolled = im2col(x, filter_h, filter_w, self.stride, self.pad)
        # One column per filter, so a single matrix product applies all filters.
        weight_matrix = self.W.reshape(filter_num, -1).T

        product = np.dot(unrolled, weight_matrix) + self.b
        out = product.reshape(batch, out_h, out_w, -1).transpose(0, 3, 1, 2)

        self.x = x
        self.col = unrolled
        self.col_W = weight_matrix
        return out

    def backward(self, dout):
        """Store ``dW`` / ``db`` and return the gradient with respect to the input."""
        filter_num, channels, filter_h, filter_w = self.W.shape
        # Bring dout into the same (rows, FN) layout as the forward product.
        flat_dout = dout.transpose(0, 2, 3, 1).reshape(-1, filter_num)

        self.db = np.sum(flat_dout, axis=0)
        weight_grad = np.dot(self.col.T, flat_dout)
        self.dW = weight_grad.transpose(1, 0).reshape(filter_num, channels,
                                                      filter_h, filter_w)

        dcol = np.dot(flat_dout, self.col_W.T)
        return col2im(dcol, self.x.shape, filter_h, filter_w, self.stride,
                      self.pad)

In [14]:
# Demo: run one random 3x3 filter over a random 5x5 single-channel image, stride 2.
x = np.random.rand(1, 1, 5, 5)
print(x)
W = np.random.rand(1, 1, 3, 3)
# One bias value per filter, shape (FN,). Convolution.forward adds the bias to a
# (rows, FN) matrix and Convolution.backward produces db with shape (FN,); the
# previous (1, 1, 1) bias only worked through accidental broadcasting for FN == 1.
b = np.random.rand(1)
conv = Convolution(W, b, stride=2, pad=0)
x = conv.forward(x)
print(x)

[[[[0.47333855 0.30416613 0.34319674 0.73066228 0.43317542]
   [0.69969408 0.85851103 0.0565631  0.1898052  0.23868948]
   [0.66851131 0.57649214 0.94791586 0.03002559 0.00995395]
   [0.63725434 0.07555025 0.76421214 0.64179982 0.53745217]
   [0.80886616 0.68542613 0.48702812 0.74047263 0.17961829]]]]
[[[[2.34473842 1.41175451]
   [2.61711026 2.79379658]]]]


## 池化层的实现

In [3]:
class Pooling:
    """Max-pooling layer built on ``im2col`` / ``col2im``.

    Parameters
    ----------
    pool_h : pooling window height
    pool_w : pooling window width
    stride : step between neighbouring windows
    pad : padding forwarded to ``im2col`` / ``col2im``. ``im2col`` pads with
          zeros, so with pad > 0 a window of all-negative inputs yields 0.
    """

    def __init__(self, pool_h, pool_w, stride=1, pad=0):
        self.pool_h = pool_h
        self.pool_w = pool_w
        self.stride = stride
        self.pad = pad

        # Values cached by forward() for use in backward().
        self.x = None
        self.arg_max = None

    def forward(self, x):
        """Return the per-window maximum of ``x`` (N, C, H, W) as (N, C, out_h, out_w)."""
        N, C, H, W = x.shape
        # Include the padding so the output size agrees with what im2col produces;
        # ignoring it made the reshape below fail whenever pad != 0.
        out_h = (H + 2 * self.pad - self.pool_h) // self.stride + 1
        out_w = (W + 2 * self.pad - self.pool_w) // self.stride + 1

        col = im2col(x, self.pool_h, self.pool_w, self.stride, self.pad)
        # One row per (sample, output position, channel); one column per window cell.
        col = col.reshape(-1, self.pool_h * self.pool_w)

        arg_max = np.argmax(col, axis=1)
        out = np.max(col, axis=1)
        out = out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2)

        self.x = x
        self.arg_max = arg_max
        return out

    def backward(self, dout):
        """Route each upstream gradient to the cell that held its window's maximum."""
        dout = dout.transpose(0, 2, 3, 1)
        pool_size = self.pool_h * self.pool_w

        # One row per pooled output value; np.zeros needs the element count
        # (dout.size), not the array itself.
        dmax = np.zeros((dout.size, pool_size))
        dmax[np.arange(self.arg_max.size),
             self.arg_max.flatten()] = dout.flatten()
        dmax = dmax.reshape(dout.shape + (pool_size, ))

        # Merge the channel and window axes to restore the layout col2im expects.
        dcol = dmax.reshape(dmax.shape[0] * dmax.shape[1] * dmax.shape[2], -1)
        dx = col2im(dcol, self.x.shape, self.pool_h, self.pool_w, self.stride,
                    self.pad)
        return dx

In [15]:
# Demo: 3x3 max pooling with stride 2 over a random single-channel 5x5 image.
x = np.random.rand(1, 1, 5, 5)
print(x)
pool = Pooling(pool_h=3, pool_w=3, stride=2, pad=0)
x = pool.forward(x)
print(x)

[[[[0.35381993 0.60204659 0.9236718  0.85403515 0.51023189]
   [0.06734814 0.68786701 0.79496244 0.61514594 0.76761991]
   [0.85729933 0.87327673 0.08466223 0.68186728 0.95831023]
   [0.09123023 0.08212613 0.21043752 0.49848096 0.08235595]
   [0.74364385 0.14449196 0.3734369  0.08876523 0.63836527]]]]
[[[[0.9236718  0.95831023]
   [0.87327673 0.95831023]]]]


# CNN模型

In [None]:
from collections import OrderedDict
import pickle


class SimpleConvNet:
    """Simple convolutional network.

    Architecture: conv - relu - pool - affine - relu - affine - softmax.

    Parameters
    ----------
    input_dim : (channels, height, width) of one input sample. Only
        ``input_dim[0]`` (channels) and ``input_dim[1]`` (used as the spatial
        size for both axes) are read, so square inputs are assumed.
    conv_param : dict with the keys 'filter_num', 'filter_size', 'pad' and
        'stride' describing the single convolution layer.
    hidden_size : number of units in the hidden affine layer
    output_size : number of output units
    weight_init_std : standard deviation used to scale the random initial weights

    Notes
    -----
    ``Relu``, ``Affine`` and ``SoftmaxWithLoss`` are not defined in this cell;
    they must already be in scope when the class is instantiated.
    """
    def __init__(self,
                 input_dim=(1, 28, 28),
                 conv_param={
                     'filter_num': 30,
                     'filter_size': 5,
                     'pad': 0,
                     'stride': 1
                 },
                 hidden_size=100,
                 output_size=10,
                 weight_init_std=0.01):
        # The default dict is only read, never mutated, so sharing it is safe.
        filter_num = conv_param['filter_num']
        filter_size = conv_param['filter_size']
        pad = conv_param['pad']
        stride = conv_param['stride']
        input_size = input_dim[1]
        # Integer arithmetic mirrors the flooring done by the Convolution and
        # 2x2/stride-2 Pooling layers, so W2 gets the right number of rows even
        # when the sizes do not divide evenly.
        conv_output_size = (input_size - filter_size + 2 * pad) // stride + 1
        pool_output_size = filter_num * (conv_output_size // 2) ** 2

        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(
            filter_num, input_dim[0], filter_size, filter_size)
        self.params['b1'] = np.zeros(filter_num)
        self.params['W2'] = weight_init_std * np.random.randn(
            pool_output_size, hidden_size)
        self.params['b2'] = np.zeros(hidden_size)
        self.params['W3'] = weight_init_std * np.random.randn(
            hidden_size, output_size)
        self.params['b3'] = np.zeros(output_size)

        self.layers = OrderedDict()
        self.layers['Conv1'] = Convolution(self.params['W1'],
                                           self.params['b1'], stride, pad)
        self.layers['Relu1'] = Relu()
        self.layers['Pool1'] = Pooling(pool_h=2, pool_w=2, stride=2)
        self.layers['Affine1'] = Affine(self.params['W2'], self.params['b2'])
        self.layers['Relu2'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W3'], self.params['b3'])
        self.last_layer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer except the loss layer and return the scores."""
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    def loss(self, x, t):
        """Return the loss of the network's scores for ``x`` against labels ``t``."""
        y = self.predict(x)
        # The loss layer needs the scores and the labels, not the raw input.
        return self.last_layer.forward(y, t)

    def accuracy(self, x, t, batch_size=100):
        """Return the fraction of samples in ``x`` whose arg-max score equals ``t``.

        ``t`` may be class indices (1-D) or one-hot rows; the data is evaluated
        in chunks of ``batch_size``, including a final partial chunk.
        """
        if t.ndim != 1:
            t = np.argmax(t, axis=1)

        total = x.shape[0]
        if total == 0:
            return 0.0

        acc = 0.0
        for start in range(0, total, batch_size):
            tx = x[start:start + batch_size]
            tt = t[start:start + batch_size]
            y = np.argmax(self.predict(tx), axis=1)
            # Compare with this chunk's labels only.
            acc += np.sum(y == tt)
        return acc / total

    def gradient(self, x, t):
        """Back-propagate the loss and return a dict of gradients for W1..W3, b1..b3."""
        # Forward pass: lets every layer cache what its backward() needs.
        self.loss(x, t)

        # Backward pass: loss layer first, then the layers in reverse order.
        dout = 1
        dout = self.last_layer.backward(dout)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        grads = {}
        grads['W1'], grads['b1'] = self.layers['Conv1'].dW, self.layers[
            'Conv1'].db
        grads['W2'], grads['b2'] = self.layers['Affine1'].dW, self.layers[
            'Affine1'].db
        grads['W3'], grads['b3'] = self.layers['Affine2'].dW, self.layers[
            'Affine2'].db
        return grads

    def save_params(self, file_name='params.pkl'):
        """Pickle a shallow copy of ``self.params`` to ``file_name``."""
        params = dict(self.params)
        with open(file_name, 'wb') as f:
            pickle.dump(params, f)

    def load_params(self, file_name='params.pkl'):
        """Load parameters pickled by ``save_params`` and rebind them on the layers."""
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # parameter files that come from a trusted source.
        with open(file_name, 'rb') as f:
            params = pickle.load(f)
        for key, val in params.items():
            self.params[key] = val

        # Parameters are numbered from 1 (W1/b1 ... W3/b3).
        for i, key in enumerate(['Conv1', 'Affine1', 'Affine2'], start=1):
            self.layers[key].W = self.params['W' + str(i)]
            self.layers[key].b = self.params['b' + str(i)]

# 训练模型

In [17]:
class Trainer:
    """Mini-batch training loop for a network.

    The network must provide ``gradient(x, t)``, ``loss(x, t)``,
    ``accuracy(x, t)`` and a ``params`` dict; the optimizer must provide
    ``update(params, grads)``.

    Parameters
    ----------
    network : the model to train
    x_train, t_train : training inputs and labels
    x_test, t_test : test inputs and labels
    epochs : number of passes over the training data
    mini_batch_size : number of samples drawn per training step
    optimizer : case-insensitive optimizer name, looked up in the table below
    optimizer_params : keyword arguments forwarded to the optimizer constructor
    evaluate_sample_num_per_epoch : if not None, only the first this-many
        samples are used for the per-epoch accuracy evaluation
    verbose : print the loss at every step and the accuracies at every epoch

    Notes
    -----
    The optimizer classes (``SGD``, ``Momentum``, ``Nesterov``, ``AdaGram``,
    ``RMSprop``, ``Adam``) are not defined in this cell; all of them must be in
    scope when a Trainer is created.
    """
    def __init__(self,
                 network,
                 x_train,
                 t_train,
                 x_test,
                 t_test,
                 epochs=20,
                 mini_batch_size=100,
                 optimizer='SGD',
                 optimizer_params={'lr': 0.01},
                 evaluate_sample_num_per_epoch=None,
                 verbose=True):
        self.network = network
        self.verbose = verbose
        self.x_train = x_train
        self.t_train = t_train
        self.x_test = x_test
        self.t_test = t_test
        self.epochs = epochs
        self.batch_size = mini_batch_size
        self.evaluate_sample_num_per_epoch = evaluate_sample_num_per_epoch

        # 'adagram' and 'rmsprpo' are the keys this table originally used; they
        # are kept for backward compatibility next to the correctly spelled
        # aliases.
        # NOTE(review): `AdaGram` is kept exactly as the original cell spelled
        # it — confirm the class is not actually named `AdaGrad`.
        optimizer_class_dict = {
            'sgd': SGD,
            'momentum': Momentum,
            'nesterov': Nesterov,
            'adagram': AdaGram,
            'adagrad': AdaGram,
            'rmsprpo': RMSprop,
            'rmsprop': RMSprop,
            'adam': Adam
        }
        # The (shared) default dict is only unpacked here, never mutated.
        self.optimizer = optimizer_class_dict[optimizer.lower()](
            **optimizer_params)
        self.train_size = x_train.shape[0]
        # Keep this an integer so the modulo check in train_step() fires once
        # per epoch even when train_size is not a multiple of the batch size.
        self.iter_per_epoch = max(self.train_size // mini_batch_size, 1)
        self.max_iter = int(epochs * self.iter_per_epoch)
        self.current_iter = 0
        self.current_epoch = 0

        self.train_loss_list = []
        self.train_acc_list = []
        self.test_acc_list = []

    def train_step(self):
        """Run one optimizer update; evaluate accuracy at the start of each epoch."""
        batch_mask = np.random.choice(self.train_size, self.batch_size)
        # Use the data given to this trainer, not notebook-level globals.
        x_batch = self.x_train[batch_mask]
        t_batch = self.t_train[batch_mask]

        grads = self.network.gradient(x_batch, t_batch)
        self.optimizer.update(self.network.params, grads)

        loss = self.network.loss(x_batch, t_batch)
        self.train_loss_list.append(loss)
        if self.verbose:
            print("train loss:" + str(loss))

        if self.current_iter % self.iter_per_epoch == 0:
            self.current_epoch += 1

            x_train_sample, t_train_sample = self.x_train, self.t_train
            x_test_sample, t_test_sample = self.x_test, self.t_test
            if self.evaluate_sample_num_per_epoch is not None:
                # Evaluate on a prefix of the data to keep each epoch cheap.
                t = self.evaluate_sample_num_per_epoch
                x_train_sample, t_train_sample = self.x_train[:t], self.t_train[:t]
                x_test_sample, t_test_sample = self.x_test[:t], self.t_test[:t]

            train_acc = self.network.accuracy(x_train_sample, t_train_sample)
            test_acc = self.network.accuracy(x_test_sample, t_test_sample)
            self.train_acc_list.append(train_acc)
            self.test_acc_list.append(test_acc)

            if self.verbose:
                print(
                    f"=== epoch: {str(self.current_epoch)}, train acc: {str(train_acc)}, test acc: {str(test_acc)} ==="
                )
        self.current_iter += 1

    def train(self):
        """Run ``max_iter`` training steps, then report accuracy on the full test set."""
        for i in range(self.max_iter):
            self.train_step()

        test_acc = self.network.accuracy(self.x_test, self.t_test)
        if self.verbose:
            print("====== Final Test Accuracy =====")
            print("test acc: " + str(test_acc))

In [None]:
# Train SimpleConvNet and plot the per-epoch train/test accuracy.
# NOTE(review): `load_mnist` and `plt` are not imported in the visible cells —
# presumably matplotlib.pyplot and a dataset helper from an earlier session;
# confirm they are in scope before running on a fresh kernel.
(x_train, t_train), (x_test, t_test) = load_mnist(flatten=False)

max_epochs = 20
network = SimpleConvNet()
trainer = Trainer(network,
                  x_train,
                  t_train,
                  x_test,
                  t_test,
                  epochs=max_epochs,
                  mini_batch_size=100,
                  optimizer='Adam',
                  optimizer_params={'lr': 0.001},
                  evaluate_sample_num_per_epoch=1000)
trainer.train()

# Same file name that the visualisation cell loads later.
network.save_params('params.pkl')

markers = {'train': 'o', 'test': 's'}
# One x position per recorded epoch, so the plot cannot hit a length mismatch.
x = np.arange(len(trainer.train_acc_list))
plt.plot(x,
         trainer.train_acc_list,
         marker=markers['train'],
         label='train',
         markevery=2)
plt.plot(x,
         trainer.test_acc_list,
         marker=markers['test'],
         label='test',
         markevery=2)
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()

# 可视化特征图

In [None]:
def filter_show(filters, nx=8, margin=3, scale=10):
    """Show the first channel of every filter as a grid of grayscale images.

    Parameters
    ----------
    filters : 4-D array unpacked as (FN, C, FH, FW)
    nx : number of grid columns; the row count is ceil(FN / nx)
    margin, scale : accepted for interface compatibility; not used here
    """
    FN, C, FH, FW = filters.shape
    ny = int(np.ceil(FN / nx))
    fig = plt.figure()
    fig.subplots_adjust(left=0,
                        right=1,
                        bottom=0,
                        top=1,
                        hspace=0.05,
                        wspace=0.05)
    for i in range(FN):
        # Subplot indices are 1-based.
        ax = fig.add_subplot(ny, nx, i + 1, xticks=[], yticks=[])
        # Only channel 0 of each filter is drawn, with the reversed gray colormap.
        ax.imshow(filters[i, 0], cmap=plt.cm.gray_r, interpolation='nearest')
    plt.show()


# Filters of a freshly initialised network (random weights, before training).
network = SimpleConvNet()
filter_show(network.params['W1'])

# The same filters after loading the trained parameters.
network.load_params("params.pkl")
filter_show(network.params['W1'])