In [None]:
# 同时添加如下代码, 这样每次环境(kernel)启动的时候只要运行下方代码即可: 
# Also add the following code, 
# so that every time the environment (kernel) starts, 
# just run the following code: 
import sys 
# sys.path.append('/home/aistudio/external-libraries') # 添加自定义依赖

## 1、 加载相关库文件，并打印一下paddle的环境

In [None]:
# 加载相关库
import os
import paddle
import paddle.vision.transforms as T
from paddle.nn import Layer
from paddle.vision.datasets import Cifar10
from paddle.nn import Conv2D, MaxPool2D, Linear
from paddle.static import InputSpec
import numpy as np
from visualdl import LogWriter # 用来可视化的库
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
# 1 打印环境
print(paddle.__version__) # 2.2.2

## 2、查看一下数据
* 了解训练数据的基本形状和大小
* 了解训练集和测试集的大小
* 计算训练集数据的均值和标准差（不能使用测试集，因为测试环境往往是未知的）

In [None]:
# 2 查看一下数据

# 观察少量数据
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# Load the raw training split (no transform) and preview the first five samples.
train_dataset = Cifar10(mode='train')
plt.figure()
for idx, (img, label) in enumerate(train_dataset):
    if idx == 0:
        # Dump the raw image object and its label once, for the first sample only.
        print(img)
        print(label)
    plt.subplot(1, 5, idx + 1)  # one row, five columns
    plt.imshow(img)
    if idx == 4:  # five images are enough
        break
plt.show()
# No transform was applied to this dataset, so the images are shown as loaded and
# no axis permutation such as paddle.transpose(img, perm=[1, 2, 0]) is needed here.

# 观察整个数据集及数据分布
# Inspect the full dataset.
# T.ToTensor converts each image to CHW layout and scales the values into the (0, 1) range.
to_tensor = T.ToTensor()
train_dataset = Cifar10(mode='train', transform=to_tensor)
test_dataset = Cifar10(mode='test', transform=to_tensor)

# (sample caption, size caption, dataset); printed as: first sample, then split size.
# Expected sizes: 50k training samples, 10k test samples.
# In practice, test data usually comes without labels.
overview = (
    ('一个训练数据:\n {}', '训练集大小: {}', train_dataset),
    ('一个测试数据:\n {}', '测试集大小: {}', test_dataset),
)
for sample_caption, size_caption, split in overview:
    print(sample_caption.format(split[0]))
    print(size_caption.format(len(split)))

# Per-channel statistics of the training split.
# Each image is converted to NumPy once and reduced over its pixels; the per-image
# results are accumulated in float64 to avoid float32 round-off over 50k additions.
# NOTE: `stds` is the average of the per-image standard deviations, not the standard
# deviation over all pixels of the dataset.
mean_sum = np.zeros(3, dtype=np.float64)
std_sum = np.zeros(3, dtype=np.float64)
for img, _ in train_dataset:
    pixels = img.numpy().reshape(3, -1)  # (channel, H*W)
    mean_sum += pixels.mean(axis=1)
    std_sum += pixels.std(axis=1)
means = (mean_sum / len(train_dataset)).astype(np.float32)
stds = (std_sum / len(train_dataset)).astype(np.float32)
print('均值: {}'.format(means)) # approx. [0.4914 0.4822 0.4465]
print('标准差: {}'.format(stds)) # approx. [0.2022 0.1993 0.2009]


## 3、自定义数据集
* 定义继承自paddle.io.Dataset的数据类
* 初始化函数__init__里需要用mode区分训练/验证/测试
* 初始化函数__init__里需要定义数据增广方法
* 编写根据索引idx获取单个数据的逻辑(__getitem__)
* 编写获取数据集长度的函数(__len__)

In [None]:
# 3 自定义数据集(自己定义数据集) 与上面的构建dataset等价
class MyDateset(paddle.io.Dataset):
    """CIFAR-10 dataset with split-specific preprocessing.

    Args:
        mode: 'train' loads the training split and applies random-crop /
            horizontal-flip augmentation; 'valid' and 'eval' both load the
            test split and apply only tensor conversion and normalization.

    Raises:
        ValueError: if `mode` is not one of 'train', 'valid', 'eval'.
    """

    def __init__(self, mode='train'):
        super(MyDateset, self).__init__()

        # Per-channel (mean, std) measured on the training split.
        stats = ((0.491401, 0.4821591, 0.44653094), (0.20220289, 0.1993163, 0.20086345))
        if mode == 'train':
            self.data = Cifar10(mode='train')
            self.transform = T.Compose([
                T.RandomCrop(32, padding=4),  # random crop after 4-pixel padding
                T.RandomHorizontalFlip(),     # random horizontal flip
                T.ToTensor(),                 # to CHW layout, values scaled into (0, 1)
                T.Normalize(*stats),          # per-channel normalization
                # Further preprocessing steps can be appended here.
            ])
        elif mode in ('valid', 'eval'):
            # Validation / test preprocessing must not include training-time augmentation.
            self.data = Cifar10(mode='test')
            self.transform = T.Compose([
                T.ToTensor(),
                T.Normalize(*stats),
            ])
        else:
            raise ValueError("mode can only be one of ['train', 'valid', 'eval']")

    def __getitem__(self, idx):
        """Return the transformed image and its label for sample `idx`."""
        # Fetch the sample once; indexing the underlying dataset twice would
        # load the same sample twice.
        image, label = self.data[idx]
        return self.transform(image), label

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self.data)

## 4、封装成Dataloader并验证
* 验证自定义数据集
* 查看自定义数据集形状和大小（这里和原始数据查看不重复，每一步都需要验证是否有错误，才能避免多踩坑）
* 封装成Dataloader并查看数据

In [None]:
# Build both splits and sanity-check one sample and the size of each.
train_dataset = MyDateset(mode='train')
test_dataset = MyDateset(mode='eval')
print(train_dataset[0]) # ( Tensor(shape=[3, 32, 32], dtype=float32), array(6, dtype=int64) )
print(len(train_dataset)) # 50k
print(test_dataset[0]) # ( Tensor(shape=[3, 32, 32], dtype=float32), array(3, dtype=int64) )
print(len(test_dataset)) # 10k

train_loader = paddle.io.DataLoader(train_dataset, batch_size=32, shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps runs comparable.
test_loader = paddle.io.DataLoader(test_dataset, batch_size=32, shuffle=False)


# Inspect a single batch.
for img, label in train_loader():
    print(img.numpy())
    print(label.numpy())
    break

## 5、搭建网络模型
* 初始化需要的组件(__init__)
* 编写数据流向，前向推理(forward)
* 核对网络模型和参数

In [None]:
# 5 搭建网络模型
class MyModel(Layer):
    """Small CNN: three 3x3 convolutions followed by two fully connected layers.

    Spatial sizes noted below assume a 32x32 input, which is what makes the
    flattened feature vector 64 * 4 * 4 = 1024 elements long.

    Args:
        num_classes: number of output logits produced by the last linear layer.
    """

    def __init__(self,  num_classes=1):
        super().__init__()
        # Stage 1: 32x32 -> conv 30x30 -> pool 15x15
        self.conv1 = Conv2D(3, 32, kernel_size=3, stride=1)
        self.relu1 = paddle.nn.ReLU()
        self.max_pool1 = MaxPool2D(kernel_size=2, stride=2)

        # Stage 2: 15x15 -> conv 13x13 -> pool 6x6
        self.conv2 = Conv2D(32, 64, kernel_size=3, stride=1)
        self.relu2 = paddle.nn.ReLU()
        self.max_pool2 = MaxPool2D(kernel_size=2, stride=2)

        # Stage 3: 6x6 -> conv 4x4
        self.conv3 = Conv2D(64, 64, kernel_size=3, stride=1)
        self.relu3 = paddle.nn.ReLU()

        # Classifier head on the flattened 64 * 4 * 4 feature map
        self.linear1 = Linear(1024, 64)
        self.relu4 = paddle.nn.ReLU()
        self.linear2 = Linear(64, num_classes)

    def forward(self, x):
        """Run the convolutional stages, flatten, and return the class logits."""
        conv_stack = (
            self.conv1, self.relu1, self.max_pool1,
            self.conv2, self.relu2, self.max_pool2,
            self.conv3, self.relu3,
        )
        for layer in conv_stack:
            x = layer(x)

        # Keep the batch axis, merge everything else into one feature axis.
        x = paddle.flatten(x, start_axis=1, stop_axis=-1)

        for layer in (self.linear1, self.relu4, self.linear2):
            x = layer(x)
        return x

# Describe the tensors the network consumes: image batches of shape [N, 3, 32, 32]
# (the layout MyModel.forward convolves over) and integer class-index labels.
inputs = InputSpec([None, 3, 32, 32], 'float32', 'x')
# NOTE(review): [None, 1] follows the usual high-level-API convention for labels;
# confirm against the label shape the DataLoader actually yields before using model.fit.
labels = InputSpec([None, 1], 'int64', 'label')
model = paddle.Model(MyModel(num_classes=10), inputs, labels)

# Print the layer-by-layer summary of the model.
model.summary((-1,3,32,32))

## 6、定义训练流程
* 数据载入
* epoch循环体
* 保存评价指标，方便可视化训练
* 边训练边验证
* 保存中间模型

In [None]:
# 6 定义训练流程
def _save_checkpoint(model, optimizer, save_dir, tag):
    """Write model and optimizer state to `<save_dir>/<tag>.pdparams` / `.pdopt`."""
    paddle.save(model.state_dict(), '{}/{}.pdparams'.format(save_dir, tag))
    paddle.save(optimizer.state_dict(), '{}/{}.pdopt'.format(save_dir, tag))
    print("epoch {}: Model has been saved in {}.".format(tag, save_dir))


def train( # hand-written counterpart of paddle.Model.fit
    model,
    train_dataset,
    test_dataset,
    optimizer,
    loss,
    metric,
    epochs=1,
    batch_size=1,
    save_dir=None, # directory for checkpoints; None disables saving
    save_freq=1, # checkpoint frequency, in epochs
    verbose=0, # 1 logs every batch
    log_freq=200, # log every `log_freq` batches
    suffle=True
    ):
    """Train `model`, evaluating on the test data after every epoch.

    Scalars are written through the module-level `log_writer`, which must
    exist before this function is called.

    Args:
        model: network called as `model(x)` to produce logits.
        train_dataset: a Dataset (wrapped in a DataLoader here) or a ready DataLoader.
        test_dataset: a Dataset (wrapped in a DataLoader here) or a ready DataLoader.
        optimizer: optimizer providing `step()`, `clear_grad()` and `state_dict()`.
        loss: callable `loss(logits, labels)`.
        metric: metric providing compute / update / accumulate / reset;
            `accumulate()` must return at least two values (top-1, top-5).
        epochs: number of passes over the training data.
        batch_size: batch size used when a Dataset has to be wrapped.
        save_dir: checkpoint directory, or None to skip saving.
        save_freq: a checkpoint is written when `epoch % save_freq == 0`
            (epoch counted from 0); the last epoch is also saved as 'final'.
        verbose: when 1, `log_freq` is forced to 1.
        log_freq: interval, in batches, between training log lines.
        suffle: whether the training DataLoader built here shuffles its data.
    """
    # Accept either a Dataset or an already-built DataLoader for each split.
    if isinstance(train_dataset, paddle.io.DataLoader):
        train_loader = train_dataset
    else:
        train_loader = paddle.io.DataLoader(train_dataset, batch_size=batch_size, shuffle=suffle)
    if isinstance(test_dataset, paddle.io.DataLoader):
        test_loader = test_dataset
    else:
        # Evaluation order does not affect the aggregated metrics, so no shuffling.
        test_loader = paddle.io.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    if verbose == 1:
        log_freq = 1
    print('start training ... ')
    model.train()

    train_iter = 0
    test_iter = 0
    for epoch in range(epochs):
        for batch_id, data in enumerate(train_loader):
            x_data = data[0]
            y_data = data[1]

            logits = model(x_data)

            # Running accuracy since the last metric.reset().
            correct = metric.compute(logits, y_data)
            metric.update(correct)
            acc = metric.accumulate()

            l = loss(logits, y_data)
            avg_loss = paddle.mean(l)
            # float() works for both one-element and 0-D tensors.
            loss_value = float(avg_loss)

            log_writer.add_scalar(tag = 'train/loss', step = train_iter, value = loss_value)

            if batch_id % log_freq == 0:
                print("[train] epoch: {}, batch_id: {}, loss is: {:.4f}, top1 acc: {:.4f}, top5 acc: {:.4f}".format(epoch, batch_id, loss_value, acc[0], acc[1]))
                log_writer.add_scalar(tag = 'train/top1_acc', step = train_iter, value = acc[0])
                log_writer.add_scalar(tag = 'train/top5_acc', step = train_iter, value = acc[1])
                metric.reset() # accuracy is reported per logging window
            l.backward()
            optimizer.step()
            optimizer.clear_grad()
            train_iter += 1

        # Evaluate after every epoch, starting from a clean metric state.
        metric.reset()
        model.eval()
        losses = []
        acc = None
        with paddle.no_grad(): # no gradients are needed while evaluating
            for batch_id, data in enumerate(test_loader):
                x_data = data[0]
                y_data = data[1]

                logits = model(x_data)

                correct = metric.compute(logits, y_data)
                metric.update(correct)
                acc = metric.accumulate() # accumulated over the whole eval pass

                l = loss(logits, y_data)
                loss_value = float(paddle.mean(l))

                losses.append(loss_value)
                log_writer.add_scalar(tag = 'eval/loss', step = test_iter, value = loss_value)
                test_iter += 1

        if losses: # skip reporting when the eval loader yielded nothing
            log_writer.add_scalar(tag = 'eval/top1_acc', step = epoch, value = acc[0])
            log_writer.add_scalar(tag = 'eval/top5_acc', step = epoch, value = acc[1])

            avg_loss = np.mean(losses)
            print("[test] epoch: {}, loss is: {:.4f}, top1 acc: {:.4f}, top5 acc: {:.4f}".format(epoch, avg_loss, acc[0], acc[1]))
        metric.reset()

        # Checkpoints go to the directory passed in, not to a module-level constant.
        if save_dir is not None:
            if epoch+1 == epochs:
                _save_checkpoint(model, optimizer, save_dir, 'final')
            if epoch % save_freq == 0:
                _save_checkpoint(model, optimizer, save_dir, epoch+1)
        model.train() # back to training mode for the next epoch

## 7、编写主函数
* 定义优化器、损失函数和评价指标
* 初始化参数，并开始训练

In [None]:
# 主函数入口
# if __name__ == '__main__': # 程序入口

# 用于模型保存和可视化参数的路径（交互条件下__file__失效）。写入绝对路径
# filepath, filename = os.path.split(os.path.realpath(__file__))
# stem, suffix = os.path.splitext(filename) # filename .py
# filepath = '/home/aistudio/work' # 写自己的绝对路径或者使用相对路径
# Output locations, relative to the working directory.
filepath = './work'
stem = 'cifar10'
SAVE_DIR = '{}/model/{}'.format(filepath, stem)
# NOTE(review): this callback object is created but not passed to train() below.
visualdl = paddle.callbacks.VisualDL(
    log_dir='{}/visualdl_log/{}'.format(filepath, stem),
)
# Writer used by train() for the loss / accuracy curves.
log_writer = LogWriter(
    logdir='{}/visualdl_log/{}'.format(filepath, stem),
)

# Data
train_dataset = MyDateset(mode='train')
test_dataset = MyDateset(mode='eval')

# Model
model = MyModel(num_classes=10)

# Optimizer, loss and metric
# NOTE(review): ReduceOnPlateau only lowers the learning rate when its step(metric)
# is called; confirm the training loop does so, otherwise the rate stays at 0.01.
scheduler = paddle.optimizer.lr.ReduceOnPlateau(
    learning_rate=0.01,
    factor=0.5,
    patience=5,
    verbose=True,
)
t_optimizer = paddle.optimizer.Momentum(
    learning_rate=scheduler,
    momentum=0.9,
    parameters=model.parameters(),
    weight_decay=0.001,
)
t_loss = paddle.nn.CrossEntropyLoss()
t_metric = paddle.metric.Accuracy(topk=(1, 5))

# Raise `epochs` to see how far the accuracy goes; batch size, optimizer, loss and
# metric can all be swapped here without touching the training loop.
train(
    model,
    train_dataset,
    test_dataset,
    epochs=3,
    batch_size=32,
    optimizer=t_optimizer,
    loss=t_loss,
    metric=t_metric,
    save_dir=SAVE_DIR,
    save_freq=1,
)

## 8、可视化
由于BML自带可视化工具，直接可以用log来可视化
自己在本地用pip安装visualdl工具
然后新开一个终端，用下面命令开启可视化服务
```bash
visualdl --logdir ./work/visualdl_log/cifar10 --port 8080
```
再打开浏览器，访问 http://localhost:8080