# TensorBoard 

当我们训练一个模型时，往往会在训练过程中加入一些日志打印，包括训练损失、验证集上的精度等，这些日志可以让我们了解模型训练的进展。`TensorBoard` 是一个可以把这件事做得更好的工具，PyTorch 中集成了 `TensorBoard`，可以让我们对多种数据进行可视化，并对网络的训练过程进行跟踪。

TensorBoard 可以做哪些事：

* Tracking and visualizing metrics such as loss and accuracy
* Visualizing the model graph (ops and layers)
* Viewing histograms of weights, biases, or other tensors as they change over time
* Projecting embeddings to a lower dimensional space
* Displaying images, text, and audio data
* Profiling programs

TensorBoard 支持的时序数据类型有：

* Scalar
* Images, Video, Audio, Text
* Histogram
* Embedding Features
* Hyper Parameters

TensorBoard相关的参考链接：

1. [PyTorch TensorBoard 接口说明](https://pytorch.org/docs/stable/tensorboard.html)
2. [PyTorch Tutorials: Visualizing Models, Data, and Training with TensorBoard](https://pytorch.org/tutorials/intermediate/tensorboard_tutorial.html)
3. [PyTorch Tutorials: PyTorch Profiler with TensorBoard](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html)

In [1]:
import torch
import matplotlib.pyplot as plt
import torchvision.datasets
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from torchvision import models

from torch.utils.tensorboard import SummaryWriter

## 创建 TensorBoard Writer

In [2]:
# Writer for the visualisation cells below; its event files are stored under
# the "runs/tensorboard_tutor" directory.
writer = SummaryWriter(log_dir="runs/tensorboard_tutor")

## Data Visualization

In [3]:
# Preprocessing applied to every image: a random crop resized to 224x224,
# followed by conversion to a tensor.
transform = transforms.Compose(
    [
        transforms.RandomResizedCrop((224, 224)),
        transforms.ToTensor(),
    ]
)

# Load the images found under ../data/flowers/ with the transform attached.
flower_dataset = torchvision.datasets.ImageFolder(
    root="../data/flowers/", transform=transform
)

# Random 80% / 20% split into a training subset and a held-out subset.
train_data, test_data = torch.utils.data.random_split(
    dataset=flower_dataset, lengths=[0.8, 0.2]
)

# Both loaders use batches of 8; only the training loader shuffles.
train_iter = torch.utils.data.DataLoader(
    dataset=train_data, batch_size=8, shuffle=True
)

test_iter = torch.utils.data.DataLoader(
    dataset=test_data, batch_size=8, shuffle=False
)

In [4]:
# Take one batch from the training loader and log it as a single image grid.
images, labels = next(iter(train_iter))
img_grid = torchvision.utils.make_grid(images)
writer.add_image("flower_dataset_images", img_grid)

# Draw the same batch with matplotlib as a 2 x 4 panel, one subplot per image,
# each titled with the class name of its label.
cols, rows = 4, 2
figure = plt.figure(figsize=(8, 4))
for i in range(cols * rows):
    label = labels[i]
    # Move axis 0 to the end before handing the image to imshow.
    img = images[i].permute(1, 2, 0)
    ax = figure.add_subplot(rows, cols, i + 1)
    ax.set_axis_off()  # hide the x/y axes and their ticks
    ax.set_title(flower_dataset.classes[label])
    ax.imshow(img)

# Log the whole matplotlib figure as one image.
writer.add_figure("images_with_label", figure)

<div class="wy-nav-content-img">
    <img src="assets/Tensorboard_dataset_images.png" width="800px" alt="通过 add_image 在 TensorBoard 上展示的图片">
    <p>通过 <code>add_image</code> 在 TensorBoard 上展示的图片</p>
</div>

<div class="wy-nav-content-img">
    <img src="assets/Tensorboard_images_with_label.png" width="800px" alt="通过 add_figure 在 TensorBoard 上展示的图片">
    <p>通过 <code>add_figure</code> 在 TensorBoard 上展示的图片</p>
</div>

## 模型可视化

In [5]:
def get_model(num_classes):
    """Build a ResNet-18 whose final layer has ``num_classes`` outputs.

    The network is created with ``torchvision.models.resnet18()`` (called with
    no arguments). Its ``fc`` layer is replaced by a new ``nn.Linear`` with the
    same number of input features, and the weight of that new layer is
    initialised with Xavier-uniform.

    Args:
        num_classes: Number of output units of the replacement ``fc`` layer.

    Returns:
        The modified ResNet-18 module.
    """
    net = torchvision.models.resnet18()
    in_features = net.fc.in_features

    # Create and initialise the new head before attaching it to the network.
    head = nn.Linear(in_features, num_classes)
    nn.init.xavier_uniform_(head.weight)
    net.fc = head
    return net


# Build a classifier with one output per dataset class, log its graph using
# the sample batch as the model input, then close the writer.
model = get_model(num_classes=len(flower_dataset.classes))
writer.add_graph(model, input_to_model=images)
writer.close()

<div class="wy-nav-content-img">
    <img src="assets/Tensorboard_model_graph.png" width="400px" alt="在 TensorBoard 上可以查看整个模型的结构">
    <p> 在 TensorBoard 上可以查看整个模型的结构 </p>
</div>

## 训练过程可视化

In [6]:
def epoch_train(
    data_loader, model, optimizer, criterion, device, step, writer, classes
):
    """Train ``model`` for one pass over ``data_loader`` and log to ``writer``.

    For every batch the following are logged at the current global step: the
    grid of input images, the loss, the batch accuracy and a histogram of
    ``model.fc.weight``. On the batch with index 10 an embedding of the
    flattened inputs is logged as well, labelled with the predicted class
    names and using the input images as thumbnails.

    Args:
        data_loader: Iterable yielding ``(data, targets)`` batches.
        model: Network to train; it must expose ``fc.weight``.
        optimizer: Optimizer that updates the model parameters.
        criterion: Loss, called as ``criterion(scores, targets)``.
        device: Device each batch is moved to before the forward pass.
        step: Global step assigned to the first batch; it is incremented by
            one after every batch.
        writer: ``SummaryWriter``-like object that receives the records.
        classes: Sequence mapping a class index to its display name.

    Returns:
        A tuple ``(step, losses, accuracies)``: the global step after the last
        batch, the per-batch loss values (Python floats from ``loss.item()``)
        and the per-batch accuracies (the results of the tensor division,
        left unconverted).
    """
    losses = []
    accuracies = []
    model.train()
    for batch_idx, (data, targets) in enumerate(data_loader):
        data = data.to(device)
        targets = targets.to(device)

        scores = model(data)
        loss = criterion(scores, targets)
        loss_value = loss.item()
        losses.append(loss_value)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        predictions = scores.argmax(dim=1)
        num_correct = (predictions == targets).sum()
        train_acc = num_correct / float(data.size(0))
        accuracies.append(train_acc)

        img_grid = torchvision.utils.make_grid(data)

        # Every record carries the global step. Without it the images and
        # histograms would all be written at the same step, and TensorBoard
        # could not show how they evolve during training.
        writer.add_image("flower_images", img_grid, global_step=step)

        writer.add_scalar("Training Loss", loss_value, global_step=step)
        writer.add_scalar("Training Accuracy", train_acc, global_step=step)

        writer.add_histogram("model.fc", model.fc.weight, global_step=step)

        if batch_idx == 10:
            # Built only here: the embedding is the sole consumer of the
            # flattened inputs and of the predicted class names.
            features = data.reshape(data.shape[0], -1)
            predict_labels = [classes[label] for label in predictions]
            writer.add_embedding(
                features, metadata=predict_labels, label_img=data, global_step=step
            )

        step += 1
    return step, losses, accuracies


def evaluate(data_loader, model, device):
    """Return the accuracy of ``model`` over every sample in ``data_loader``.

    Accuracy is the total number of correct predictions divided by the total
    number of samples, so batches of different sizes (typically a smaller
    final batch) are weighted by how many samples they contain.

    Args:
        data_loader: Iterable yielding ``(data, targets)`` batches.
        model: Network to evaluate; it is switched to eval mode.
        device: Device each batch is moved to before the forward pass.

    Returns:
        The accuracy as a 0-dimensional tensor.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()
    num_correct = 0
    num_samples = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data, targets in data_loader:
            data = data.to(device)
            targets = targets.to(device)
            predictions = model(data).argmax(dim=1)
            num_correct += (predictions == targets).sum()
            num_samples += data.size(0)

    if num_samples == 0:
        raise ValueError("data_loader yielded no samples to evaluate")
    return num_correct / float(num_samples)


<div class="wy-nav-content-img">
    <img src="assets/Tensorboard_hist.png" width="500px" alt="模型 FC 层权重的数值分布随着训练 step 的变化">
    <p> 模型 FC 层权重的数值分布随着训练 step 的变化 </p>
</div>

In [None]:
# Prefer the GPU, but fall back to the CPU so the cell also runs on machines
# without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_epochs = 5
batch_sizes = [32, 64, 128]
learning_rates = [0.1, 0.3, 0.01, 0.03]

# Grid search: one independent training run per (batch size, learning rate).
for batch_size in batch_sizes:
    for learning_rate in learning_rates:
        step = 0
        model = get_model(len(flower_dataset.classes))
        model = model.to(device)
        criterion = nn.CrossEntropyLoss()
        train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # Each hyper-parameter combination gets its own SummaryWriter, so
        # every run is stored in a separate log directory.
        writer = SummaryWriter(
            f"runs/tensorboard_tutor/BatchSize {batch_size}, LR {learning_rate}"
        )

        try:
            for epoch in range(num_epochs):
                step, losses, accuracies = epoch_train(
                    train_iter,
                    model,
                    optimizer,
                    criterion,
                    device,
                    step,
                    writer,
                    flower_dataset.classes,
                )

                val_acc = evaluate(test_iter, model, device)

                writer.add_scalar("Validation Accuracy", val_acc, global_step=epoch)

            # After the run, record its hyper-parameters with the training
            # metrics averaged over the batches of the final epoch (`losses`
            # and `accuracies` are overwritten on every epoch). float() makes
            # the logged metrics plain numbers.
            writer.add_hparams(
                {"lr": learning_rate, "bsize": batch_size},
                {
                    "accuracy": float(sum(accuracies) / len(accuracies)),
                    "loss": float(sum(losses) / len(losses)),
                },
            )
        finally:
            # Close this run's writer even if training fails, before the next
            # combination opens a new one.
            writer.close()

<div class="wy-nav-content-img">
    <img src="assets/Tensorboard_hparams.png" width="1000px" alt="超参的实验记录对比">
    <p> 超参的实验记录对比 </p>
</div>

<div class="wy-nav-content-img">
    <img src="assets/Tensorboard_parallel_coord_view.png" width="800px" alt="不同超参数下对应的测试指标的平行坐标视图">
    <p> 不同超参数下对应的测试指标的平行坐标视图 </p>
</div>

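## 分享训练过程

通过下面的命令可以将 `runs` 目录下的日志上传到 TensorBoard.dev，并生成可分享的链接（注意：TensorBoard.dev 服务已于 2024 年初停止，该命令仅适用于旧版本的 TensorBoard）。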

```
tensorboard dev upload --logdir runs --name "Flower dataset classification experiment" 
```