In [1]:
import torch 
from d2l import torch as d2l
from IPython import display

# Mini-batch size handed to the data-loading helper.
batch_size = 256
# d2l helper returning the Fashion-MNIST training and test iterators.
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ../data/FashionMNIST/raw/train-images-idx3-ubyte.gz


100.0%


Extracting ../data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ../data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100.0%


Extracting ../data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ../data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


100.0%


Extracting ../data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ../data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100.0%

Extracting ../data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw






In [2]:
# Number of input features per example; net() reshapes every input to this width.
num_inputs = 784
# Number of outputs (one score per class).
num_outputs = 10

# Weights sampled from a normal distribution (mean 0, std 0.01); autograd tracks them.
W = torch.normal(0, 0.01, size=(num_inputs, num_outputs), requires_grad=True)
# Biases start at zero, one per output; autograd tracks them.
b = torch.zeros(num_outputs, requires_grad=True)

In [6]:
# Toy 1-D tensor holding 1..6; Python's `float` dtype yields torch.float64 (see the output).
X  = torch.arange(1, 7, dtype=float)
X

tensor([1., 2., 3., 4., 5., 6.], dtype=torch.float64)

In [15]:
# Reshape to 2x3, then show the sums along dim 0 (per column) and dim 1 (per row).
# keepdim=True keeps the reduced dimension with size 1, so the result can broadcast against X.
X = X.reshape(2, 3)
X, X.sum(0, keepdim=True), X.sum(1, keepdim=True)

(tensor([[1., 2., 3.],
         [4., 5., 6.]], dtype=torch.float64),
 tensor([[5., 7., 9.]], dtype=torch.float64),
 tensor([[ 6.],
         [15.]], dtype=torch.float64))

In [16]:
def softmax(X):
  """Apply softmax independently to each row of a 2-D tensor.

  Args:
    X: tensor indexed as (row, column); every row is a vector of raw scores.

  Returns:
    Tensor of the same shape as X whose rows are non-negative and sum to 1.
  """
  # Softmax is invariant to adding a constant to a row, so shift each row by
  # its maximum. The largest exponent becomes exp(0) = 1, which prevents
  # exp() from overflowing to inf (and inf / inf = nan) for large scores.
  row_max = X.max(dim=1, keepdim=True).values
  X_exp = torch.exp(X - row_max)
  partition = X_exp.sum(1, keepdim=True)
  # Broadcasting divides every row by its own normalizing constant.
  return X_exp / partition

In [17]:
# Sanity check: run softmax on a random 2x5 matrix of standard-normal scores.
Y = torch.normal(0, 1, (2, 5))
Y_prob = softmax(Y)
Y_prob

tensor([[0.0746, 0.6207, 0.1210, 0.1177, 0.0660],
        [0.1424, 0.2357, 0.2937, 0.1350, 0.1931]])

In [18]:
# Every row of softmax output should sum to 1.
Y_prob.sum(1)

tensor([1.0000, 1.0000])

In [19]:
def net(X):
  """Softmax-regression forward pass using the module-level W and b.

  The input is reshaped to rows of W.shape[0] features, mapped through the
  affine transform, and normalized row-wise with softmax.
  """
  flat = X.reshape(-1, W.shape[0])
  scores = torch.matmul(flat, W) + b
  return softmax(scores)

In [24]:
# Toy example: y holds the true class index of two examples,
# y_hat holds one row of predicted class probabilities per example.
y = torch.tensor([0, 2])
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.2, 0.3, 0.5]])
# Advanced indexing selects y_hat[0, y[0]] and y_hat[1, y[1]]:
# the probability assigned to each example's true class.
y_hat[[0, 1], y]

tensor([0.1000, 0.5000])

## 交叉熵损失

- 交叉熵通常用来衡量两个概率分布之间的差异
$$
H(\mathbf{p}, \mathbf{q}) = -\sum_i p_i \log(q_i)
$$

- 将交叉熵作为损失：由于真实标签 $\mathbf{y}$ 是 one-hot 向量（只有真实类别对应的分量为 1，其余为 0），求和中只剩下一项，损失就是模型对正确类别预测概率的负对数
$$
l(\mathbf{y},\hat{\mathbf{y}}) = -\sum_i y_i\log\hat{y}_i = -\log\hat{y}_y
$$

- 其对未归一化输出 $o_i$ 的梯度是预测概率与真实概率之差
$$
\frac{\partial l(\mathbf{y},\hat{\mathbf{y}})}{\partial o_i} = \text{softmax}(\mathbf{o})_i - y_i
$$

In [29]:
def cross_entropy(y_hat, y):
  """Per-example cross-entropy loss computed from predicted probabilities.

  Args:
    y_hat: predicted probabilities, one row per example and one column per
      class; len(y_hat) is the number of examples in the batch.
    y: index of the true class for each example.

  Returns:
    Tensor with the negative log of the probability that y_hat assigns to
    each example's true class.
  """
  example_idx = range(len(y_hat))
  # Pair every row index with its label to pick the true-class probability.
  true_class_prob = y_hat[example_idx, y]
  return -torch.log(true_class_prob)

# Per-example loss for the toy y_hat / y defined earlier.
cross_entropy(y_hat, y)

tensor([2.3026, 0.6931])

In [31]:
def accuracy(y_hat, y):
  """Count the predictions that agree with the labels.

  Args:
    y_hat: either predicted class indices, or a 2-D tensor with more than one
      column, in which case the argmax of each row is taken as the prediction.
    y: true class indices.

  Returns:
    The number of correct predictions as a Python float (a count, not a
    ratio; divide by the number of examples to get the accuracy).
  """
  predicted = y_hat
  has_class_columns = len(y_hat.shape) > 1 and y_hat.shape[1] > 1
  if has_class_columns:
    # The column index of the largest entry in each row is the predicted class.
    predicted = y_hat.argmax(axis=1)
  # Cast to the label dtype so the element-wise comparison is like-for-like.
  matches = predicted.type(y.dtype) == y
  num_correct = matches.type(y.dtype).sum()
  return float(num_correct)


# Fraction of the toy examples that are classified correctly.
accuracy(y_hat, y) / len(y_hat)

0.5

In [32]:
class Accumulator:
  """Keeps running sums for n variables at once."""

  def __init__(self, n):
    # One float slot per tracked variable, all starting at zero.
    self.data = [0.0 for _ in range(n)]

  def add(self, *args):
    """Add each positional argument to the running sum in the same position."""
    updated = []
    for total, increment in zip(self.data, args):
      updated.append(total + float(increment))
    self.data = updated

  def reset(self):
    """Set every running sum back to zero, keeping the number of slots."""
    self.data = [0.0 for _ in self.data]

  def __getitem__(self, idx):
    """Return the running sum stored at position idx."""
    return self.data[idx]

def evaluate_accuracy(net, data_iter):
  """Compute the accuracy of a model over a whole dataset.

  Args:
    net: callable mapping a batch of inputs to predictions. If it is a
      torch.nn.Module it is switched to evaluation mode (and left in it).
    data_iter: iterable yielding (X, y) batches of inputs and labels.

  Returns:
    Correct predictions divided by the total number of labels seen.

  Raises:
    ZeroDivisionError: if data_iter yields no labels.
  """
  if isinstance(net, torch.nn.Module):
    net.eval()
  # Slot 0: number of correct predictions; slot 1: number of labels seen.
  metric = Accumulator(2)
  # Evaluation never calls backward(), so turn autograd off: otherwise every
  # forward pass through parameters with requires_grad=True records a graph
  # that is only thrown away, costing memory and time.
  with torch.no_grad():
    for X, y in data_iter:
      metric.add(accuracy(net(X), y), y.numel())
  return metric[0] / metric[1]
