In [1]:
import time
import torch
import torch.nn as nn 
import torch.nn.functional as F 

import d2lzh as d2l 

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class Residual(nn.Module):
    """Residual block: two 3x3 convolutions plus a shortcut connection.

    Main path: conv3x3 -> batch norm -> ReLU -> conv3x3 -> batch norm.
    The shortcut is either the unchanged input or, when ``use_1x1conv`` is
    set, a 1x1 convolution of the input. Both paths are summed and passed
    through a final ReLU.

    Args:
        in_channels: Channel count of the input feature map.
        out_channels: Channel count produced by both 3x3 convolutions (and by
            the 1x1 shortcut convolution, if present).
        use_1x1conv: If True, project the shortcut with a 1x1 convolution
            that uses the same ``stride`` as the first 3x3 convolution.
        stride: Stride of the first 3x3 convolution and of the optional 1x1
            shortcut convolution.

    Note:
        Without the 1x1 shortcut the raw input is added to the main-path
        output, so the caller must pick ``in_channels``/``stride`` such that
        the two tensors can be added.
    """

    def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            # Projection shortcut: matches channel count and stride of the main path.
            self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride)
        else:
            # Identity shortcut.
            self.conv3 = None

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))

        # Explicit None check: do not rely on the truthiness of an nn.Module.
        if self.conv3 is not None:
            x = self.conv3(x)

        return F.relu(y + x)

# Sanity check: a Residual block built with equal channel counts, the default
# stride and no 1x1 shortcut is applied to a random batch; the cell displays
# the resulting shape.
blk = Residual(in_channels=3, out_channels=3)
x = torch.rand(4, 3, 6, 6)
blk(x).shape

torch.Size([4, 3, 6, 6])

In [3]:
# Network stem: 7x7 stride-2 convolution, batch norm, ReLU, 3x3 stride-2 max-pool.
# The layers are passed positionally so their child names remain "0".."3".
stem_layers = [
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
]
net = nn.Sequential(*stem_layers)

def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    """Stack ``num_residuals`` Residual blocks into one ``nn.Sequential`` stage.

    Unless ``first_block`` is set, the first block of the stage uses a 1x1
    shortcut convolution with stride 2 and maps ``in_channels`` to
    ``out_channels``. Every other block maps ``out_channels`` to
    ``out_channels`` with the Residual defaults.

    Args:
        in_channels: Channel count entering the stage.
        out_channels: Channel count produced by every block of the stage.
        num_residuals: Number of Residual blocks to create.
        first_block: If True, no block in the stage downsamples; requires
            ``in_channels == out_channels``.

    Returns:
        An ``nn.Sequential`` holding the blocks in order.

    Raises:
        AssertionError: If ``first_block`` is True and the channel counts differ.
    """
    if first_block:
        assert in_channels == out_channels

    def _make_block(position):
        # Only the leading block of a non-first stage changes channels/stride.
        if position == 0 and not first_block:
            return Residual(in_channels, out_channels, use_1x1conv=True, stride=2)
        return Residual(out_channels, out_channels)

    return nn.Sequential(*(_make_block(position) for position in range(num_residuals)))

# Four residual stages of two blocks each. The first stage keeps 64 channels
# (first_block=True); each later stage is built with a larger output channel
# count and, per resnet_block, a stride-2 leading block.
net.add_module("residual_block1", resnet_block(64, 64, 2, first_block=True))
net.add_module("residual_block2", resnet_block(64, 128, 2))
net.add_module("residual_block3", resnet_block(128, 256, 2))
net.add_module("residual_block4", resnet_block(256, 512, 2))

# Head: global average pooling, then flatten and a 10-way linear layer.
net.add_module("global_avg_pool", d2l.GlobalAvgPool2d())
net.add_module("fc", nn.Sequential(d2l.FlattenLayer(), nn.Linear(512, 10)))

# Trace the output shape of every top-level child with a dummy input.
# The probe runs in eval mode under no_grad so the random batch neither
# updates the BatchNorm running statistics nor records an autograd graph.
# The previous train/eval mode is restored afterwards, even on error.
was_training = net.training
net.eval()
try:
    with torch.no_grad():
        x = torch.rand(1, 1, 224, 224)
        for name, layer in net.named_children():
            x = layer(x)
            print(name," output shape:\t",x.shape)
finally:
    net.train(was_training)

0  output shape:	 torch.Size([1, 64, 112, 112])
1  output shape:	 torch.Size([1, 64, 112, 112])
2  output shape:	 torch.Size([1, 64, 112, 112])
3  output shape:	 torch.Size([1, 64, 56, 56])
residual_block1  output shape:	 torch.Size([1, 64, 56, 56])
residual_block2  output shape:	 torch.Size([1, 128, 28, 28])
residual_block3  output shape:	 torch.Size([1, 256, 14, 14])
residual_block4  output shape:	 torch.Size([1, 512, 7, 7])
global_avg_pool  output shape:	 torch.Size([1, 512, 1, 1])
fc  output shape:	 torch.Size([1, 10])


In [4]:
# Data: Fashion-MNIST iterators from the d2l helper.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Optimisation hyper-parameters.
lr = 0.001
num_epochs = 5

# Adam over all network parameters, then the d2l training loop.
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.4373, train acc 0.839,test acc 0.869,time 29.4
epoch 2, loss 0.1500, train acc 0.889,test acc 0.848,time 23.7
epoch 3, loss 0.0856, train acc 0.905,test acc 0.894,time 23.8
epoch 4, loss 0.0580, train acc 0.913,test acc 0.848,time 23.9
epoch 5, loss 0.0417, train acc 0.922,test acc 0.878,time 23.9
