# **卷积神经网络进阶——Lenet5和ResNet**

## 1. Lenet

LeNet5 [1] 诞生于 1994 年，是最早的卷积神经网络之一，并且推动了深度学习领域的发展。

核心思想:图像的特征分布在整张图像上，带有可学习参数的卷积可以用少量的参数在多个位置上有效地提取相似特征。和将所有像素作为一个大型多层神经网络的单独输入不同, LeNet5认为图像数据具有很强的空间相关性，如果使用图像中独立的像素作为输入特征则无法利用时空相关性。

[1] LeCun Y, Bottou L, Bengio Y, et al. Gradient-based learning applied to document recognition[J]. Proceedings of the IEEE, 1998, 86(11): 2278-2324.

### 1.1 Lenet5网络结构展示

<img src="./picture/lenet5_1.png" width=800>

LeNet-5是一个较简单的卷积神经网络。下图显示了其结构：输入的二维图像，先经过两次卷积层到池化层，再经过全连接层，最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。

输入的二维图像，先经过两次卷积层到池化层，再经过全连接层，最后使用softmax分类作为输出层。

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv2(x)
        x = F.relu(F.max_pool2d(self.conv2_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

lenet = LeNet()
lenet

LeNet(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (conv2_drop): Dropout2d(p=0.5, inplace=False)
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
)

### 1.2 准备MNIST数据集

In [25]:
import torch.utils.data as Data
import torchvision
import matplotlib.pyplot as plt

# Hyper Parameters
EPOCH = 1           
BATCH_SIZE = 50
LR = 0.001 
DOWNLOAD_MNIST = True

# Mnist 手写数字
train_data = torchvision.datasets.MNIST(
    root='./data/', 
    train=True,  # this is training data
    transform=torchvision.transforms.ToTensor(),
    download=DOWNLOAD_MNIST,
)

test_data = torchvision.datasets.MNIST(root='./data/', train=False)

# 批训练 50samples, 1 channel, 28x28 (50, 1, 28, 28)
train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

# 测试数据集
test_x = torch.unsqueeze(test_data.data, dim=1).type(torch.FloatTensor)/255. 
test_y = test_data.targets

### 1.3 选择优化方式和定义损失函数

In [22]:
optimizer = torch.optim.Adam(lenet.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

### 1.4 模型训练

In [23]:
# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = lenet(b_x)
        loss = loss_func(output, b_y)
        optimizer.zero_grad() 
        loss.backward() 
        optimizer.step()
        
        if step % 200 == 0:
            print(step, "\\ 1200: ", loss)
        
        if step == len(train_loader) - 1:
            print(1200, "\\ 1200: ", loss)

0 \ 1200:  tensor(2.3146, grad_fn=<NllLossBackward>)
200 \ 1200:  tensor(0.7246, grad_fn=<NllLossBackward>)
400 \ 1200:  tensor(0.4504, grad_fn=<NllLossBackward>)
600 \ 1200:  tensor(0.4945, grad_fn=<NllLossBackward>)
800 \ 1200:  tensor(0.2684, grad_fn=<NllLossBackward>)
1000 \ 1200:  tensor(0.3877, grad_fn=<NllLossBackward>)
1200 \ 1200:  tensor(0.1223, grad_fn=<NllLossBackward>)


### 1.5 模型测试

In [26]:
test_output = lenet(test_x)
pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
print(pred_y[:10], 'prediction number')
print(test_y[:10].numpy(), 'real number')

accuracy = 0
for i in range(len(test_y)):
    if pred_y[i] == test_y[i]:
        accuracy += 1
accuracy /= len(test_y)
print("The accuracy: ", accuracy)

[7 2 1 0 4 1 4 9 5 9] prediction number
[7 2 1 0 4 1 4 9 5 9] real number
The accuracy:  0.9148


## 2. 模型调参

In [101]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data
import torchvision
import matplotlib.pyplot as plt

# Mnist 手写数字
train_data = torchvision.datasets.MNIST(
    root='./data/', 
    train=True,  # this is training data
    transform=torchvision.transforms.ToTensor(),   
    download=False,
)

# 测试数据集
test_data = torchvision.datasets.MNIST(root='./data/', train=False)
test_x = torch.unsqueeze(test_data.data, dim=1).type(torch.FloatTensor)/255. 
test_y = test_data.targets

def get_accuracy(pred_y, test_y):
    accuracy = 0
    for i in range(len(test_y)):
        if pred_y[i] == test_y[i]:
            accuracy += 1
    accuracy /= len(test_y)
    return accuracy
    

### 2.1 模型整合

In [102]:
EPOCH = 1     
LR = 0.001
BATCH_SIZE = 128

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv2(x)
        x = F.relu(F.max_pool2d(self.conv2_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = Net()
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("{} \\ {}: {}".format(step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("{} \\ {}: {}".format(step, len(train_loader), loss))
            
    if epoch % 5 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

0 \ 469: 2.3161559104919434
100 \ 469: 0.8792778849601746
200 \ 469: 0.5495938658714294
300 \ 469: 0.6191794276237488
400 \ 469: 0.32914307713508606
468 \ 469: 0.6522848010063171
The accuracy:  0.8816


### 2.2 学习率的影响(lr = 0.5, 0.1, 0.01, 0.001, 0.0001)

In [50]:
EPOCH = 1     
LR = 0.5
BATCH_SIZE = 128

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv2(x)
        x = F.relu(F.max_pool2d(self.conv2_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = Net()
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

print("Experiment, LR: {}".format(LR))

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("{} \\ {}: {}".format(step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("{} \\ {}: {}".format(step, len(train_loader), loss))
            
    if epoch % 5 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

Experiment, LR: 0.5
0 \ 469: 2.294623374938965
100 \ 469: 2.305206060409546
200 \ 469: 2.293544292449951
300 \ 469: 2.3251876831054688
400 \ 469: 2.2903807163238525
468 \ 469: 2.3408987522125244
The accuracy:  0.1032


In [51]:
EPOCH = 1     
LR = 0.1
BATCH_SIZE = 128

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv2(x)
        x = F.relu(F.max_pool2d(self.conv2_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = Net()
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

print("Experiment, LR: {}".format(LR))

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("{} \\ {}: {}".format(step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("{} \\ {}: {}".format(step, len(train_loader), loss))
            
    if epoch % 5 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

Experiment, LR: 0.1
0 \ 469: 2.3095030784606934
100 \ 469: 2.318173885345459
200 \ 469: 2.3053905963897705
300 \ 469: 2.293431282043457
400 \ 469: 2.3145930767059326
468 \ 469: 2.293591260910034
The accuracy:  0.1135


In [52]:
EPOCH = 1     
LR = 0.01
BATCH_SIZE = 128

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv2(x)
        x = F.relu(F.max_pool2d(self.conv2_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = Net()
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

print("Experiment, LR: {}".format(LR))

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("{} \\ {}: {}".format(step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("{} \\ {}: {}".format(step, len(train_loader), loss))
            
    if epoch % 5 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

Experiment, LR: 0.01
0 \ 469: 2.3097434043884277
100 \ 469: 0.5020248889923096
200 \ 469: 0.4101569950580597
300 \ 469: 0.43195873498916626
400 \ 469: 0.33610889315605164
468 \ 469: 0.39019644260406494
The accuracy:  0.9141


In [53]:
EPOCH = 1     
LR = 0.001
BATCH_SIZE = 128

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv2(x)
        x = F.relu(F.max_pool2d(self.conv2_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = Net()
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

print("Experiment, LR: {}".format(LR))

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("{} \\ {}: {}".format(step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("{} \\ {}: {}".format(step, len(train_loader), loss))
            
    if epoch % 5 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

Experiment, LR: 0.001
0 \ 469: 2.3091623783111572
100 \ 469: 0.8119509816169739
200 \ 469: 0.5555205941200256
300 \ 469: 0.49830445647239685
400 \ 469: 0.27350348234176636
468 \ 469: 0.44896045327186584
The accuracy:  0.9019


In [54]:
EPOCH = 1     
LR = 0.0001
BATCH_SIZE = 128

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv2(x)
        x = F.relu(F.max_pool2d(self.conv2_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = Net()
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("{} \\ {}: {}".format(step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("{} \\ {}: {}".format(step, len(train_loader), loss))
            
    if epoch % 5 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

0 \ 469: 2.3173182010650635
100 \ 469: 2.2582955360412598
200 \ 469: 2.012465715408325
300 \ 469: 1.549019455909729
400 \ 469: 1.2246626615524292
468 \ 469: 1.1059397459030151
The accuracy:  0.6777


### 2.3 算法收敛(Epoch) (LR=0.0001)

In [57]:
EPOCH = 5     
LR = 0.0001
BATCH_SIZE = 128

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv2(x)
        x = F.relu(F.max_pool2d(self.conv2_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = Net()
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
            
    if epoch % 1 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

epoch 0, batch 0 \ 469: 2.31659197807312
epoch 0, batch 100 \ 469: 2.2423763275146484
epoch 0, batch 200 \ 469: 1.9429116249084473
epoch 0, batch 300 \ 469: 1.4636590480804443
epoch 0, batch 400 \ 469: 1.0838598012924194
epoch 0, batch 468 \ 469: 0.9720259308815002
The accuracy:  0.6928
epoch 1, batch 0 \ 469: 0.8370915651321411
epoch 1, batch 100 \ 469: 0.9007120728492737
epoch 1, batch 200 \ 469: 0.8205913305282593
epoch 1, batch 300 \ 469: 0.8055069446563721
epoch 1, batch 400 \ 469: 0.6228863596916199
epoch 1, batch 468 \ 469: 0.8032221794128418
The accuracy:  0.7974
epoch 2, batch 0 \ 469: 0.752005398273468
epoch 2, batch 100 \ 469: 0.5976746082305908
epoch 2, batch 200 \ 469: 0.7879090309143066
epoch 2, batch 300 \ 469: 0.5923303365707397
epoch 2, batch 400 \ 469: 0.5047240257263184
epoch 2, batch 468 \ 469: 0.4320560395717621
The accuracy:  0.835
epoch 3, batch 0 \ 469: 0.6913126111030579
epoch 3, batch 100 \ 469: 0.609498143196106
epoch 3, batch 200 \ 469: 0.4541616141796112
ep

In [58]:
# training and testing
for epoch in range(5, 5+EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
            
    if epoch % 1 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)

epoch 5, batch 0 \ 469: 0.5440018177032471
epoch 5, batch 100 \ 469: 0.3162383735179901
epoch 5, batch 200 \ 469: 0.3551766574382782
epoch 5, batch 300 \ 469: 0.41187551617622375
epoch 5, batch 400 \ 469: 0.37308067083358765
epoch 5, batch 468 \ 469: 0.3674497902393341
The accuracy:  0.8833
epoch 6, batch 0 \ 469: 0.38901907205581665
epoch 6, batch 100 \ 469: 0.3291124999523163
epoch 6, batch 200 \ 469: 0.33019712567329407
epoch 6, batch 300 \ 469: 0.5645003914833069
epoch 6, batch 400 \ 469: 0.3658604919910431
epoch 6, batch 468 \ 469: 0.2446144074201584
The accuracy:  0.8985
epoch 7, batch 0 \ 469: 0.3567560613155365
epoch 7, batch 100 \ 469: 0.44791051745414734
epoch 7, batch 200 \ 469: 0.27363121509552
epoch 7, batch 300 \ 469: 0.3486737906932831
epoch 7, batch 400 \ 469: 0.36492839455604553
epoch 7, batch 468 \ 469: 0.31886303424835205
The accuracy:  0.8946
epoch 8, batch 0 \ 469: 0.2899123728275299
epoch 8, batch 100 \ 469: 0.3328332304954529
epoch 8, batch 200 \ 469: 0.437745571

In [59]:
# training and testing
for epoch in range(10, 10+EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
            
    if epoch % 1 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)

epoch 10, batch 0 \ 469: 0.2332538366317749
epoch 10, batch 100 \ 469: 0.31788504123687744
epoch 10, batch 200 \ 469: 0.2959536612033844
epoch 10, batch 300 \ 469: 0.3131331503391266
epoch 10, batch 400 \ 469: 0.34628593921661377
epoch 10, batch 468 \ 469: 0.4246891438961029
The accuracy:  0.912
epoch 11, batch 0 \ 469: 0.3116169273853302
epoch 11, batch 100 \ 469: 0.2837075889110565
epoch 11, batch 200 \ 469: 0.20258238911628723
epoch 11, batch 300 \ 469: 0.3190847337245941
epoch 11, batch 400 \ 469: 0.20166929066181183
epoch 11, batch 468 \ 469: 0.1830863207578659
The accuracy:  0.9105
epoch 12, batch 0 \ 469: 0.39048275351524353
epoch 12, batch 100 \ 469: 0.47522982954978943
epoch 12, batch 200 \ 469: 0.25806066393852234
epoch 12, batch 300 \ 469: 0.31582459807395935
epoch 12, batch 400 \ 469: 0.4334928095340729
epoch 12, batch 468 \ 469: 0.28257814049720764
The accuracy:  0.9197
epoch 13, batch 0 \ 469: 0.2330884486436844
epoch 13, batch 100 \ 469: 0.2094266265630722
epoch 13, batc

#### 2.3.1 测试不同学习率的收敛情况 (LR=0.01)

In [61]:
EPOCH = 10     
LR = 0.01
BATCH_SIZE = 128

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv2(x)
        x = F.relu(F.max_pool2d(self.conv2_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = Net()
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
            
    if epoch % 1 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

epoch 0, batch 0 \ 469: 2.3020200729370117
epoch 0, batch 100 \ 469: 0.5564085841178894
epoch 0, batch 200 \ 469: 0.4021049737930298
epoch 0, batch 300 \ 469: 0.39024990797042847
epoch 0, batch 400 \ 469: 0.4447093904018402
epoch 0, batch 468 \ 469: 0.33565524220466614
The accuracy:  0.9085
epoch 1, batch 0 \ 469: 0.25280675292015076
epoch 1, batch 100 \ 469: 0.3594478368759155
epoch 1, batch 200 \ 469: 0.3745402693748474
epoch 1, batch 300 \ 469: 0.20888279378414154
epoch 1, batch 400 \ 469: 0.19976621866226196
epoch 1, batch 468 \ 469: 0.3292265236377716
The accuracy:  0.9251
epoch 2, batch 0 \ 469: 0.17992641031742096
epoch 2, batch 100 \ 469: 0.2013881802558899
epoch 2, batch 200 \ 469: 0.22156307101249695
epoch 2, batch 300 \ 469: 0.19721288979053497
epoch 2, batch 400 \ 469: 0.13821081817150116
epoch 2, batch 468 \ 469: 0.23379887640476227
The accuracy:  0.9294
epoch 3, batch 0 \ 469: 0.22159135341644287
epoch 3, batch 100 \ 469: 0.12662054598331451
epoch 3, batch 200 \ 469: 0.18

### 2.4 Batch Size的影响

In [63]:
EPOCH = 5    
LR = 0.01
BATCH_SIZE = 50

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv2(x)
        x = F.relu(F.max_pool2d(self.conv2_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = Net()
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

print("Experiment, LR: {}, Batch_Size: {}".format(LR, BATCH_SIZE))

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
            
    if epoch % 1 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

Experiment, LR: 0.01, Batch_Size: 50
epoch 0, batch 0 \ 1200: 2.3033225536346436
epoch 0, batch 100 \ 1200: 0.4699312448501587
epoch 0, batch 200 \ 1200: 0.3083084523677826
epoch 0, batch 300 \ 1200: 0.6657001972198486
epoch 0, batch 400 \ 1200: 0.7186720967292786
epoch 0, batch 500 \ 1200: 0.4065611660480499
epoch 0, batch 600 \ 1200: 0.46238765120506287
epoch 0, batch 700 \ 1200: 0.21865147352218628
epoch 0, batch 800 \ 1200: 0.258233904838562
epoch 0, batch 900 \ 1200: 0.2459692358970642
epoch 0, batch 1000 \ 1200: 0.3176041543483734
epoch 0, batch 1100 \ 1200: 0.20567651093006134
epoch 0, batch 1199 \ 1200: 0.2819063663482666
The accuracy:  0.8879
epoch 1, batch 0 \ 1200: 0.4628733694553375
epoch 1, batch 100 \ 1200: 0.29479798674583435
epoch 1, batch 200 \ 1200: 0.78133225440979
epoch 1, batch 300 \ 1200: 0.43220385909080505
epoch 1, batch 400 \ 1200: 0.7156745195388794
epoch 1, batch 500 \ 1200: 0.3404513895511627
epoch 1, batch 600 \ 1200: 0.3325895071029663
epoch 1, batch 700 \

In [64]:
EPOCH = 5    
LR = 0.01
BATCH_SIZE = 500

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv2(x)
        x = F.relu(F.max_pool2d(self.conv2_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = Net()
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

print("Experiment, LR: {}, Batch_Size: {}".format(LR, BATCH_SIZE))

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
            
    if epoch % 1 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

Experiment, LR: 0.01, Batch_Size: 500
epoch 0, batch 0 \ 120: 2.3072257041931152
epoch 0, batch 100 \ 120: 0.380398690700531
epoch 0, batch 119 \ 120: 0.3800472617149353
The accuracy:  0.905
epoch 1, batch 0 \ 120: 0.30659493803977966
epoch 1, batch 100 \ 120: 0.31294992566108704
epoch 1, batch 119 \ 120: 0.22828170657157898
The accuracy:  0.9295
epoch 2, batch 0 \ 120: 0.2730249762535095
epoch 2, batch 100 \ 120: 0.2213660478591919
epoch 2, batch 119 \ 120: 0.19054743647575378
The accuracy:  0.9393
epoch 3, batch 0 \ 120: 0.20591777563095093
epoch 3, batch 100 \ 120: 0.22176162898540497
epoch 3, batch 119 \ 120: 0.22029085457324982
The accuracy:  0.9474
epoch 4, batch 0 \ 120: 0.21365687251091003
epoch 4, batch 100 \ 120: 0.15967237949371338
epoch 4, batch 119 \ 120: 0.24157488346099854
The accuracy:  0.9425


## Great!

In [68]:
EPOCH = 10
LR = 0.001
BATCH_SIZE = 50

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv2(x)
        x = F.relu(F.max_pool2d(self.conv2_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = Net()
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

print("Experiment, LR: {}, Batch_Size: {}".format(LR, BATCH_SIZE))

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
            
    if epoch % 1 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

Experiment, LR: 0.001, Batch_Size: 50
epoch 0, batch 0 \ 1200: 2.3056812286376953
epoch 0, batch 100 \ 1200: 0.9485573768615723
epoch 0, batch 200 \ 1200: 0.4968200922012329
epoch 0, batch 300 \ 1200: 0.5669557452201843
epoch 0, batch 400 \ 1200: 0.40853461623191833
epoch 0, batch 500 \ 1200: 0.36985889077186584
epoch 0, batch 600 \ 1200: 0.3702736794948578
epoch 0, batch 700 \ 1200: 0.5365966558456421
epoch 0, batch 800 \ 1200: 0.24757924675941467
epoch 0, batch 900 \ 1200: 0.3024618625640869
epoch 0, batch 1000 \ 1200: 0.2053707093000412
epoch 0, batch 1100 \ 1200: 0.40047574043273926
epoch 0, batch 1199 \ 1200: 0.4163241982460022
The accuracy:  0.9162
epoch 1, batch 0 \ 1200: 0.23867452144622803
epoch 1, batch 100 \ 1200: 0.27230626344680786
epoch 1, batch 200 \ 1200: 0.16738730669021606
epoch 1, batch 300 \ 1200: 0.16350458562374115
epoch 1, batch 400 \ 1200: 0.4388209879398346
epoch 1, batch 500 \ 1200: 0.35524508357048035
epoch 1, batch 600 \ 1200: 0.0945853441953659
epoch 1, bat

In [67]:
EPOCH = 10   
LR = 0.001
BATCH_SIZE = 500

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv2(x)
        x = F.relu(F.max_pool2d(self.conv2_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = Net()
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

print("Experiment, LR: {}, Batch_Size: {}".format(LR, BATCH_SIZE))

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
            
    if epoch % 1 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

Experiment, LR: 0.001, Batch_Size: 500
epoch 0, batch 0 \ 120: 2.310904026031494
epoch 0, batch 100 \ 120: 0.620497465133667
epoch 0, batch 119 \ 120: 0.5376012921333313
The accuracy:  0.8399
epoch 1, batch 0 \ 120: 0.5342500805854797
epoch 1, batch 100 \ 120: 0.4057137370109558
epoch 1, batch 119 \ 120: 0.43442302942276
The accuracy:  0.8925
epoch 2, batch 0 \ 120: 0.45172548294067383
epoch 2, batch 100 \ 120: 0.30685344338417053
epoch 2, batch 119 \ 120: 0.24363884329795837
The accuracy:  0.9118
epoch 3, batch 0 \ 120: 0.29189205169677734
epoch 3, batch 100 \ 120: 0.2508179843425751
epoch 3, batch 119 \ 120: 0.32550522685050964
The accuracy:  0.9271
epoch 4, batch 0 \ 120: 0.27159178256988525
epoch 4, batch 100 \ 120: 0.2723270654678345
epoch 4, batch 119 \ 120: 0.25161752104759216
The accuracy:  0.9343
epoch 5, batch 0 \ 120: 0.2553897202014923
epoch 5, batch 100 \ 120: 0.26446428894996643
epoch 5, batch 119 \ 120: 0.2410522848367691
The accuracy:  0.9359
epoch 6, batch 0 \ 120: 0.2

### 2.5 加深网络

In [77]:
EPOCH = 10
LR = 0.001
BATCH_SIZE = 50

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class DeepNet(nn.Module):
    def __init__(self):
        super(DeepNet, self).__init__()

        self.convfirst = nn.Conv2d(1, 10, kernel_size=5)
        
        self.conv1 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)

        self.convfinal = nn.Conv2d(10, 20, kernel_size=5)
        self.convfinal_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.convfirst(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv1(x)
        x = F.relu(x)
        
        x = self.conv2(x)
        x = F.relu(x)
        
        x = self.conv3(x)
        x = F.relu(x)
        
        x = self.conv4(x)
        x = F.relu(x)
        
        x = self.conv5(x)
        x = F.relu(x)
        
        x = self.convfinal(x)
        x = F.relu(F.max_pool2d(self.convfinal_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = DeepNet()
print(net)
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

print("Deep+5, LR: {}, Batch_Size: {}".format(LR, BATCH_SIZE))

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
            
    if epoch % 1 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

DeepNet(
  (convfirst): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv1): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv5): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (convfinal): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (convfinal_drop): Dropout2d(p=0.5, inplace=False)
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
)
Deep+5, LR: 0.001, Batch_Size: 50
epoch 0, batch 0 \ 1200: 2.2912659645080566
epoch 0, batch 100 \ 1200: 1.5827502012252808
epoch 0, batch 200 \ 1200: 1.0588760375976562
epoch 0, batch 300 \ 1200: 0.4336460530757904
epoch 0, batch 400 \ 1200: 0.6408582329750061
epoch 0, batch 500 \ 1200: 0.46886405348

In [78]:
EPOCH = 10
LR = 0.001
BATCH_SIZE = 50

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class DeepNet(nn.Module):
    def __init__(self):
        super(DeepNet, self).__init__()

        self.convfirst = nn.Conv2d(1, 10, kernel_size=5)
        
        self.conv1 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv6 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv7 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv8 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv9 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv10 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)

        self.convfinal = nn.Conv2d(10, 20, kernel_size=5)
        self.convfinal_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.convfirst(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv1(x)
        x = F.relu(x)
        
        x = self.conv2(x)
        x = F.relu(x)
        
        x = self.conv3(x)
        x = F.relu(x)
        
        x = self.conv4(x)
        x = F.relu(x)
        
        x = self.conv5(x)
        x = F.relu(x)
        
        x = self.conv6(x)
        x = F.relu(x)
        
        x = self.conv7(x)
        x = F.relu(x)
        
        x = self.conv8(x)
        x = F.relu(x)
        
        x = self.conv9(x)
        x = F.relu(x)
        
        x = self.conv10(x)
        x = F.relu(x)
        
        x = self.convfinal(x)
        x = F.relu(F.max_pool2d(self.convfinal_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = DeepNet()
print(net)
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

print("Deep+10, LR: {}, Batch_Size: {}".format(LR, BATCH_SIZE))

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
            
    if epoch % 1 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

DeepNet(
  (convfirst): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv1): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv5): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv6): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv7): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv8): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv9): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv10): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (convfinal): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (convfinal_drop): Dropout2d(p=0.5, inplace=False)
  (fc1): Linear(in_features=320, out_fe

In [80]:
EPOCH = 3
LR = 0.001
BATCH_SIZE = 50

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class DeepNet(nn.Module):
    def __init__(self):
        super(DeepNet, self).__init__()

        self.convfirst = nn.Conv2d(1, 10, kernel_size=5)
        
        self.conv1 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv6 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv7 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv8 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv9 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv10 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv11 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv12 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv13 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv14 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv15 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv16 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv17 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv18 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv19 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv20 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)

        self.convfinal = nn.Conv2d(10, 20, kernel_size=5)
        self.convfinal_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.convfirst(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv1(x)
        x = F.relu(x)
        
        x = self.conv2(x)
        x = F.relu(x)
        
        x = self.conv3(x)
        x = F.relu(x)
        
        x = self.conv4(x)
        x = F.relu(x)
        
        x = self.conv5(x)
        x = F.relu(x)
        
        x = self.conv6(x)
        x = F.relu(x)
        
        x = self.conv7(x)
        x = F.relu(x)
        
        x = self.conv8(x)
        x = F.relu(x)
        
        x = self.conv9(x)
        x = F.relu(x)
        
        x = self.conv10(x)
        x = F.relu(x)
        
        x = self.conv11(x)
        x = F.relu(x)
        
        x = self.conv12(x)
        x = F.relu(x)
        
        x = self.conv13(x)
        x = F.relu(x)
        
        x = self.conv14(x)
        x = F.relu(x)
        
        x = self.conv15(x)
        x = F.relu(x)
        
        x = self.conv16(x)
        x = F.relu(x)
        
        x = self.conv17(x)
        x = F.relu(x)
        
        x = self.conv18(x)
        x = F.relu(x)
        
        x = self.conv19(x)
        x = F.relu(x)
        
        x = self.conv20(x)
        x = F.relu(x)
        
        x = self.convfinal(x)
        x = F.relu(F.max_pool2d(self.convfinal_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = DeepNet()
print(net)
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

print("Deep+20, LR: {}, Batch_Size: {}".format(LR, BATCH_SIZE))

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
            
    if epoch % 1 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

DeepNet(
  (convfirst): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv1): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv5): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv6): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv7): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv8): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv9): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv10): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv11): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv12): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))


In [104]:
EPOCH = 1
LR = 0.0001
BATCH_SIZE = 50

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class DeepNet(nn.Module):
    def __init__(self):
        super(DeepNet, self).__init__()

        self.convfirst = nn.Conv2d(1, 10, kernel_size=5)
        
        self.conv1 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv6 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv7 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv8 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv9 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv10 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv11 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv12 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv13 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv14 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv15 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv16 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv17 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv18 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv19 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv20 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)

        self.convfinal = nn.Conv2d(10, 20, kernel_size=5)
        self.convfinal_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.convfirst(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv1(x)
        x = F.relu(x)
        
        x = self.conv2(x)
        x = F.relu(x)
        
        x = self.conv3(x)
        x = F.relu(x)
        
        x = self.conv4(x)
        x = F.relu(x)
        
        x = self.conv5(x)
        x = F.relu(x)
        
        x = self.conv6(x)
        x = F.relu(x)
        
        x = self.conv7(x)
        x = F.relu(x)
        
        x = self.conv8(x)
        x = F.relu(x)
        
        x = self.conv9(x)
        x = F.relu(x)
        
        x = self.conv10(x)
        x = F.relu(x)
        
        x = self.conv11(x)
        x = F.relu(x)
        
        x = self.conv12(x)
        x = F.relu(x)
        
        x = self.conv13(x)
        x = F.relu(x)
        
        x = self.conv14(x)
        x = F.relu(x)
        
        x = self.conv15(x)
        x = F.relu(x)
        
        x = self.conv16(x)
        x = F.relu(x)
        
        x = self.conv17(x)
        x = F.relu(x)
        
        x = self.conv18(x)
        x = F.relu(x)
        
        x = self.conv19(x)
        x = F.relu(x)
        
        x = self.conv20(x)
        x = F.relu(x)
        
        x = self.convfinal(x)
        x = F.relu(F.max_pool2d(self.convfinal_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = DeepNet()
print(net)
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

print("Deep+20, LR: {}, Batch_Size: {}".format(LR, BATCH_SIZE))

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
            
    if epoch % 1 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

DeepNet(
  (convfirst): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv1): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv5): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv6): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv7): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv8): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv9): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv10): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv11): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv12): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))


In [105]:
EPOCH = 1
LR = 0.01
BATCH_SIZE = 50

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = Data.DataLoader(dataset=train_data, batch_size=50, shuffle=True)

class DeepNet(nn.Module):
    def __init__(self):
        super(DeepNet, self).__init__()

        self.convfirst = nn.Conv2d(1, 10, kernel_size=5)
        
        self.conv1 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv6 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv7 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv8 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv9 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv10 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv11 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv12 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv13 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv14 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv15 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv16 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv17 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv18 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv19 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)
        self.conv20 = nn.Conv2d(10, 10, kernel_size=3, stride=1, padding=1)

        self.convfinal = nn.Conv2d(10, 20, kernel_size=5)
        self.convfinal_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)


    def forward(self, x):
        x = self.convfirst(x)
        x = F.relu(F.max_pool2d(x, 2))
        
        x = self.conv1(x)
        x = F.relu(x)
        
        x = self.conv2(x)
        x = F.relu(x)
        
        x = self.conv3(x)
        x = F.relu(x)
        
        x = self.conv4(x)
        x = F.relu(x)
        
        x = self.conv5(x)
        x = F.relu(x)
        
        x = self.conv6(x)
        x = F.relu(x)
        
        x = self.conv7(x)
        x = F.relu(x)
        
        x = self.conv8(x)
        x = F.relu(x)
        
        x = self.conv9(x)
        x = F.relu(x)
        
        x = self.conv10(x)
        x = F.relu(x)
        
        x = self.conv11(x)
        x = F.relu(x)
        
        x = self.conv12(x)
        x = F.relu(x)
        
        x = self.conv13(x)
        x = F.relu(x)
        
        x = self.conv14(x)
        x = F.relu(x)
        
        x = self.conv15(x)
        x = F.relu(x)
        
        x = self.conv16(x)
        x = F.relu(x)
        
        x = self.conv17(x)
        x = F.relu(x)
        
        x = self.conv18(x)
        x = F.relu(x)
        
        x = self.conv19(x)
        x = F.relu(x)
        
        x = self.conv20(x)
        x = F.relu(x)
        
        x = self.convfinal(x)
        x = F.relu(F.max_pool2d(self.convfinal_drop(x), 2))

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

net = DeepNet()
print(net)
optimizer = torch.optim.Adam(net.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

print("Deep+20, LR: {}, Batch_Size: {}".format(LR, BATCH_SIZE))

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader): 
        output = net(b_x)
        loss = loss_func(output, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % 100 == 0:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
        
        if step == len(train_loader) - 1:
            print("epoch {}, batch {} \\ {}: {}".format(epoch, step, len(train_loader), loss))
            
    if epoch % 1 == 0:
        test_output = net(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        accuracy = get_accuracy(pred_y, test_y)
        print("The accuracy: ", accuracy)
        

DeepNet(
  (convfirst): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv1): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv5): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv6): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv7): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv8): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv9): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv10): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv11): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv12): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))


## 3. ResNet

### 深度网络的退化问题
从经验来看，网络的深度对模型的性能至关重要, 当增加网络层数后, 网络可以进行更加复杂的特征模式的提取, 所以当模型更深时理论上可以取得更好的结果. 

实验发现深度网络出现了退化问题(Degradation problem): 网络深度增加时, 网络准确度出现饱和, 甚至出现下降. 这个现象可以在图3中直观看出来: 56层的网络比20层网络效果还要差. 这不会是过拟合问题, 因为56层网络的训练误差同样高. 

<img src="./picture/resnet_1.png" width=800>

深度网络的退化问题至少说明深度网络不容易训练。但是我们考虑这样一个事实：现在有一个浅层网络，通过向上堆积新层来建立深层网络，一个极端情况是这些增加的层什么也不学习，仅仅复制浅层网络的特征，即这样新层是恒等映射（Identity mapping）。

在这种情况下，深层网络应该至少和浅层网络性能一样，也不应该出现退化现象。

这个有趣的假设让何博士灵感爆发，他提出了残差学习来解决退化问题。对于一个堆积层结构（几层堆积而成）当输入为 $x$ 时其学习到的特征记为$H(x)$ ，现在我们希望其可以学习到残差  $F(x)=H(x)-x$，这样其实原始的学习特征是 $F(x)+x$ 。之所以这样是因为残差学习相比原始特征直接学习更容易。当残差为0时，此时堆积层仅仅做了恒等映射，至少网络性能不会下降，实际上残差不会为0，这也会使得堆积层在输入特征基础上学习到新的特征，从而拥有更好的性能

<img src="./picture/resnet_2.png" width=400>

为什么残差学习相对更容易，从直观上看残差学习需要学习的内容少，因为残差一般会比较小，学 习难度小点。不过我们可以从数学的角度来分析这个问题，首先残差单元可以表示为：
$$
\begin{aligned}
&y_{l}=h\left(x_{l}\right)+F\left(x_{l}, W_{l}\right) \\
&x_{l+1}=f\left(y_{l}\right)
\end{aligned}
$$
其中 $x_{l}$ 和 $x_{l+1}$ 分别表示的是第 $l$ 个残差单元的输入和输出，注意每个残差单元一般包含多层结 构。 $F$ 是残差函数， 表示学习到的残差，而 $h\left(x_{l}\right)=x_{l}$ 表示恒等映射, $f$ 是ReLU激活函数。 基于上式, 我们求得从浅层 $l$ 到深层 $L$ 的学习特征为:
$$
x_{L}=x_{l}+\sum_{i=l}^{L-1} F\left(x_{i}, W_{i}\right)
$$
利用链式规则，可以求得反向过程的梯度：
$\frac{\partial l o s s}{\partial x_{l}}=\frac{\partial \operatorname{loss}}{\partial x_{L}} \cdot \frac{\partial x_{L}}{\partial x_{l}}=\frac{\partial l o s s}{\partial x_{L}} \cdot\left(1+\frac{\partial}{\partial x_{L}} \sum_{i=l}^{L-1} F\left(x_{i}, W_{i}\right)\right)$
式子的第一个因子 $\frac{\partial l o s s}{\partial x_{L}}$ 表示的损失函数到达 $L$ 的梯度，小括号中的1表明短路机制可以无损地 传播梯度，而另外一项残差梯度则需要经过带有weights的层，梯度不是直接传递过来的。残差梯 度不会那么巧全为 $-1$ ，而且就算其比较小，有1的存在也不会导致梯度消失。所以残差学习会更容 易。要注意上面的推导并不是严格的证明。

<img src="./picture/resnet_3.jpg" width=1000>

<img src="./picture/resnet_4.png" width=600>

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F


# 用于ResNet18和34的残差块，用的是2个3x3的卷积
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.shortcut = nn.Sequential()
        # 经过处理后的x要与x的维度相同(尺寸和深度)
        # 如果不相同，需要添加卷积+BN来变换为同一维度
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


# 用于ResNet50,101和152的残差块，用的是1x1+3x3+1x1的卷积
class Bottleneck(nn.Module):
    # 前面1x1和3x3卷积的filter个数相等，最后1x1卷积是其expansion倍
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion*planes,
                               kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion*planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def ResNet18():
    return ResNet(BasicBlock, [2,2,2,2])

def ResNet34():
    return ResNet(BasicBlock, [3,4,6,3])

def ResNet50():
    return ResNet(Bottleneck, [3,4,6,3])

def ResNet101():
    return ResNet(Bottleneck, [3,4,23,3])

def ResNet152():
    return ResNet(Bottleneck, [3,8,36,3])


def test():
    net = ResNet18()
    y = net(torch.randn(1,3,32,32))
    print(y.size())

### 3.1 ResNet18

In [2]:
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os, sys
import copy
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)

# Hyper-parameters
gpu_flag = True
input_size = 224
num_epochs = 3
batch_size = 50
learning_rate = 0.001

if gpu_flag:
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def get_accuracy(pred_y, test_y):
    accuracy = 0
    for i in range(len(test_y)):
        if pred_y[i] == test_y[i]:
            accuracy += 1
    accuracy /= len(test_y)
    return accuracy

PyTorch Version:  1.7.1+cu110
Torchvision Version:  0.8.2+cu110


#### 3.1.1 准备数据

In [3]:
# Image preprocessing modules

data_transforms = transforms.Compose([
    transforms.Resize(input_size),
    transforms.ToTensor()
])

# Mnist 手写数字
train_dataset = torchvision.datasets.MNIST(root='./data/',
                                             train=True, 
                                             transform=data_transforms,
                                             download=False)

test_dataset = torchvision.datasets.MNIST(root='./data/',
                                            train=False, 
                                            transform=data_transforms)

# Data loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size, 
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size, 
                                          shuffle=False)

#### 3.1.2 网络加载

In [4]:
resnet = models.resnet18(pretrained=False)
resnet = resnet.to(device)

#### 3.1.3 损失函数和优化器

In [5]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(resnet.parameters(), lr=learning_rate)

# For updating learning rate
def update_lr(optimizer, lr):    
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

# Train the model
total_step = len(train_loader)
curr_lr = learning_rate

#### 3.1.4 模型训练

In [6]:
for epoch in range(num_epochs):
    sta_time = time.time()
    for i, (images, labels) in enumerate(train_loader):
        images = images.expand(-1, 3, -1, -1)
        sta_b_time = time.time()
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = resnet(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        end_b_time = time.time()
        if (i) % 100 == 0:
            log = "Batch [{}/{}], Time {}".format(i, len(train_loader), end_b_time-sta_b_time)
            print(log)

        if (i+1) % 100 == 0:
            log = "Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}".format(epoch+1, num_epochs, i+1, total_step, loss.item())
            print (log)

    end_time = time.time()
    print("Epoch [{}/{}], Time {}"
                   .format(epoch+1, num_epochs, end_time-sta_time))
    # Decay learning rate
    if (epoch+1) % 20 == 0:
        curr_lr /= 3
        update_lr(optimizer, curr_lr)

Batch [0/1200], Time 0.09721040725708008
Epoch [1/3], Step [100/1200] Loss: 0.1729
Batch [100/1200], Time 0.03025531768798828
Epoch [1/3], Step [200/1200] Loss: 0.2664
Batch [200/1200], Time 0.03590130805969238
Epoch [1/3], Step [300/1200] Loss: 0.4628
Batch [300/1200], Time 0.03324699401855469
Epoch [1/3], Step [400/1200] Loss: 0.0248
Batch [400/1200], Time 0.03175830841064453
Epoch [1/3], Step [500/1200] Loss: 0.0463
Batch [500/1200], Time 0.0357973575592041
Epoch [1/3], Step [600/1200] Loss: 0.0752
Batch [600/1200], Time 0.036116600036621094
Epoch [1/3], Step [700/1200] Loss: 0.0326
Batch [700/1200], Time 0.031226396560668945
Epoch [1/3], Step [800/1200] Loss: 0.0412
Batch [800/1200], Time 0.031219005584716797
Epoch [1/3], Step [900/1200] Loss: 0.0213
Batch [900/1200], Time 0.040250301361083984
Epoch [1/3], Step [1000/1200] Loss: 0.0210
Batch [1000/1200], Time 0.03216266632080078
Epoch [1/3], Step [1100/1200] Loss: 0.0844
Batch [1100/1200], Time 0.03303670883178711
Epoch [1/3], Step

### 3.2 ResNet50 + Cifar 10 + Pretrain

#### 3.2.1 ResNet50模型

In [8]:
import torchvision.models as models

class ImagenetTransferLearning(nn.Module):
    def __init__(self):
        super().__init__()

        # init a pretrained resnet
        backbone = models.resnet50(pretrained=False)
        backbone.load_state_dict(torch.load('./checkpoints/resnet50-19c8e357.pth'))
        num_filters = backbone.fc.in_features
        layers = list(backbone.children())[:-1]
        self.feature_extractor = nn.Sequential(*layers)

        # use the pretrained model to classify cifar-10 (10 image classes)
        num_target_classes = 10
        self.classifier = nn.Linear(num_filters, num_target_classes)

    def forward(self, x):
        self.feature_extractor.eval()
        with torch.no_grad():
            representations = self.feature_extractor(x).flatten(1)
        x = self.classifier(representations)
        return x
model = ImagenetTransferLearning()
model = model.to(device)

#### 3.2.2 准备数据

In [10]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
num_epochs = 1
learning_rate = 0.001

# Image preprocessing modules
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(input_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(input_size),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(root='./data/',
                                             train=True, 
                                             transform=data_transforms['train'],
                                             download=False)

test_dataset = torchvision.datasets.CIFAR10(root='./data/',
                                            train=False, 
                                            transform=data_transforms['val'])

# Data loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=100, 
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=100, 
                                          shuffle=False)

#### 3.2.3 训练模型

In [11]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# For updating learning rate
def update_lr(optimizer, lr):    
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

# Train the model
total_step = len(train_loader)
curr_lr = learning_rate

for epoch in range(num_epochs):
    sta_time = time.time()
    for i, (images, labels) in enumerate(train_loader):
        sta_b_time = time.time()
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        end_b_time = time.time()
        log = "Batch [{}/{}], Time {}".format(i, len(train_loader), end_b_time-sta_b_time)
        with open('./results/log_{}.txt'.format(gpu_flag), 'a') as f:
            f.write(log + "\n")
        print(log)

        if (i+1) % 100 == 0:
            log = "Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}".format(epoch+1, num_epochs, i+1, total_step, loss.item())
            print (log)

    end_time = time.time()
    print("Epoch [{}/{}], Time {}"
                   .format(epoch+1, num_epochs, end_time-sta_time))
    # Decay learning rate
    if (epoch+1) % 20 == 0:
        curr_lr /= 3
        update_lr(optimizer, curr_lr)


Batch [0/500], Time 0.036528825759887695
Batch [1/500], Time 0.023430585861206055
Batch [2/500], Time 0.024915695190429688
Batch [3/500], Time 0.0715031623840332
Batch [4/500], Time 0.05972647666931152
Batch [5/500], Time 0.039322853088378906
Batch [6/500], Time 0.07664036750793457
Batch [7/500], Time 0.026085615158081055
Batch [8/500], Time 0.024243831634521484
Batch [9/500], Time 0.05226445198059082
Batch [10/500], Time 0.0223541259765625
Batch [11/500], Time 0.07236552238464355
Batch [12/500], Time 0.057222604751586914
Batch [13/500], Time 0.046029090881347656
Batch [14/500], Time 0.043637990951538086
Batch [15/500], Time 0.04574465751647949
Batch [16/500], Time 0.022602081298828125
Batch [17/500], Time 0.022710084915161133
Batch [18/500], Time 0.05513763427734375
Batch [19/500], Time 0.04532766342163086
Batch [20/500], Time 0.023118019104003906
Batch [21/500], Time 0.07686424255371094
Batch [22/500], Time 0.04053497314453125
Batch [23/500], Time 0.02333855628967285
Batch [24/500], 

#### 3.2.4 测试模型性能

In [12]:
# Test the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))



Accuracy of the model on the test images: 86.68 %


#### 3.2.5 保存模型

In [26]:
# Save the model checkpoint
torch.save(model.state_dict(), './checkpoints/resnet.ckpt')