![title](./picture/GooLeNet.jpg)
## Inception Module
#### 在构造神经网络时，有一些超参数很难选(如 kernel_size)，不知道哪种卷积好用，所以GoogLeNet就在一个块里，将这几种卷积都用一遍，
#### 然后把他们结果摞到一起，如果结果3×3的卷积好用，那么这条分支的权重就变得比较大，而其他分支的权重就会变小。
#### 总之，就是提供几种候选的卷积神经网络的配置，然后通过训练在这几条分支中自动的找到最优的卷积组合。
### 注意:
#### (1) 每条分支出来的图像 w×h 必须相同才能拼接，为了满足这一点，在Conv中可以设置padding，在Pooling中可以设置stride和padding
#### (2) 1×1卷积是做信息融合，最主要的作用就是改变通道的数量，(因为c×w×h的图像经过一个1×1的卷积，得到一个1×w×h的图像，
#### 那么通过m个1×1的卷积就能得到m×w×h的图像)从而减少运算量，具体见下图：(在时间效率上可以提高10倍左右!)
![title](./picture/GooLeNet0.jpg)

In [1]:
import torch
from torchvision import transforms
from torchvision import datasets
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn

In [2]:
# Number of samples per mini-batch, shared by the train and test loaders.
batch_size = 64
# Convert each image to a float tensor, then standardise it with the MNIST
# mean (0.1307) and standard deviation (0.3081). The std was previously
# written as 0.3801, a digit transposition of the intended value.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

In [3]:
# MNIST training split. download=False means the files must already be
# present under ./dataset/mnist/.
train_dataset = datasets.MNIST(
    root='./dataset/mnist/',
    train=True,
    transform=transform,
    download=False,
)

# Shuffled mini-batches for training.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [4]:
# MNIST test split, read from the same local directory (no download).
test_dataset = datasets.MNIST(
    root='./dataset/mnist/',
    train=False,
    transform=transform,
    download=False,
)

# Evaluation batches are served in dataset order (no shuffling).
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [2]:
%%html
<img src='./picture/CNN7.jpg' width=800>
<img src='./picture/CNN8.jpg' width=800>
# outputs = [branch1x1, branch5x5, branch3x3, branch_pool]
# return torch.cat(outputs, dim=1)

In [6]:
class InceptionA(nn.Module):
    """Inception block: four parallel branches concatenated along channels.

    Every branch keeps the spatial size of its input (1x1 convolutions,
    padded 3x3 / 5x5 convolutions, and a stride-1 padded average pool), so
    the branch outputs can be stacked on the channel axis. The result has
    16 + 24 + 24 + 24 = 88 channels whatever ``in_channels`` is.

    Args:
        in_channels: number of channels of the incoming feature map.
    """

    def __init__(self, in_channels):
        super().__init__()
        # Branch 1: a single 1x1 convolution.
        self.branch1x1 = nn.Conv2d(in_channels, 16, kernel_size=1)

        # Branch 2: 1x1 channel reduction followed by a 5x5 convolution
        # (padding=2 keeps width and height unchanged).
        self.branch5x5_1 = nn.Conv2d(in_channels, 16, kernel_size=1)
        self.branch5x5_2 = nn.Conv2d(16, 24, kernel_size=5, padding=2)

        # Branch 3: 1x1 channel reduction followed by two 3x3 convolutions
        # (padding=1 keeps width and height unchanged).
        self.branch3x3_1 = nn.Conv2d(in_channels, 16, kernel_size=1)
        self.branch3x3_2 = nn.Conv2d(16, 24, kernel_size=3, padding=1)
        self.branch3x3_3 = nn.Conv2d(24, 24, kernel_size=3, padding=1)

        # Branch 4: 1x1 convolution applied after the average pool in forward().
        self.branch_pool = nn.Conv2d(in_channels, 24, kernel_size=1)

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate them on dim=1."""
        out_1x1 = self.branch1x1(x)

        out_5x5 = self.branch5x5_2(self.branch5x5_1(x))

        out_3x3 = self.branch3x3_1(x)
        for conv in (self.branch3x3_2, self.branch3x3_3):
            out_3x3 = conv(out_3x3)

        # Pooling layer of the pooling branch; stride=1 with padding=1 keeps
        # the spatial size so the result can be concatenated with the others.
        pooled = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
        out_pool = self.branch_pool(pooled)

        # Tensors are laid out as (b, c, w, h), so stacking happens along the
        # channel dimension, dim=1.
        return torch.cat((out_1x1, out_5x5, out_3x3, out_pool), dim=1)

In [7]:
class Net(nn.Module):
    """Small CNN with two InceptionA blocks and a 10-way linear classifier.

    The shape comments in ``forward`` assume inputs of size
    [batch, 1, 28, 28]; the hard-coded 1408 input features of ``fc``
    (88 channels * 4 * 4) only match that input size.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        # 88 input channels = output channels of an InceptionA block.
        self.conv2 = nn.Conv2d(88, 20, kernel_size=5)

        self.incep1 = InceptionA(in_channels=10)
        self.incep2 = InceptionA(in_channels=20)

        self.mp = nn.MaxPool2d(2)
        self.fc = nn.Linear(1408, 10)

    def forward(self, x):
        """Return the 10 class scores for a batch ``x`` of [batch, 1, 28, 28]."""
        batch = x.size(0)

        # conv1 (5x5, no padding) then 2x2 max-pool:
        # [batch, 10, 12, 12], where 12 = (28 - 4) / 2.
        feats = self.conv1(x)
        feats = F.relu(self.mp(feats))

        # First inception block: [batch, 88, 12, 12].
        feats = self.incep1(feats)

        # conv2 (5x5, no padding) then 2x2 max-pool:
        # [batch, 20, 4, 4], where 4 = (12 - 4) / 2.
        feats = self.conv2(feats)
        feats = F.relu(self.mp(feats))

        # Second inception block: [batch, 88, 4, 4].
        feats = self.incep2(feats)

        # Flatten to [batch, 1408] for the linear classifier.
        flat = feats.view(batch, -1)
        return self.fc(flat)

In [8]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = Net()
# Move the parameters to the chosen device; kept as the last expression so
# the notebook echoes the module structure.
model.to(device)

Net(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(88, 20, kernel_size=(5, 5), stride=(1, 1))
  (incep1): InceptionA(
    (branch1x1): Conv2d(10, 16, kernel_size=(1, 1), stride=(1, 1))
    (branch5x5_1): Conv2d(10, 16, kernel_size=(1, 1), stride=(1, 1))
    (branch5x5_2): Conv2d(16, 24, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (branch3x3_1): Conv2d(10, 16, kernel_size=(1, 1), stride=(1, 1))
    (branch3x3_2): Conv2d(16, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (branch3x3_3): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (branch_pool): Conv2d(10, 24, kernel_size=(1, 1), stride=(1, 1))
  )
  (incep2): InceptionA(
    (branch1x1): Conv2d(20, 16, kernel_size=(1, 1), stride=(1, 1))
    (branch5x5_1): Conv2d(20, 16, kernel_size=(1, 1), stride=(1, 1))
    (branch5x5_2): Conv2d(16, 24, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (branch3x3_1): Conv2d(20, 16, kernel_size=(1, 1), stride=(1, 1))
   

In [9]:
# Cross-entropy loss on the class scores produced by the network.
criterion = nn.CrossEntropyLoss()
# SGD with momentum over all model parameters.
optimizer = optim.SGD(params=model.parameters(), lr=0.01, momentum=0.5)

In [10]:
def train(epoch):
    """Train the model for one pass over ``train_loader``.

    Prints the mean loss of every group of 300 consecutive batches.

    Args:
        epoch: zero-based epoch index; printed as ``epoch + 1``.
    """
    report_every = 300
    loss_sum = 0
    for step, (images, targets) in enumerate(train_loader, start=1):
        images = images.to(device)
        targets = targets.to(device)

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        loss = criterion(model(images), targets)
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        if step % report_every != 0:
            continue
        print('[%d, %5d], loss: %.3f' % (epoch + 1, step, loss_sum / report_every))
        loss_sum = 0

In [11]:
def test():
    """Evaluate the model on ``test_loader`` and print the accuracy.

    The printed percentage is truncated to an integer by the ``%d`` format.
    """
    n_correct = 0
    n_seen = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for images, targets in test_loader:
            images = images.to(device)
            targets = targets.to(device)
            scores = model(images)
            # Index of the largest score along dim=1 is the predicted class.
            predicted = torch.max(scores.data, dim=1)[1]
            n_seen += targets.size(0)
            n_correct += (predicted == targets).sum().item()
    accuracy = 100 * n_correct / n_seen
    print('Accuracy on test set: %d %% [%d/%d]' % (accuracy, n_correct, n_seen))

In [12]:
if __name__ == '__main__':
    # Ten epochs, evaluating on the test set after each one.
    for epoch in range(10):
        train(epoch)
        test()

[1,   300], loss: 1.070
[1,   600], loss: 0.203
[1,   900], loss: 0.144
Accuracy on test set: 96 % [9606/10000]
[2,   300], loss: 0.120
[2,   600], loss: 0.105
[2,   900], loss: 0.090
Accuracy on test set: 97 % [9762/10000]
[3,   300], loss: 0.084
[3,   600], loss: 0.073
[3,   900], loss: 0.079
Accuracy on test set: 98 % [9836/10000]
[4,   300], loss: 0.061
[4,   600], loss: 0.068
[4,   900], loss: 0.064
Accuracy on test set: 98 % [9830/10000]
[5,   300], loss: 0.059
[5,   600], loss: 0.054
[5,   900], loss: 0.054
Accuracy on test set: 98 % [9818/10000]
[6,   300], loss: 0.051
[6,   600], loss: 0.049
[6,   900], loss: 0.048
Accuracy on test set: 98 % [9853/10000]
[7,   300], loss: 0.043
[7,   600], loss: 0.045
[7,   900], loss: 0.049
Accuracy on test set: 98 % [9881/10000]
[8,   300], loss: 0.037
[8,   600], loss: 0.040
[8,   900], loss: 0.048
Accuracy on test set: 98 % [9878/10000]
[9,   300], loss: 0.038
[9,   600], loss: 0.039
[9,   900], loss: 0.035
Accuracy on test set: 98 % [9880