In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from torch.utils.data import TensorDataset, DataLoader
import torchsummary

## **Data**
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-iamges-idx3-ubyte.gz  
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz  
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-iamges-idx3-ubyte.gz  
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-iamges-idx1-ubyte.gz  
`{root}\FashionMNIST\raw`

In [2]:
trans = transforms.Compose([transforms.Resize((96, 96)),  # upscale
                            transforms.ToTensor()])

data_train = torchvision.datasets.FashionMNIST(
    root='./data', train=True, transform=trans, download=False 
)
data_val = torchvision.datasets.FashionMNIST(
    root='./data', train=False, transform=trans, download=False
)

In [3]:
data_train

Dataset FashionMNIST
    Number of datapoints: 60000
    Root location: ./data
    Split: Train
    StandardTransform
Transform: Compose(
               Resize(size=(96, 96), interpolation=bilinear, max_size=None, antialias=warn)
               ToTensor()
           )

In [4]:
data_val

Dataset FashionMNIST
    Number of datapoints: 10000
    Root location: ./data
    Split: Test
    StandardTransform
Transform: Compose(
               Resize(size=(96, 96), interpolation=bilinear, max_size=None, antialias=warn)
               ToTensor()
           )

In [5]:
image, label = data_train[0]  # [image, label]
print(image.shape) # (channel, height, weight)
print(label)

torch.Size([1, 96, 96])
9


## **Inception Blocks**
As depicted in Fig. 8.4.1, the inception block consists of four parallel branches. The first three branches use convolutional layers with window sizes of 1 by 10, 3 by 3, and 5 by 5 to extract information from different spatial sizes. The middle two branches also add a 1 by 1 convolution of the input to reduce the number of channels, reducing the model’s complexity. The fourth branch uses a 3 by 3 max-pooling layer, followed by a 1 by 1 convolutional layer to change the number of channels. The four branches all use appropriate padding to give the input and output the same height and width. Finally, the outputs along each branch are concatenated along the channel dimension and comprise the block’s output. The commonly-tuned hyperparameters of the Inception block are the number of output channels per layer, i.e., how to allocate capacity among convolutions of different size.

In [6]:
class Inception(nn.Module):
    def __init__(self, c1, c2, c3, c4):
        super().__init__()
        
        # branch 1
        self.b1_1 = nn.LazyConv2d(out_channels=c1, kernel_size=1)

        # branch 2
        self.b2_1 = nn.LazyConv2d(out_channels=c2[0], kernel_size=1)
        self.b2_2 = nn.LazyConv2d(out_channels=c2[1], kernel_size=3, padding=1)

        # branch 3
        self.b3_1 = nn.LazyConv2d(out_channels=c3[0], kernel_size=1)
        self.b3_2 = nn.LazyConv2d(out_channels=c3[1], kernel_size=5, padding=2)

        # branch 4
        self.b4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.b4_2 = nn.LazyConv2d(out_channels=c4, kernel_size=1)

    def forward(self, X):
        b1 = F.relu(self.b1_1(X))
        b2 = F.relu(self.b2_2(F.relu(self.b2_1(X))))
        b3 = F.relu(self.b3_2(F.relu(self.b3_1(X))))
        b4 = F.relu(self.b4_2(self.b4_1(X)))
        return torch.cat([b1, b2, b3, b4], dim=1)

## **GoogleNet**

In [7]:
class GoogleNet(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()

        self.m1 = nn.Sequential(
            nn.LazyConv2d(out_channels=64, kernel_size=7, stride=2, padding=3), nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            nn.LazyConv2d(out_channels=64, kernel_size=1), nn.ReLU(),
            nn.LazyConv2d(out_channels=192, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        self.m2 = nn.Sequential(
            Inception(64, (96, 128), (16, 32), 32),
            Inception(128, (128, 192), (32, 96), 64),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        self.m3 = nn.Sequential(
            Inception(192, (96, 208), (16, 48), 64),
            Inception(160, (112, 224), (24, 64), 64),
            Inception(128, (128, 256), (24, 64), 64),
            Inception(112, (144, 288), (32, 64), 64),
            Inception(256, (160, 320), (32, 128), 128),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        self.m4 = nn.Sequential(
            Inception(256, (160, 320), (32, 128), 128),
            Inception(384, (192, 384), (48, 128), 128),
            nn.AdaptiveAvgPool2d((1,1)),
            nn.Flatten()
        )
        
        self.net = nn.Sequential(
            self.m1,
            self.m2,
            self.m3,
            self.m4,
            nn.LazyLinear(out_features=num_classes)
        )

    def forward(self, X):
        return self.net(X)

In [8]:
torchsummary.summary(GoogleNet(), input_size=(1, 96, 96))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 48, 48]           3,200
            Conv2d-2           [-1, 64, 48, 48]           3,200
              ReLU-3           [-1, 64, 48, 48]               0
              ReLU-4           [-1, 64, 48, 48]               0
         MaxPool2d-5           [-1, 64, 24, 24]               0
         MaxPool2d-6           [-1, 64, 24, 24]               0
            Conv2d-7           [-1, 64, 24, 24]           4,160
            Conv2d-8           [-1, 64, 24, 24]           4,160
              ReLU-9           [-1, 64, 24, 24]               0
             ReLU-10           [-1, 64, 24, 24]               0
           Conv2d-11          [-1, 192, 24, 24]         110,784
           Conv2d-12          [-1, 192, 24, 24]         110,784
             ReLU-13          [-1, 192, 24, 24]               0
             ReLU-14          [-1, 192,



## **Training**

In [9]:
batch_size = 128

train_loader = DataLoader(data_train, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(data_val, batch_size=batch_size, shuffle=False)

In [10]:
model = GoogleNet()

In [11]:
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)

In [12]:
def accuracy(y_hat, y):
    # y_hat: (B, q)
    # y: (B)
    preds = y_hat.argmax(axis=1).type(y.dtype)  # (B)
    compare = (preds == y).type(torch.float32)  # (B)
    return compare.sum()

In [None]:
%%time
for i in range(10):
    model.train()

    train_loss = 0
    num_train_batches = 0
    for b, (X, y) in enumerate(train_loader):
        optimizer.zero_grad()
        y_hat = model(X)
        loss = F.cross_entropy(y_hat, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        num_train_batches += 1
        if b % 10 == 0:
            print(f'epoch={i} | batch={b} | train_loss={train_loss/num_train_batches:.4f}')

    model.eval()
    with torch.no_grad():
        val_loss = 0
        num_val_batches = 0
        val_acc = 0
        total = 0
        for X, y in val_loader:
            y_hat = model(X)
            loss = F.cross_entropy(y_hat, y)
            val_loss += loss.item()
            num_val_batches += 1
            val_acc += accuracy(y_hat, y)
            total += y.numel()
        
    print(f'epoch={i} | train_loss={train_loss/num_train_batches:.4f} | val_loss={val_loss/num_val_batches:.4f} | val_acc={val_acc/total:.4f}')