<a href="https://colab.research.google.com/github/bk62/deep-learning-notebooks/blob/main/generative_deep_learning/01_CNN_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

>[-1. Dataset: Fashion MNIST](#updateTitle=true&folderId=1mwghr2cgAwLd91FL_Ep6CoG2qkkpYyUK&scrollTo=Kue5MyJJvn6o)

>[Prelude: Image classification with MLP](#updateTitle=true&folderId=1mwghr2cgAwLd91FL_Ep6CoG2qkkpYyUK&scrollTo=QPcJWJ7ovZgU)

>>[Model](#updateTitle=true&folderId=1mwghr2cgAwLd91FL_Ep6CoG2qkkpYyUK&scrollTo=QwAUO7ogdHzI)

>>[Training](#updateTitle=true&folderId=1mwghr2cgAwLd91FL_Ep6CoG2qkkpYyUK&scrollTo=xxLzE6mpdJ50)

>[CNN from scratch](#updateTitle=true&folderId=1mwghr2cgAwLd91FL_Ep6CoG2qkkpYyUK&scrollTo=kb8UhiA_BcGv)

>>[Model](#updateTitle=true&folderId=1mwghr2cgAwLd91FL_Ep6CoG2qkkpYyUK&scrollTo=G9XX7NUkdMID)

>>[Sanity check](#updateTitle=true&folderId=1mwghr2cgAwLd91FL_Ep6CoG2qkkpYyUK&scrollTo=uGGcVomedOEG)

>>[Training](#updateTitle=true&folderId=1mwghr2cgAwLd91FL_Ep6CoG2qkkpYyUK&scrollTo=_BFzSjfEVE8j)

>>[Final result](#updateTitle=true&folderId=1mwghr2cgAwLd91FL_Ep6CoG2qkkpYyUK&scrollTo=15Dhpy7vcwPH)



# -1. Dataset: Fashion MNIST

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
# Download training data from open datasets.
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data/FashionMNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 26421880/26421880 [00:01<00:00, 18477531.45it/s]


Extracting data/FashionMNIST/raw/train-images-idx3-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 29515/29515 [00:00<00:00, 305910.35it/s]


Extracting data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 4422102/4422102 [00:00<00:00, 5332748.36it/s]


Extracting data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 5148/5148 [00:00<00:00, 6513507.39it/s]

Extracting data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw






In [3]:
batch_size = 64

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64


In [4]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


# 0. Prelude: Image classification with MLP

## Model

In [5]:
class MLP(nn.Module):
  def __init__(self, n):
    super().__init__()
    self.flatten = nn.Flatten()
    self.stack = nn.Sequential(
        nn.Linear(n, 512),
        nn.ReLU(),
        nn.Linear(512, 512),
        nn.ReLU(),
        nn.Linear(512, 10),
        nn.Softmax(1),
    )

  def forward(self, x):
    x = self.flatten(x)
    probs = self.stack(x)
    return probs

model = MLP(28*28).to(device)
print(model)

total_params = 0
for name, params in model.named_parameters():
  print(name, params.numel())
  total_params += params.numel()
print(f"Total params: {total_params}")

MLP(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
    (5): Softmax(dim=1)
  )
)
stack.0.weight 401408
stack.0.bias 512
stack.2.weight 262144
stack.2.bias 512
stack.4.weight 5120
stack.4.bias 10
Total params: 669706


## Training

In [6]:
loss = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

In [7]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [8]:
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss, opt)
    test(test_dataloader, model, loss)
print("Done!")

Epoch 1
-------------------------------
loss: 2.303153  [   64/60000]
loss: 1.767309  [ 6464/60000]
loss: 1.636677  [12864/60000]
loss: 1.702919  [19264/60000]
loss: 1.725078  [25664/60000]
loss: 1.678324  [32064/60000]
loss: 1.632666  [38464/60000]
loss: 1.650415  [44864/60000]
loss: 1.698492  [51264/60000]
loss: 1.658300  [57664/60000]
Test Error: 
 Accuracy: 81.7%, Avg loss: 1.643641 

Epoch 2
-------------------------------
loss: 1.612846  [   64/60000]
loss: 1.615519  [ 6464/60000]
loss: 1.597406  [12864/60000]
loss: 1.678277  [19264/60000]
loss: 1.662121  [25664/60000]
loss: 1.647781  [32064/60000]
loss: 1.592896  [38464/60000]
loss: 1.605031  [44864/60000]
loss: 1.628901  [51264/60000]
loss: 1.656769  [57664/60000]
Test Error: 
 Accuracy: 81.7%, Avg loss: 1.644313 

Epoch 3
-------------------------------
loss: 1.615448  [   64/60000]
loss: 1.582928  [ 6464/60000]
loss: 1.585325  [12864/60000]
loss: 1.645452  [19264/60000]
loss: 1.651247  [25664/60000]
loss: 1.634772  [32064/600

## Final result
Test set:

Accuracy
84.9%

Loss
1.611381

In [9]:
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

model.eval()
x, y = test_data[0][0], test_data[0][1]
with torch.no_grad():
    x = x.to(device)
    pred = model(x)
    predicted, actual = classes[pred[0].argmax(0)], classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')

Predicted: "Ankle boot", Actual: "Ankle boot"


# 1. CNN from scratch

## Model

In [10]:
class SimpleCNN(nn.Module):
  def __init__(self, n):
    super().__init__()
    self.stack = nn.Sequential(
        # Conv 1
        nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1),
        nn.BatchNorm2d(32),
        nn.LeakyReLU(),

        # Conv 2
        nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2),
        nn.BatchNorm2d(32),
        nn.LeakyReLU(),

        # Conv 3
        nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1),
        nn.BatchNorm2d(64),
        nn.LeakyReLU(),

        # Conv 4
        nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=2),
        nn.BatchNorm2d(64),
        nn.LeakyReLU(),

        nn.Flatten(),
        nn.Linear(1024, 128),
        nn.BatchNorm1d(num_features=128),
        nn.LeakyReLU(),

        # Dropout
        nn.Dropout(0.5),

        nn.Linear(128, 10),
        nn.Softmax(dim=1),
    )

  def forward(self, x):
    probs = self.stack(x)
    return probs

model = SimpleCNN(28).to(device)
print(model)

total_params = 0
for name, params in model.named_parameters():
  print(name, params.numel())
  total_params += params.numel()
print(f"Total params: {total_params}")

SimpleCNN(
  (stack): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.01)
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2))
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): LeakyReLU(negative_slope=0.01)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    (7): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): LeakyReLU(negative_slope=0.01)
    (9): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2))
    (10): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): LeakyReLU(negative_slope=0.01)
    (12): Flatten(start_dim=1, end_dim=-1)
    (13): Linear(in_features=1024, out_features=128, bias=True)
    (14): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (15): Leaky

## Sanity check

Checking that the sizes match our expectations:

In [11]:
c1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
x = X[0]
y1 = c1(x)
x.shape, y1.shape

(torch.Size([1, 28, 28]), torch.Size([32, 26, 26]))

In [12]:
x11 = X[0:1]
print(x11.shape)
y11 = c1(x11)
print(y11.shape)
bn = nn.BatchNorm2d(num_features=32)
y11bn = bn(y11)
y11.shape, y11bn.shape

torch.Size([1, 1, 28, 28])
torch.Size([1, 32, 26, 26])


(torch.Size([1, 32, 26, 26]), torch.Size([1, 32, 26, 26]))

First Conv:

In: 28 * 28 image with 1 channel

Out: 26 * 26 with 32 channels

In [13]:
c2 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2)
y2 = c2(y1)
y1.shape, y2.shape

(torch.Size([32, 26, 26]), torch.Size([32, 12, 12]))

Second Conv:

In: 26 * 26 image with 32 channels

Out: 12 * 12 with 32 channels

In [14]:
c3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
y3 = c3(y2)
y2.shape, y3.shape

(torch.Size([32, 12, 12]), torch.Size([64, 10, 10]))

Third Conv:

In: 12 * 12 image with 32 channels

Out: 10 * 10 with 64 channels

In [15]:
c4 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=2)
y4 = c4(y3)
y3.shape, y4.shape

(torch.Size([64, 10, 10]), torch.Size([64, 4, 4]))

Fourth (and final) Conv:

In: 10 * 10 image with 64 channels

Out: 4 * 4 with 64 channels

Therefore, after all 4 convolutions, the input shape is
`[64, 4, 4]` with
$$\textrm{num elements} =
64 \cdot 4 \cdot 4 = 1024 $$

In [20]:
x = X[0:2].to(device)
y = model(x)
x.shape, y.shape, y.sum(1)

(torch.Size([2, 1, 28, 28]),
 torch.Size([2, 10]),
 tensor([1., 1.], device='cuda:0', grad_fn=<SumBackward1>))

## Training

In [21]:
loss = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

In [22]:
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss, opt)
    test(test_dataloader, model, loss)
print("Done!")

Epoch 1
-------------------------------
loss: 1.561404  [   64/60000]
loss: 1.588964  [ 6464/60000]
loss: 1.524415  [12864/60000]
loss: 1.606976  [19264/60000]
loss: 1.573387  [25664/60000]
loss: 1.613041  [32064/60000]
loss: 1.576545  [38464/60000]
loss: 1.606741  [44864/60000]
loss: 1.576480  [51264/60000]
loss: 1.545722  [57664/60000]
Test Error: 
 Accuracy: 87.7%, Avg loss: 1.583819 

Epoch 2
-------------------------------
loss: 1.607596  [   64/60000]
loss: 1.579967  [ 6464/60000]
loss: 1.517097  [12864/60000]
loss: 1.574990  [19264/60000]
loss: 1.576660  [25664/60000]
loss: 1.629848  [32064/60000]
loss: 1.592491  [38464/60000]
loss: 1.590710  [44864/60000]
loss: 1.549518  [51264/60000]
loss: 1.561827  [57664/60000]
Test Error: 
 Accuracy: 89.0%, Avg loss: 1.572419 

Epoch 3
-------------------------------
loss: 1.543245  [   64/60000]
loss: 1.600298  [ 6464/60000]
loss: 1.519070  [12864/60000]
loss: 1.599556  [19264/60000]
loss: 1.551137  [25664/60000]
loss: 1.594736  [32064/600

## Final result
Test set:

Accuracy
90.5%

Loss
1.556153

In [23]:
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

model.eval()
for ix in range(1, 10):
  x, y = test_data[ix][0], test_data[ix][1]
  with torch.no_grad():
      x = x.to(device)
      pred = model(x.view(1, 1, 28, 28)) # batch size 1
      predicted, actual = classes[pred[0].argmax(0)], classes[y]
      print(f'Predicted: "{predicted}", Actual: "{actual}"')

Predicted: "Pullover", Actual: "Pullover"
Predicted: "Trouser", Actual: "Trouser"
Predicted: "Trouser", Actual: "Trouser"
Predicted: "Shirt", Actual: "Shirt"
Predicted: "Trouser", Actual: "Trouser"
Predicted: "Coat", Actual: "Coat"
Predicted: "Shirt", Actual: "Shirt"
Predicted: "Sandal", Actual: "Sandal"
Predicted: "Sneaker", Actual: "Sneaker"
