# MNIST Image Classification with Convolutional Networks in PyTorch

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torchvision import datasets, transforms

device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f'Using device: {device}')

Using device: cuda


# MNIST Dataset 
* From PyTorch built-in datasets
* convert images to tensors
* normalize pixel values with a mean of 0.5 and a standard deviation of 0.5
* Input is grayscale (single color channel, we have single values for mean and std)

In [2]:
transform=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

In [3]:
train_dataset = datasets.MNIST(
    root='./data', 
    train=True, 
    download=True,               
    transform=transform
)

test_dataset = datasets.MNIST(
    root='./data', 
    train=False,
    transform=transform
)

### See all classes
* integer mapping of string names
* (Remember models don't work with strings)

In [4]:
train_dataset.classes

['0 - zero',
 '1 - one',
 '2 - two',
 '3 - three',
 '4 - four',
 '5 - five',
 '6 - six',
 '7 - seven',
 '8 - eight',
 '9 - nine']

In [5]:
NUM_CLASSES = len(train_dataset.classes)
NUM_CLASSES

10

In [6]:
ex_img, ex_target = train_dataset[0] # img, seg_mask

print(ex_img.shape)
print(ex_target) 

torch.Size([1, 28, 28])
5


# Dataloader

In [7]:
batch_size = 32

train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
)

test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
)

### Get sample batch data

In [8]:
ex_img_batch, ex_target_batch = next(iter(train_dataloader))
print(ex_img_batch.shape)
print(ex_target_batch.shape)

torch.Size([32, 1, 28, 28])
torch.Size([32])


# Model

### Convolution Operation

In [9]:
x_r = torch.randn(8, 1, 28, 28)
x_r.shape

torch.Size([8, 1, 28, 28])

In [10]:
test_conv_layer = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3)

In [11]:
test_output = test_conv_layer(x_r)
test_output.shape

torch.Size([8, 32, 26, 26])

### Max Pooling Operation

In [12]:
test_pooling_layer = nn.MaxPool2d(kernel_size=2)

In [13]:
test_output2 = test_pooling_layer(x_r)
test_output2.shape

torch.Size([8, 1, 14, 14])

In [14]:
test_output3 = test_pooling_layer(test_output)
test_output3.shape

torch.Size([8, 32, 13, 13])

### Convolution Full Model

In [15]:
class ConvNet(nn.Module):
    def __init__(self, input_channels, num_classes):
        super().__init__()
        
        self.conv1 = nn.Conv2d(in_channels=input_channels, out_channels=32, kernel_size=3)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        
        self.max_pool = nn.MaxPool2d(kernel_size=2)

        self.relu = nn.ReLU()
        
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        #######################
        # Convolutional Part
        #######################
        #print(f'Input dims: {x.shape}')
        
        x = self.conv1(x) # (N, 1, 28, 28) -> (N, 32, 26, 26)
        #print(f'After conv1 {x.shape}')
        x = self.relu(x) # no dim change
        x = self.conv2(x) # (N, 32, 26, 26) -> (N, 64, 24, 24)
        #print(f'After conv2 {x.shape}')
        x = self.relu(x) # no dim change
        x = self.max_pool(x) # (N, 64, 24, 24) -> (N, 64, 12, 12)
        #print(f'After maxpool {x.shape}')
        #######################
        #######################

        #######################
        ## Fully Connected Part
        #######################
        x = torch.flatten(x, 1) # (N, 64, 12, 12) -> (N, 64*12*12) -> (N, 9216)
        x = self.fc1(x) # (N, 9216) -> (N, 128)
        x = self.relu(x) # no dim change
        logits = self.fc2(x) # (N, 128) - (N, 10)
        #######################
        #######################
        
        return logits

### Dummy Input for Dimentional Testing

In [16]:
model = ConvNet(
    input_channels=1, # 1 for grayscale images 
    num_classes=NUM_CLASSES
)

In [17]:
dummy_input = torch.randn(1, 1, 28, 28)

In [18]:
dummy_preds = model(dummy_input)
dummy_preds.shape

torch.Size([1, 10])

## Print Model Parametrs

In [19]:
for p in model.parameters():
    print(p.shape)

torch.Size([32, 1, 3, 3])
torch.Size([32])
torch.Size([64, 32, 3, 3])
torch.Size([64])
torch.Size([128, 9216])
torch.Size([128])
torch.Size([10, 128])
torch.Size([10])


## Print with Names

In [20]:
for n, p in model.named_parameters():
    print(f'name: {n} and parameter data: {p.shape}')
    print()

name: conv1.weight and parameter data: torch.Size([32, 1, 3, 3])

name: conv1.bias and parameter data: torch.Size([32])

name: conv2.weight and parameter data: torch.Size([64, 32, 3, 3])

name: conv2.bias and parameter data: torch.Size([64])

name: fc1.weight and parameter data: torch.Size([128, 9216])

name: fc1.bias and parameter data: torch.Size([128])

name: fc2.weight and parameter data: torch.Size([10, 128])

name: fc2.bias and parameter data: torch.Size([10])



# Optimizer & Loss

In [21]:
model = model.to(device)

learning_rate = 0.02

optimizer = torch.optim.SGD(
    model.parameters(), 
    lr=learning_rate
)

criterion = nn.CrossEntropyLoss() # Negative log-likehood

# Training

In [22]:
def train(model, train_loader, optimizer, criterion, epoch):
    model.train()
    
    loss_history = []
    
    for batch_idx, (img, target) in enumerate(train_loader):
        # Move to GPU (if available)
        img = img.to(device)
        target = target.to(device)

        # Forward pass
        preds = model(img)
        # Compute gradients
        loss = criterion(preds, target)
        
        # Zero gradients, perform a backward pass, and update the weights.
        # In PyTorch, gradients are accumulated, you need to reset gradients in each loop
        optimizer.zero_grad()
        # Compute gradients
        loss.backward()
        # Update parameters (weights and biases)
        optimizer.step()
        
        loss_history.append(loss.item())

    avg_loss = sum(loss_history)/len(loss_history)
    return avg_loss

# Testing
* No trainin in testing code
* Disable Autograd
* No optimizer

In [23]:
@torch.no_grad()
def test(model, test_loader, criterion):
    model.eval()
    
    loss_history = []
    acc_history = []
    
    for img, target in test_loader:
        # Move to GPU (if available)
        img = img.to(device)
        target = target.to(device)

        # Forward pass
        preds = model(img)
        # Compute error
        loss = criterion(preds, target)
        
        # Compute accuracy
        _, predicted = torch.max(preds, 1)
        accuracy = (predicted == target).sum().item() / target.size(0)

        loss_history.append(loss.item())
        acc_history.append(accuracy)
    
    avg_loss = sum(loss_history)/len(loss_history)
    avg_acc = sum(acc_history)/len(acc_history)
    return avg_loss, avg_acc

### Start Training
* Training consists of two steps: forward and backward propagation
* In forward propagation, we input the data into the model and measure the error (with loss function)
* In backward propagation, we adjust the internal paramters of the model so that model makes better predictions next time
* One complete cycle of the dataset is called "epoch" (one loop cycle of all data)

In [24]:
def start_training(model, train_dataloader, test_dataloader, optimizer, criterion, num_epochs, print_interval):

    # Loop over all epochs
    for epoch in range(1, NUM_EPOCHS+1):
        avg_train_loss = train(model, train_dataloader, optimizer, criterion, epoch)
        avg_test_loss, avg_test_acc = test(model, test_dataloader, criterion)

        if (epoch + 1) % print_interval == 0:
            print(f'Epoch: [{epoch+1}/{num_epochs}], Avg train loss: {avg_train_loss:.4f}, test loss: {avg_test_loss:.4f}, test_acc: {avg_test_acc*100.0:.2f}%')

In [25]:
NUM_EPOCHS = 10
print_interval = 2 

start_training(
    model,
    train_dataloader,
    test_dataloader,
    optimizer,
    criterion,
    NUM_EPOCHS,
    print_interval
)

Epoch: [2/10], Avg train loss: 0.2647, test loss: 0.0880, test_acc: 97.26%
Epoch: [4/10], Avg train loss: 0.0483, test loss: 0.0454, test_acc: 98.52%
Epoch: [6/10], Avg train loss: 0.0288, test loss: 0.0379, test_acc: 98.76%
Epoch: [8/10], Avg train loss: 0.0181, test loss: 0.0384, test_acc: 98.70%
Epoch: [10/10], Avg train loss: 0.0112, test loss: 0.0417, test_acc: 98.74%


# Save/Load Model

In [26]:
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict()
},
    'convnet_mnist_checkpoint.pt'
)