In [1]:
from __future__ import print_function
import torch
import torch.optim as optim
from torchvision import datasets, transforms
import torch.nn as nn
from datetime import datetime
import logging
from pathlib import Path
import torch.nn.functional as F
import numpy as np
import random
from torchsummary import summary
from torch.utils.data import DataLoader, Subset


In [2]:


def get_data_loaders(batch_size=64):

    train_transform=transforms.Compose([
                        transforms.Resize((28, 28)),
                        transforms.RandomRotation((-7.0, 7.0), fill=(1,)),
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    # Load full datasets
    train_dataset = datasets.MNIST('./data', train=True, download=True, transform=train_transform)
    test_dataset = datasets.MNIST('./data', train=False, transform=transform)

    # Create indices for 25% of training data
    # total_train = len(train_dataset)
    # indices = np.random.permutation(total_train)
    # train_size = int(0.25 * total_train)  # 25% of the data
    # train_indices = indices[:train_size]

    # Create subset of training data
    # train_dataset = Subset(train_dataset, train_indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # print(f"Training with {train_size:,} samples (25% of original {total_train:,} samples)")
    print(f"Training with {len(train_dataset)} samples")


    return train_loader, test_loader

In [3]:


DROP_OUT = 0.1

class NetGAP(nn.Module):
    """
    CNN with Global Average Pooling (GAP)
==================================================
Model Architecture: With GAP
==================================================
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1            [-1, 8, 28, 28]              80
              ReLU-2            [-1, 8, 28, 28]               0
       BatchNorm2d-3            [-1, 8, 28, 28]              16
            Conv2d-4            [-1, 8, 28, 28]             584
              ReLU-5            [-1, 8, 28, 28]               0
       BatchNorm2d-6            [-1, 8, 28, 28]              16
         MaxPool2d-7            [-1, 8, 14, 14]               0
           Dropout-8            [-1, 8, 14, 14]               0
            Conv2d-9           [-1, 16, 14, 14]           1,168
             ReLU-10           [-1, 16, 14, 14]               0
      BatchNorm2d-11           [-1, 16, 14, 14]              32
           Conv2d-12           [-1, 16, 14, 14]           2,320
             ReLU-13           [-1, 16, 14, 14]               0
      BatchNorm2d-14           [-1, 16, 14, 14]              32
        MaxPool2d-15             [-1, 16, 7, 7]               0
          Dropout-16             [-1, 16, 7, 7]               0
           Conv2d-17             [-1, 16, 7, 7]           2,320
             ReLU-18             [-1, 16, 7, 7]               0
      BatchNorm2d-19             [-1, 16, 7, 7]              32
           Conv2d-20             [-1, 10, 7, 7]             170
             ReLU-21             [-1, 10, 7, 7]               0
      BatchNorm2d-22             [-1, 10, 7, 7]              20
        MaxPool2d-23             [-1, 10, 3, 3]               0
          Dropout-24             [-1, 10, 3, 3]               0
AdaptiveAvgPool2d-25             [-1, 10, 1, 1]               0
================================================================
Total params: 6,790
================================================================


    Target:
    - Replace FC layer with Global Average Pooling
    - Reduce parameter count significantly under 8k
    - Maintain or improve accuracy compared to FC models
    - Make model morTraining With GAP model...
Training with 60000 samples
Epoch: 0 | Train Loss: 0.682 | Train Acc: 83.76% | Val Loss: 9.662 | Val Acc: 96.56% | Best Val Acc: 96.56%
Epoch: 1 | Train Loss: 0.177 | Train Acc: 95.61% | Val Loss: 5.702 | Val Acc: 97.65% | Best Val Acc: 97.65%
Epoch: 2 | Train Loss: 0.118 | Train Acc: 96.72% | Val Loss: 6.061 | Val Acc: 96.98% | Best Val Acc: 97.65%
Epoch: 3 | Train Loss: 0.100 | Train Acc: 97.06% | Val Loss: 3.488 | Val Acc: 98.21% | Best Val Acc: 98.21%
Epoch: 4 | Train Loss: 0.085 | Train Acc: 97.49% | Val Loss: 3.291 | Val Acc: 98.39% | Best Val Acc: 98.39%
Epoch: 5 | Train Loss: 0.074 | Train Acc: 97.83% | Val Loss: 2.967 | Val Acc: 98.55% | Best Val Acc: 98.55%
Epoch: 6 | Train Loss: 0.069 | Train Acc: 97.95% | Val Loss: 2.686 | Val Acc: 98.76% | Best Val Acc: 98.76%
Epoch: 7 | Train Loss: 0.060 | Train Acc: 98.22% | Val Loss: 2.716 | Val Acc: 98.62% | Best Val Acc: 98.76%
Epoch: 8 | Train Loss: 0.057 | Train Acc: 98.24% | Val Loss: 2.093 | Val Acc: 99.01% | Best Val Acc: 99.01%
Epoch: 9 | Train Loss: 0.051 | Train Acc: 98.43% | Val Loss: 1.725 | Val Acc: 99.19% | Best Val Acc: 99.19%
Epoch: 10 | Train Loss: 0.045 | Train Acc: 98.59% | Val Loss: 1.782 | Val Acc: 99.20% | Best Val Acc: 99.20%
Epoch: 11 | Train Loss: 0.039 | Train Acc: 98.81% | Val Loss: 1.606 | Val Acc: 99.15% | Best Val Acc: 99.20%
Epoch: 12 | Train Loss: 0.035 | Train Acc: 98.89% | Val Loss: 1.520 | Val Acc: 99.28% | Best Val Acc: 99.28%
Epoch: 13 | Train Loss: 0.032 | Train Acc: 99.03% | Val Loss: 1.446 | Val Acc: 99.29% | Best Val Acc: 99.29%
Epoch: 14 | Train Loss: 0.030 | Train Acc: 99.09% | Val Loss: 1.425 | Val Acc: 99.34% | Best Val Acc: 99.34%

===================================================
  Receptive Field (RF) calculation:
===================================================

    RF = 1 + sum((kernel_size - 1) * stride_product)
    stride_product = product of all previous strides

    Layer details:
    Conv1: RF_in=1, k=3, s=1, p=1 → RF_out=3
    Conv2: RF_in=3, k=3, s=1, p=1 → RF_out=5
    MaxPool1: RF_in=5, k=2, s=2 → RF_out=6

    Conv3: RF_in=6, k=3, s=1, p=1 → RF_out=10
    Conv4: RF_in=10, k=3, s=1, p=1 → RF_out=14
    MaxPool2: RF_in=14, k=2, s=2 → RF_out=16

    Conv5: RF_in=16, k=3, s=1, p=1 → RF_out=20
    Conv6: RF_in=20, k=1, s=1 → RF_out=20
    MaxPool3: RF_in=20, k=2, s=2 → RF_out=22

    Final RF = 22x22 pixels

===================================================

    """
    def __init__(self, dropout_rate=DROP_OUT):
        super(NetGAP, self).__init__()

        # First block: RF 1→3→5→6
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1,
                     out_channels=8,
                     kernel_size=3,
                     padding=1,
                     stride=1),
            nn.ReLU(),
            nn.BatchNorm2d(num_features=8),
            nn.Conv2d(in_channels=8,
                     out_channels=8,
                     kernel_size=3,
                     padding=1,
                     stride=1),
            nn.ReLU(),
            nn.BatchNorm2d(num_features=8),
            nn.MaxPool2d(kernel_size=2, stride=2),  # /2
            nn.Dropout(p=dropout_rate),
        )

        # Second block: RF 6→10→14→16
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=8,
                     out_channels=16,
                     kernel_size=3,
                     padding=1,
                     stride=1),
            nn.ReLU(),
            nn.BatchNorm2d(num_features=16),
            nn.Conv2d(in_channels=16,
                     out_channels=16,
                     kernel_size=3,
                     padding=1,
                     stride=1),
            nn.ReLU(),
            nn.BatchNorm2d(num_features=16),
            nn.MaxPool2d(kernel_size=2, stride=2),  # /2
            nn.Dropout(p=dropout_rate),
        )

        # Third block: RF 16→20→20→22
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=16,
                     out_channels=16,
                     kernel_size=3,
                     padding=1,
                     stride=1),
            nn.ReLU(),
            nn.BatchNorm2d(num_features=16),
            nn.Conv2d(in_channels=16,
                     out_channels=10,
                     kernel_size=1,  # 1x1 conv for channel reduction
                     stride=1),
            nn.ReLU(),
            nn.BatchNorm2d(num_features=10),
            nn.MaxPool2d(kernel_size=2, stride=2),  # /2
            nn.Dropout(p=dropout_rate),
        )

        # Global Average Pooling: maintains RF while reducing spatial dims to 1x1
        self.gap = nn.AdaptiveAvgPool2d(output_size=1)

    def forward(self, x):
        x = self.conv1(x)  # 28x28 -> 14x14
        x = self.conv2(x)  # 14x14 -> 7x7
        x = self.conv3(x)  # 7x7 -> 3x3
        x = self.gap(x)    # 3x3 -> 1x1
        x = x.view(-1, 10)
        return F.log_softmax(x, dim=1)

In [4]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)
model = NetGAP().to(device)
summary(model, input_size=(1, 28, 28))



cuda
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 28, 28]              80
              ReLU-2            [-1, 8, 28, 28]               0
       BatchNorm2d-3            [-1, 8, 28, 28]              16
            Conv2d-4            [-1, 8, 28, 28]             584
              ReLU-5            [-1, 8, 28, 28]               0
       BatchNorm2d-6            [-1, 8, 28, 28]              16
         MaxPool2d-7            [-1, 8, 14, 14]               0
           Dropout-8            [-1, 8, 14, 14]               0
            Conv2d-9           [-1, 16, 14, 14]           1,168
             ReLU-10           [-1, 16, 14, 14]               0
      BatchNorm2d-11           [-1, 16, 14, 14]              32
           Conv2d-12           [-1, 16, 14, 14]           2,320
             ReLU-13           [-1, 16, 14, 14]               0
      BatchNorm2d-14           [-1

In [5]:


def set_seed(seed=42):
    """Set seeds for reproducibility."""
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

def setup_logger():
    log_dir = Path('logs')
    log_dir.mkdir(exist_ok=True)

    logging.basicConfig(
        filename=f'logs/training_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
        level=logging.INFO,
        format='%(asctime)s - %(message)s'
    )

In [7]:

def get_device():
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        device_name = "Apple Silicon (M1/M2)"
    elif torch.cuda.is_available():
        device = torch.device("cuda")
        device_name = f"CUDA ({torch.cuda.get_device_name(0)})"
    else:
        device = torch.device("cpu")
        device_name = "CPU"
    return device, device_name

In [8]:

def train(epochs=15, batch_size=64, learning_rate=0.01, target_accuracy=99.4):
    set_seed(42)  # Set seed for reproducibility
    setup_logger()

    # Device setup
    device, device_name = get_device()
    gpu_info = f"Using device: {device} ({device_name})"

    if device.type == 'cuda':
        gpu_info += f"\nMemory Usage:"
        gpu_info += f"\n  Allocated: {round(torch.cuda.memory_allocated(0)/1024**2,1)} MB"
        gpu_info += f"\n  Cached:    {round(torch.cuda.memory_reserved(0)/1024**2,1)} MB"

    print(gpu_info)
    logging.info(gpu_info)

    models = [
        ("With GAP", NetGAP())
    ]

    results = []

    for name, model in models:
        print(f"\n{'='*50}")
        print(f"Model Architecture: {name}")
        print(f"{'='*50}")
        # summary(model, input_size=(1, 28, 28))

        model = model.to(device)
        # Print model summary before training
        print(f"\nTraining {name} model...")

        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        train_loader, test_loader = get_data_loaders(batch_size)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,max_lr=0.01,epochs=epochs,
                                                        steps_per_epoch=len(train_loader))


        best_accuracy = 0.0
        early_stop = False

        for epoch in range(epochs):
            if early_stop:
                break

            model.train()
            train_loss = 0
            correct = 0
            total = 0

            for batch_idx, (data, target) in enumerate(train_loader):
                data, target = data.to(device), target.to(device)
                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                optimizer.step()
                scheduler.step()

                train_loss += loss.item()
                _, predicted = output.max(1)
                total += target.size(0)
                correct += predicted.eq(target).sum().item()

            train_accuracy = 100. * correct / total
            train_loss = train_loss / len(train_loader)

            # Validation
            model.eval()
            val_loss = 0
            correct = 0
            total = 0

            with torch.no_grad():
                for data, target in test_loader:
                    data, target = data.to(device), target.to(device)
                    output = model(data)
                    val_loss += F.nll_loss(output, target, reduction='sum').item()
                    _, predicted = output.max(1)
                    total += target.size(0)
                    correct += predicted.eq(target).sum().item()

            val_accuracy = 100. * correct / total
            val_loss = val_loss / len(test_loader)

            # Update best accuracy
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy

            # Check for early stopping
            if val_accuracy >= target_accuracy:
                # early_stop = True
                print(f"\nReached target accuracy of {target_accuracy}% at epoch {epoch}")
                logging.info(f"Reached target accuracy of {target_accuracy}% at epoch {epoch}")

            # Log epoch results
            log_message = (f'Epoch: {epoch} | '
                        f'Train Loss: {train_loss:.3f} | '
                        f'Train Acc: {train_accuracy:.2f}% | '
                        f'Val Loss: {val_loss:.3f} | '
                        f'Val Acc: {val_accuracy:.2f}% | '
                        f'Best Val Acc: {best_accuracy:.2f}%')
            logging.info(log_message)
            print(log_message)

        results.append((name, best_accuracy))
        # Log GPU memory only for CUDA devices
        if device.type == 'cuda':
            memory_info = (f"GPU Memory: "
                        f"Allocated: {round(torch.cuda.memory_allocated(0)/1024**2,1)} MB, "
                        f"Cached: {round(torch.cuda.memory_reserved(0)/1024**2,1)} MB")
            logging.info(memory_info)
            print(memory_info)

    print("\nFinal Results:")
    for name, acc in results:
        print(f"{name}: {acc:.2f}%")
    return results

In [9]:
final_results = train()

Using device: cuda (CUDA (NVIDIA L4))
Memory Usage:
  Allocated: 0.0 MB
  Cached:    22.0 MB

Model Architecture: With GAP

Training With GAP model...
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9.91M/9.91M [00:01<00:00, 5.38MB/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28.9k/28.9k [00:00<00:00, 157kB/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.65M/1.65M [00:02<00:00, 738kB/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.54k/4.54k [00:00<00:00, 9.74MB/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw

Training with 60000 samples
Epoch: 0 | Train Loss: 0.803 | Train Acc: 79.13% | Val Loss: 10.706 | Val Acc: 96.57% | Best Val Acc: 96.57%
Epoch: 1 | Train Loss: 0.196 | Train Acc: 95.11% | Val Loss: 5.443 | Val Acc: 97.69% | Best Val Acc: 97.69%
Epoch: 2 | Train Loss: 0.132 | Train Acc: 96.33% | Val Loss: 4.072 | Val Acc: 98.02% | Best Val Acc: 98.02%
Epoch: 3 | Train Loss: 0.113 | Train Acc: 96.64% | Val Loss: 3.087 | Val Acc: 98.45% | Best Val Acc: 98.45%
Epoch: 4 | Train Loss: 0.093 | Train Acc: 97.26% | Val Loss: 3.231 | Val Acc: 98.37% | Best Val Acc: 98.45%
Epoch: 5 | Train Loss: 0.085 | Train Acc: 97.45% | Val Loss: 2.337 | Val Acc: 98.83% | Best Val Acc: 98.83%
Epoch: 6 | Train Loss: 0.076 | Train Acc: 97.83% | Val Loss: 1.886 | Val Acc: 99.11% | Best Val Acc: 99.11%
Epoch: 7 | Train Loss: 0.069 | Train Acc: 98.00% | Val Loss: 1.583 | Val Acc: 99.23% | Best Val Acc: 99.23%
Epoch: 8 | Train Loss: 0.061 | T

In [10]:
final_results

[('With GAP', 99.45)]

In [11]:
!hostname

ip-172-31-9-229
