# GPU-Accelerated CNN Training on CIFAR-10 (PyTorch + CUDA)

This project benchmarks the performance difference between CPU and GPU training for a small convolutional neural network (CNN) on the CIFAR-10 dataset.  

Using PyTorch with CUDA in Google Colab, the model was trained on both CPU and GPU to measure training time and accuracy.  
Results highlight the practical speedup enabled by GPU acceleration in deep learning workflows.


## Verifying CUDA

In [2]:
import torch, torchvision, sys, platform
print("Torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
!nvidia-smi

Torch: 2.5.1+cu121 | CUDA available: True
Sat Aug 16 05:44:15 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   55C    P8             11W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
      

## Installing/confirming libs

In [3]:
# Shell escape (IPython/Colab): quietly (-q) install torch, torchvision and
# torchaudio, resolving packages from the PyTorch cu121 wheel index.
!pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


## Set up data + transforms

In [4]:
import torch, torchvision
from torchvision import transforms, datasets
from torch.utils.data import DataLoader

# Mini-batch size used by the training loader.
BATCH_SIZE = 128

# Preprocessing shared by both splits: convert each image to a tensor, then
# normalize every RGB channel with the per-channel mean / std given here.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.4914, 0.4822, 0.4465),
            std=(0.2470, 0.2435, 0.2616),
        ),
    ]
)

# CIFAR-10 train / test splits, stored under ./data (download=True).
trainset = datasets.CIFAR10(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)
testset = datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=transform,
)

# Options common to both loaders: two worker processes and pinned host memory.
loader_opts = dict(num_workers=2, pin_memory=True)

# Training batches are shuffled; evaluation batches are larger and ordered.
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, **loader_opts)
testloader = DataLoader(testset, batch_size=256, shuffle=False, **loader_opts)


Files already downloaded and verified
Files already downloaded and verified


## Defining a tiny CNN

In [5]:
import torch.nn as nn

class SmallCNN(nn.Module):
    """Compact three-stage convolutional classifier.

    The feature extractor applies three 3x3 convolutions (3 -> 32 -> 64 -> 128
    channels, padding 1), each followed by a ReLU. The first two stages are
    followed by 2x2 max pooling; the last one is reduced to a 1x1 map by
    adaptive average pooling, so one linear layer maps the 128 pooled
    features to ``num_classes`` logits.
    """

    def __init__(self, num_classes=10):
        """Build the feature extractor (``net``) and the classifier (``fc``)."""
        super().__init__()
        stages = [
            # Stage 1: 3 -> 32 channels, then 2x2 max pool.
            nn.Conv2d(3, 32, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            # Stage 2: 32 -> 64 channels, then 2x2 max pool.
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            # Stage 3: 64 -> 128 channels, then pool to a 1x1 map per channel.
            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
        ]
        self.net = nn.Sequential(*stages)
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return the logits produced by ``fc`` for the image batch ``x``."""
        features = self.net(x)
        batch = features.size(0)
        # Collapse the (128, 1, 1) pooled map to a flat vector per sample.
        flat = features.view(batch, -1)
        return self.fc(flat)

## Train+time helper definition

In [6]:
import time
import torch.optim as optim
import torch.nn.functional as F

def train_and_time(device, epochs=2, *, model_factory=None, train_loader=None, test_loader=None):
    """Train a fresh model on ``device``; return training time and test accuracy.

    Args:
        device: A ``torch.device`` or anything ``torch.device()`` accepts
            (for example ``"cpu"`` or ``"cuda"``).
        epochs: Number of full passes over the training batches.
        model_factory: Zero-argument callable returning a new ``nn.Module``.
            Defaults to the module-level ``SmallCNN``.
        train_loader: Iterable of ``(inputs, labels)`` batches used for
            training. Defaults to the module-level ``trainloader``.
        test_loader: Iterable of ``(inputs, labels)`` batches used for
            evaluation. Defaults to the module-level ``testloader``.

    Returns:
        A ``(train_time, acc)`` tuple: seconds spent in the training loop
        only (model construction and evaluation are not timed) and the test
        accuracy in percent (``0.0`` when the test loader yields no samples).
    """
    # Accept plain strings as well as torch.device objects.
    device = torch.device(device)
    if model_factory is None:
        model_factory = SmallCNN
    if train_loader is None:
        train_loader = trainloader
    if test_loader is None:
        test_loader = testloader

    model = model_factory().to(device)
    opt = optim.Adam(model.parameters(), lr=1e-3)

    on_cuda = device.type == "cuda"
    # CUDA calls are asynchronous: drain pending work (e.g. the parameter
    # copy above) so it is not charged to the training loop.
    if on_cuda:
        torch.cuda.synchronize(device)
    # perf_counter is monotonic and high resolution, unlike time.time().
    start = time.perf_counter()
    for _ in range(epochs):
        model.train()
        for x, y in train_loader:
            x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
            opt.zero_grad()
            loss = F.cross_entropy(model(x), y)
            loss.backward()
            opt.step()
    # IMPORTANT: make sure all CUDA work is finished before stopping the clock.
    if on_cuda:
        torch.cuda.synchronize(device)
    train_time = time.perf_counter() - start

    # Quick test accuracy (not part of the timed region).
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
            pred = model(x).argmax(1)
            correct += (pred == y).sum().item()
            total += y.size(0)
    # Guard against an empty evaluation set instead of dividing by zero.
    acc = 100 * correct / total if total else 0.0
    return train_time, acc


## Running CPU vs GPU

In [9]:
# Epoch budgets for each device; the two runs are compared per epoch below.
cpu_epochs = 2
gpu_epochs = 20

# CPU baseline.
cpu_device = torch.device("cpu")
cpu_time, cpu_acc = train_and_time(cpu_device, epochs=cpu_epochs)
print(f"CPU:   {cpu_time:.1f}s total for {cpu_epochs} ep, acc={cpu_acc:.2f}%")

# The GPU run and the comparison only happen when CUDA is usable.
if torch.cuda.is_available():
    gpu_device = torch.device("cuda")
    gpu_time, gpu_acc = train_and_time(gpu_device, epochs=gpu_epochs)
    print(f"GPU:   {gpu_time:.1f}s total for {gpu_epochs} ep, acc={gpu_acc:.2f}%")

    # Normalize by epoch count, since the two runs use different budgets.
    cpu_sec_per_ep, gpu_sec_per_ep = cpu_time / cpu_epochs, gpu_time / gpu_epochs

    print(f"CPU: {cpu_sec_per_ep:.2f}s/epoch")
    print(f"GPU: {gpu_sec_per_ep:.2f}s/epoch")

    # Ratio of per-epoch times: how many times faster one GPU epoch is.
    speedup = cpu_sec_per_ep / gpu_sec_per_ep
    print(f"Speedup (per-epoch): {speedup:.1f}×")

CPU:   133.8s total for 2 ep, acc=48.71%
GPU:   242.9s total for 20 ep, acc=72.05%
CPU: 66.90s/epoch
GPU: 12.14s/epoch
Speedup (per-epoch): 5.5×
