# mini-cuDNN: Conv2d Validation & Benchmark Notebook

**What I am doing here**
1. Verify CUDA/PyTorch environment
2. Install my local repo as an editable package (`pip install -e .`)
3. Validate correctness vs `torch.nn.functional.conv2d`
4. Benchmark my kernel and compare to PyTorch native

Since I am writing this locally first, there are a few things I need to keep in mind when running it on Google Colab:

- Upload to Google Drive
- Mount Drive into Colab
- cd into project folder
- Setup via bash commands
- Run .ipynb

In [None]:
import torch, platform, sys, subprocess, os, time

# Report the interpreter, OS, and PyTorch build so benchmark numbers can be
# tied to a specific environment.
python_version = sys.version.split()[0]
print(f"Python: {python_version}  |  Platform: {platform.platform()}")
print(f"PyTorch: {torch.__version__}")

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    # Details of device 0, plus the CUDA version reported by torch.version.
    device_name = torch.cuda.get_device_name(0)
    capability = torch.cuda.get_device_capability(0)
    print(f"CUDA device: {device_name}")
    print(f"CUDA capability: {capability}")
    print(f"Torch CUDA version: {torch.version.cuda}")

To install the extension we need to first make sure we are at the root of the repo (`mydnn/` must be accessible).

In [None]:
# Change into the repo folder if needed, then install it in editable mode.
#
# TARGET_DIR is optional: it only exists if an earlier cell defined it (for
# example after cloning the repo). Look it up through globals() so this cell
# does not raise NameError when it was never set; in that case the current
# working directory is assumed to be the repo root. To point at a different
# location, define TARGET_DIR before running this cell.
_target_dir = globals().get("TARGET_DIR")
REPO_PATH = _target_dir if _target_dir and os.path.exists(_target_dir) else "."

print("Using repo path:", os.path.abspath(REPO_PATH))
os.chdir(REPO_PATH)

# Install editable, using the same interpreter that runs this notebook.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", "."])

### Comparing vs. `torch.nn.functional.conv2d`

Here, we test the kernel against a few different shapes as a first-run sanity check.

In [None]:
import torch
import torch.nn.functional as F
import random

import mydnn

def check_case(N=2, C=3, H=16, W=17, K=4, R=3, S=3, sh=1, sw=1, ph=1, pw=1, seed=0):
    """Compare ``mydnn.conv2d_naive`` with ``F.conv2d`` on one random problem.

    Random float32 CUDA tensors are generated for an ``(N, C, H, W)`` input
    and a ``(K, C, R, S)`` filter bank after seeding torch with ``seed``.
    Both implementations run with stride ``(sh, sw)`` and padding
    ``(ph, pw)``, and the error summary is printed.

    Returns:
        Tuple ``(max_abs, max_rel)`` of Python floats: the largest absolute
        difference and the largest relative difference to the reference.
    """
    torch.manual_seed(seed)
    tensor_opts = {"device": "cuda", "dtype": torch.float32}
    inputs = torch.randn(N, C, H, W, **tensor_opts)
    filters = torch.randn(K, C, R, S, **tensor_opts)

    expected = F.conv2d(inputs, filters, stride=(sh, sw), padding=(ph, pw))
    actual = mydnn.conv2d_naive(inputs, filters, sh, sw, ph, pw)

    abs_err = (actual - expected).abs()
    max_abs = abs_err.max().item()
    # The 1e-6 term keeps the ratio finite where the reference is zero.
    max_rel = (abs_err / (expected.abs() + 1e-6)).max().item()

    problem = f"Shape NCHW=({N},{C},{H},{W}) KRS=({K},{R},{S}) stride=({sh},{sw}) pad=({ph},{pw})"
    print(f"{problem}  -> max_abs={max_abs:.2e}, max_rel={max_rel:.2e}")
    return max_abs, max_rel

# Smoke-test problems, each laid out as (N, C, H, W, K, R, S, sh, sw, ph, pw).
cases = [
    (1, 1, 8, 8, 1, 3, 3, 1, 1, 0, 0),
    (2, 3, 16, 17, 4, 3, 3, 1, 1, 1, 1),
    (4, 8, 15, 13, 8, 5, 5, 2, 2, 1, 1),
]
# Every case uses the same seed.
for N, C, H, W, K, R, S, sh, sw, ph, pw in cases:
    check_case(
        N=N, C=C, H=H, W=W,
        K=K, R=R, S=S,
        sh=sh, sw=sw,
        ph=ph, pw=pw,
        seed=42,
    )

## Benchmarking my layer vs. PyTorch native

Here I am benchmarking the time it takes to crunch a conv2d using my layer vs native PyTorch.

In [None]:
import math
import time
import torch

def time_it(fn, iters=20, warmup=5):
    """Return the mean wall-clock time of one ``fn()`` call, in milliseconds.

    ``fn`` is first called ``warmup`` times without timing, then ``iters``
    times under the clock. When a CUDA device is available, the device is
    synchronized right before the clock starts and right before it stops so
    that queued GPU work is included in the measurement. Without CUDA the
    synchronization is skipped, so CPU-only callables can be timed too.

    Args:
        fn: Zero-argument callable to benchmark; its return value is ignored.
        iters: Number of timed calls. Must be at least 1.
        warmup: Number of untimed calls made before timing starts.

    Returns:
        Average time per call in milliseconds, as a float.

    Raises:
        ValueError: If ``iters`` is less than 1.
    """
    if iters < 1:
        # Zero would divide by zero below; a negative count would silently
        # report a negative time.
        raise ValueError(f"iters must be >= 1, got {iters}")

    # torch.cuda.synchronize() raises on hosts without a usable CUDA device.
    use_cuda = torch.cuda.is_available()

    # Warmup
    for _ in range(warmup):
        fn()
    if use_cuda:
        torch.cuda.synchronize()

    # perf_counter() is monotonic and high resolution; time.time() follows the
    # system clock, which can be adjusted in the middle of a measurement.
    start = time.perf_counter()
    for _ in range(iters):
        fn()
    if use_cuda:
        torch.cuda.synchronize()
    elapsed_s = time.perf_counter() - start

    return elapsed_s * 1000.0 / iters

def flops_conv2d(N,C,H,W,K,R,S, Ho,Wo):
    """Approximate the floating-point operation count of one conv2d call.

    Each of the ``N * Ho * Wo * K`` output elements is counted as
    ``C * R * S`` multiply-adds, with a multiply-add counted as 2 operations.
    ``H`` and ``W`` are accepted but do not enter the result.

    Returns:
        The operation count as a float.
    """
    flops_per_output = C * R * S * 2.0
    output_elements = N * Ho * Wo * K
    return output_elements * flops_per_output

def bench_case(N,C,H,W,K,R,S, sh,sw, ph,pw, seed=0):
    """Time ``mydnn.conv2d_naive`` against ``torch.nn.functional.conv2d``.

    Random float32 CUDA tensors are created for an ``(N, C, H, W)`` input and
    a ``(K, C, R, S)`` filter bank after seeding torch with ``seed``. Each
    implementation is timed with ``time_it`` (mydnn first, then torch), the
    timings are converted to GFLOP/s via ``flops_conv2d``, and a short report
    is printed.

    Returns:
        Dict holding the problem parameters, the output size (``Ho``,
        ``Wo``), the per-call times in milliseconds, and the throughput in
        GFLOP/s for both implementations.
    """
    torch.manual_seed(seed)
    inputs = torch.randn(N, C, H, W, device="cuda", dtype=torch.float32)
    filters = torch.randn(K, C, R, S, device="cuda", dtype=torch.float32)

    # Output spatial size for the given stride and padding (floor division).
    out_h = (H + 2 * ph - R) // sh + 1
    out_w = (W + 2 * pw - S) // sw + 1

    ms_mydnn = time_it(
        lambda: mydnn.conv2d_naive(inputs, filters, sh, sw, ph, pw)
    )
    ms_torch = time_it(
        lambda: torch.nn.functional.conv2d(
            inputs, filters, stride=(sh, sw), padding=(ph, pw)
        )
    )

    # Work per call in GFLOP, divided by seconds per call gives GFLOP/s.
    work_gflop = flops_conv2d(N, C, H, W, K, R, S, out_h, out_w) / 1e9
    rate_mydnn = work_gflop / (ms_mydnn / 1000.0)
    rate_torch = work_gflop / (ms_torch / 1000.0)

    print(f"[NCHW=({N},{C},{H},{W}) KRS=({K},{R},{S}) stride=({sh},{sw}) pad=({ph},{pw})]")
    for label, ms, rate in (("mydnn", ms_mydnn, rate_mydnn), ("torch", ms_torch, rate_torch)):
        print(f"  {label}: {ms:.2f} ms   ~ {rate:.2f} GFLOP/s")

    summary = dict(N=N, C=C, H=H, W=W, K=K, R=R, S=S)
    summary.update(sh=sh, sw=sw, ph=ph, pw=pw)
    summary.update(Ho=out_h, Wo=out_w)
    summary["t_mydnn_ms"] = ms_mydnn
    summary["t_torch_ms"] = ms_torch
    summary["mydnn_gflops"] = rate_mydnn
    summary["torch_gflops"] = rate_torch
    return summary

## Concluding notes

- The naive kernel that I wrote is ***much*** slower than PyTorch, and that makes sense: I wrote a thin, straightforwardly parallelized conv2d layer, whereas native PyTorch uses optimized cuDNN/cuBLAS kernels under the hood, together with an autotuner (a smart router).