In [None]:
###########################################
########      PYTORCH       ###############
###########################################

In [9]:
#!/usr/bin/env python3
import time
import platform
import torch

# Allow TF32 on matmul/cublas and on cuDNN convs
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Let cuDNN autotune the best algorithm
torch.backends.cudnn.benchmark = True

try:
    import pynvml
    pynvml.nvmlInit()
    use_nvml = True
except ImportError:
    use_nvml = False

REFERENCES = {
    "device_name":    "NVIDIA GeForce RTX 5080 Laptop GPU",
    "compute_cap":    "(12, 0)",          # Blackwell sm_120
    "total_mem_gb":   (12, 24),           # GB

    # WSL2 real‐world → CPU↔GPU BW
    "bw_cpu_gpu":     (10000, 20000),       # MB/s
    "bw_gpu_cpu":     (10000, 30000),     # MB/s

    # Native‐Linux specs for on‐GPU tests:
    "mem_triad":      (400_000, 600_000), # MB/s
    "vec_add":        (50, 100),          # GEOPS
    "mat32":          (20000, 40000),     # GFLOPS
    "mat16":          (40000, 80000),     # GFLOPS
    "conv2d":         (20000, 40000),     # imgs/s
    "lstm":           (2.0, 6.0),         # ms
}


def header(title):
    print(f"\n{title}\n{'='*len(title)}")

def report_environment():
    header("ENVIRONMENT")
    print(f"Python       {platform.python_version()}")
    print(f"OS           {platform.system()} {platform.release()}")
    print(f"Reference    {REFERENCES['device_name']} specs")
    if use_nvml:
        drv = pynvml.nvmlSystemGetDriverVersion().decode()
        print(f"Driver       {drv}")
        cnt = pynvml.nvmlDeviceGetCount()
        print(f"GPU Count    {cnt}")
        for i in range(cnt):
            h = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(h).decode()
            cap = torch.cuda.get_device_capability(i)
            mem = pynvml.nvmlDeviceGetMemoryInfo(h)
            print(f"Device[{i}]   {name}")
            print(f"  ComputeCap  {cap}  Reference {REFERENCES['compute_cap']}")
            print(f"  Memory      {mem.total/1e9:.1f} GB  Expected {REFERENCES['total_mem_gb']} GB")
    else:
        cnt = torch.cuda.device_count()
        print(f"GPU Count    {cnt}")
        for i in range(cnt):
            name = torch.cuda.get_device_name(i)
            cap = torch.cuda.get_device_capability(i)
            props = torch.cuda.get_device_properties(i)
            print(f"Device[{i}]   {name}")
            print(f"  ComputeCap  {cap}  Reference {REFERENCES['compute_cap']}")
            print(f"  Memory      {props.total_memory/1e9:.1f} GB  Expected {REFERENCES['total_mem_gb']} GB")

def measure_bandwidth(src, dst, size_mb=512):
    # calculate exact number of float32 elements for size_mb bytes
    n = size_mb * 1024 * 1024 // 4
    x = torch.randn(n, device=src)
    if src == "cpu":
        x = x.pin_memory()
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    _ = x.to(dst, non_blocking=True)
    torch.cuda.synchronize()
    return size_mb / (time.perf_counter() - t0)

def cpu_to_gpu_bw():
    header("1) CPU → GPU Bandwidth")
    lo, hi = REFERENCES["bw_cpu_gpu"]
    bw = measure_bandwidth("cpu", "cuda", size_mb=512)
    print(f"Measured     {bw:.0f} MB/s  Expected {lo:.0f}–{hi:.0f} MB/s")

def gpu_to_cpu_bw():
    header("2) GPU → CPU Bandwidth")
    size_mb = 512
    n = size_mb * 1024 * 1024 // 4
    x = torch.randn(n, device="cuda")
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    _ = x.to("cpu", non_blocking=True)
    torch.cuda.synchronize()
    bw = size_mb / (time.perf_counter() - t0)
    lo, hi = REFERENCES["bw_gpu_cpu"]
    print(f"Measured     {bw:.0f} MB/s  Expected {lo:.0f}–{hi:.0f} MB/s")

def memory_triad():
    header("3) GPU Memory Triad (A+B*scalar)")
    n = 200_000_000
    a = torch.randn(n, device="cuda")
    b = torch.randn(n, device="cuda")
    scalar = 0.123
    for _ in range(3):
        _ = a + b * scalar
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(10):
        _ = a + b * scalar
    torch.cuda.synchronize()
    dt = (time.perf_counter() - t0)/10
    mb = 3 * n * 4 / 1e6
    bw = mb / dt
    lo, hi = REFERENCES["mem_triad"]
    print(f"Measured     {bw:.0f} MB/s  Expected {lo:.0f}–{hi:.0f} MB/s")

def vector_add():
    header("4) GPU Vector Add")
    n = 100_000_000
    a = torch.randn(n, device="cuda")
    b = torch.randn(n, device="cuda")
    for _ in range(3):
        _ = a + b
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(10):
        _ = a + b
    torch.cuda.synchronize()
    dt = (time.perf_counter() - t0)/10
    geops = n / dt / 1e9
    lo, hi = REFERENCES["vec_add"]
    print(f"Measured     {geops:.0f} GEOPS  Expected {lo:.0f}–{hi:.0f} GEOPS")

def matmul_bench(dtype, label, key, steps=20):
    a = torch.randn(2048, 2048, device="cuda", dtype=dtype)
    b = torch.randn(2048, 2048, device="cuda", dtype=dtype)
    for _ in range(5): _ = a @ b
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(steps): _ = a @ b
    torch.cuda.synchronize()
    dt = (time.perf_counter() - t0) / steps
    gflops = 2 * 2048**3 / (dt * 1e9)
    lo, hi = REFERENCES[key]
    print(f"{label}: Measured {gflops:.0f} GFLOPS  Expected {lo:.0f}–{hi:.0f} GFLOPS")

def conv2d_bench():
    header("5) cuDNN Conv2D Throughput")
    inp = torch.randn(32, 3, 224, 224, device="cuda")
    conv = torch.nn.Conv2d(3, 64, 7, stride=2, padding=3).cuda()
    for _ in range(5): conv(inp)
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(20): conv(inp)
    torch.cuda.synchronize()
    dt = (time.perf_counter() - t0) / 20
    imgs_s = 32 / dt
    lo, hi = REFERENCES["conv2d"]
    print(f"Measured     {imgs_s:.0f} imgs/s  Expected {lo:.0f}–{hi:.0f} imgs/s")

def lstm_bench():
    header("6) cuDNN-Fused LSTM Latency")
    B, S, F, H = 64, 100, 128, 512
    m = torch.nn.LSTM(F, H, batch_first=True).cuda()
    inp = torch.randn(B, S, F, device="cuda")
    for _ in range(3): m(inp)
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(10): m(inp)
    torch.cuda.synchronize()
    ms = (time.perf_counter() - t0) * 1e3 / 10
    lo, hi = REFERENCES["lstm"]
    print(f"Measured     {ms:.1f} ms  Expected {lo:.1f}–{hi:.1f} ms")

def run_benchmark(mat_size, dtype, n_iters=50):
    # Prepare random matrices
    A = torch.randn((mat_size, mat_size), device='cuda', dtype=dtype)
    B = torch.randn((mat_size, mat_size), device='cuda', dtype=dtype)
    torch.cuda.synchronize()
    # Warmup
    for _ in range(5):
        _ = torch.mm(A, B)
    torch.cuda.synchronize()
    # Timed loop
    start = time.perf_counter()
    for _ in range(n_iters):
        _ = torch.mm(A, B)
    torch.cuda.synchronize()
    elapsed = time.perf_counter() - start
    # Compute TFLOPS: 2·N³ operations per multiplication
    ops = 2 * (mat_size ** 3) * n_iters
    tflops = ops / elapsed / 1e12
    return tflops

for dtype in (torch.float32, torch.float16):
    tflops = run_benchmark(4096, dtype)
    print(f"{dtype}: {tflops:.2f} TFLOPS")

# Run all tests
report_environment()
cpu_to_gpu_bw()
gpu_to_cpu_bw()
memory_triad()
vector_add()
header("7) MATRIX MULTIPLY THROUGHPUT")
matmul_bench(torch.float32, "FP32", "mat32")
matmul_bench(torch.float16, "FP16", "mat16")
conv2d_bench()
lstm_bench()


torch.float32: 34.81 TFLOPS
torch.float16: 73.34 TFLOPS

ENVIRONMENT
Python       3.12.3
OS           Linux 6.6.87.2-microsoft-standard-WSL2
Reference    NVIDIA GeForce RTX 5080 Laptop GPU specs
Driver       576.80
GPU Count    1
Device[0]   NVIDIA GeForce RTX 5080 Laptop GPU
  ComputeCap  (12, 0)  Reference (12, 0)
  Memory      17.1 GB  Expected (12, 24) GB

1) CPU → GPU Bandwidth
Measured     26901 MB/s  Expected 10000–20000 MB/s

2) GPU → CPU Bandwidth
Measured     26457 MB/s  Expected 10000–30000 MB/s

3) GPU Memory Triad (A+B*scalar)
Measured     316972 MB/s  Expected 400000–600000 MB/s

4) GPU Vector Add
Measured     45 GEOPS  Expected 50–100 GEOPS

7) MATRIX MULTIPLY THROUGHPUT
FP32: Measured 33693 GFLOPS  Expected 20000–40000 GFLOPS
FP16: Measured 64630 GFLOPS  Expected 40000–80000 GFLOPS

5) cuDNN Conv2D Throughput
Measured     30683 imgs/s  Expected 20000–40000 imgs/s

6) cuDNN-Fused LSTM Latency
Measured     2.2 ms  Expected 2.0–6.0 ms


In [None]:
###########################################
########      TENSORFLOW    ###############
###########################################

In [5]:
#!/usr/bin/env python3
import time, platform
import numpy as np
import tensorflow as tf

# get the dict of build‐time settings
build = tf.sysconfig.get_build_info()

print("TF version   ", tf.__version__)
print("CUDA version ", build["cuda_version"])
print("cuDNN version", build["cudnn_version"])

# list visible GPUs
gpus = tf.config.list_physical_devices("GPU")
print("GPUs          ", gpus)

# and per‐GPU details (compute capability, name) via the device details API
if gpus:
    details = tf.config.experimental.get_device_details(gpus[0])
    print("Device name  ", details["device_name"])
    print("Compute cap. ", details["compute_capability"])

#!/usr/bin/env python3
import time, platform
import numpy as np
import tensorflow as tf

# EXACTLY the same spec‐based ranges as PyTorch
REFERENCES = {
    "bw_cpu_gpu": (10000, 20000),   # MB/s (WSL2 measured)
    "bw_gpu_cpu": (10000,30000),   # MB/s
    "mem_triad":  (400_000,600_000),# MB/s
    "vec_add":    (50,  100),      # GEOPS
    "mat32":      (20000,40000),   # GFLOPS
    "mat16":      (40000,80000),   # GFLOPS
    "conv2d":     (20000,40000),   # imgs/s
    "lstm":       (2.0,   6.0),    # ms
}

def header(title):
    print(f"\n{title}\n{'='*len(title)}")

def report_env():
    header("ENVIRONMENT")
    print("Python     ", platform.python_version())
    print("OS         ", platform.system(), platform.release())
    print("TensorFlow ", tf.__version__)
    build = tf.sysconfig.get_build_info()
    print("CUDA       ", build["cuda_version"])
    print("cuDNN      ", build["cudnn_version"])
    gpus = tf.config.list_physical_devices("GPU")
    print("GPU Count  ", len(gpus))
    if gpus:
        d = tf.config.experimental.get_device_details(gpus[0])
        print("Device     ", d["device_name"], "cap=", d["compute_capability"])

def cpu_to_gpu_bw(size_mb=512):
    header("1) CPU → GPU Bandwidth")
    # 512 MB of float32 → element count
    n = size_mb * 1024 * 1024 // 4
    host = np.random.rand(n).astype(np.float32)

    t0 = time.perf_counter()
    # force copy onto GPU
    with tf.device("/GPU:0"):
        dev = tf.constant(host)
    _ = dev.numpy()   # sync back
    bw = size_mb / (time.perf_counter() - t0)

    lo, hi = REFERENCES["bw_cpu_gpu"]
    print(f"Measured   {bw:.0f} MB/s  Expected {lo}–{hi}")

def gpu_to_cpu_bw(size_mb=512):
    header("2) GPU → CPU Bandwidth")
    n = size_mb * 1024 * 1024 // 4
    with tf.device("/GPU:0"):
        dev = tf.random.uniform([n], dtype=tf.float32)

    t0 = time.perf_counter()
    # pull back to host
    host = dev.numpy()
    bw = size_mb / (time.perf_counter() - t0)

    lo, hi = REFERENCES["bw_gpu_cpu"]
    print(f"Measured   {bw:.0f} MB/s  Expected {lo}–{hi}")

def memory_triad():
    header("3) GPU Memory Triad (A + B*scalar)")
    n = 200_000_000
    with tf.device("/GPU:0"):
        A = tf.random.normal([n])
        B = tf.random.normal([n])
    scalar = 0.123

    # warm-up
    for _ in range(3):
        _ = A + B * scalar
    t0 = time.perf_counter()
    for _ in range(10):
        _ = A + B * scalar
    dt = (time.perf_counter() - t0) / 10

    # 3 streams × n elements × 4 bytes → MB
    mb = 3 * n * 4 / 1e6
    bw = mb / dt
    lo, hi = REFERENCES["mem_triad"]
    print(f"Measured   {bw:.0f} MB/s  Expected {lo}–{hi}")

def vector_add():
    header("4) GPU Vector Add")
    n = 100_000_000
    with tf.device("/GPU:0"):
        A = tf.random.normal([n])
        B = tf.random.normal([n])

    # warm-up
    for _ in range(3):
        _ = A + B
    t0 = time.perf_counter()
    for _ in range(10):
        _ = A + B
    dt = (time.perf_counter() - t0) / 10

    geops = n / dt / 1e9
    lo, hi = REFERENCES["vec_add"]
    print(f"Measured   {geops:.0f} GEOPS  Expected {lo}–{hi}")

def matmul_bench():
    header("5) MATRIX MULTIPLY THROUGHPUT")
    for label, dtype, key in [("FP32", tf.float32, "mat32"),
                              ("FP16", tf.float16, "mat16")]:
        with tf.device("/GPU:0"):
            A = tf.random.normal([2048,2048], dtype=dtype)
            B = tf.random.normal([2048,2048], dtype=dtype)
        # warm-up
        for _ in range(5):
            _ = tf.linalg.matmul(A, B).numpy()
        # timed
        t0 = time.perf_counter()
        for _ in range(20):
            _ = tf.linalg.matmul(A, B).numpy()
        dt = (time.perf_counter() - t0) / 20

        # GFLOPS = 2·N³ / (dt·1e9)
        gflops = 2 * 2048**3 / (dt * 1e9)
        lo, hi = REFERENCES[key]
        print(f"{label}:     {gflops:.0f} GFLOPS  Expected {lo}–{hi}")

def conv2d_bench():
    header("6) cuDNN Conv2D Throughput")
    with tf.device("/GPU:0"):
        X = tf.random.normal([32,224,224,3])
        conv = tf.keras.layers.Conv2D(64, 7, strides=2, padding="same")
    # warm-up
    for _ in range(5):
        _ = conv(X).numpy()
    # timed
    t0 = time.perf_counter()
    for _ in range(20):
        _ = conv(X).numpy()
    dt = (time.perf_counter() - t0) / 20

    imgs = 32 / dt
    lo, hi = REFERENCES["conv2d"]
    print(f"Measured   {imgs:.0f} imgs/s  Expected {lo}–{hi}")

def lstm_bench():
    header("7) cuDNN-Fused LSTM Latency")
    try:
        with tf.device("/GPU:0"):
            lstm = tf.compat.v1.keras.layers.CuDNNLSTM(512)
            X    = tf.random.normal([64,100,128])
        # warm-up
        for _ in range(3):
            _ = lstm(X).numpy()
        # timed
        t0 = time.perf_counter()
        for _ in range(10):
            _ = lstm(X).numpy()
        ms = (time.perf_counter() - t0) * 1e3 / 10

        lo, hi = REFERENCES["lstm"]
        print(f"Measured   {ms:.1f} ms     Expected {lo:.1f}–{hi:.1f} ms")
    except Exception as e:
        print("⚠️  LSTM skipped:", e)

report_env()
cpu_to_gpu_bw()
gpu_to_cpu_bw()
matmul_bench()
conv2d_bench()
lstm_bench()


TF version    2.17.0
CUDA version  12.8
cuDNN version 9
GPUs           [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Device name   NVIDIA GeForce RTX 5080 Laptop GPU
Compute cap.  (12, 0)

ENVIRONMENT
Python      3.12.3
OS          Linux 6.6.87.2-microsoft-standard-WSL2
TensorFlow  2.17.0
CUDA        12.8
cuDNN       9
GPU Count   1
Device      NVIDIA GeForce RTX 5080 Laptop GPU cap= (12, 0)

1) CPU → GPU Bandwidth


I0000 00:00:1751754172.988245      81 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1751754172.989065      81 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


Measured   829 MB/s  Expected 10000–20000

2) GPU → CPU Bandwidth
Measured   1996 MB/s  Expected 10000–30000

5) MATRIX MULTIPLY THROUGHPUT
FP32:     1589 GFLOPS  Expected 20000–40000
FP16:     4203 GFLOPS  Expected 40000–80000

6) cuDNN Conv2D Throughput
Measured   546 imgs/s  Expected 20000–40000

7) cuDNN-Fused LSTM Latency
⚠️  LSTM skipped: Exception encountered when calling layer 'cu_dnnlstm_2' (type CuDNNLSTM).

{{function_node __wrapped__CudnnRNNV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} No algorithm worked!
	 [[{{node CudnnRNNV2}}]] [Op:CudnnRNNV2]

Call arguments received by layer 'cu_dnnlstm_2' (type CuDNNLSTM):
  • inputs=tf.Tensor(shape=(64, 100, 128), dtype=float32)
  • mask=None
  • training=None
  • initial_state=None


2025-07-05 22:22:57.895046: W tensorflow/core/framework/op_kernel.cc:1840] OP_REQUIRES failed at cudnn_rnn_ops.cc:1756 : INTERNAL: No algorithm worked!
