In [1]:
import torch
import time
from torch import nn
from torch.cuda import amp
try:
    import pynvml
    pynvml.nvmlInit()
    NVML = True
except ImportError:
    NVML = False

# ──────────────────────────────────────────────────────────────────────────────
# 1) Basic device & library versions
# ──────────────────────────────────────────────────────────────────────────────
print("PyTorch version:",      torch.__version__)
print("Built with CUDA:",      torch.version.cuda)
print("cuDNN version:",        torch.backends.cudnn.version())
print("Device count:",         torch.cuda.device_count())
print("Device name:",          torch.cuda.get_device_name(0))
print("Compute capability:",   torch.cuda.get_device_capability(0))
print()

# ──────────────────────────────────────────────────────────────────────────────
# 2) Memory info (using NVML if available)
# ──────────────────────────────────────────────────────────────────────────────
if NVML:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print(f"Total memory: {meminfo.total/1e9:.2f} GB")
    print(f"Free memory : {meminfo.free/1e9:.2f} GB")
    print(f"Used memory : {meminfo.used/1e9:.2f} GB")
else:
    tot, free = torch.cuda.get_device_properties(0).total_memory, torch.cuda.mem_get_info()[0]
    print(f"Total memory: {tot/1e9:.2f} GB  Free (approx): {free/1e9:.2f} GB")
print()

# ──────────────────────────────────────────────────────────────────────────────
# 3) CPU→GPU bandwidth test
# ──────────────────────────────────────────────────────────────────────────────
def cpu_to_gpu_bw(size_mb=512):
    x = torch.randn(size_mb*256*1024//4, device="cpu")  # ~size_mb MB
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    y = x.to("cuda")
    torch.cuda.synchronize()
    t1 = time.perf_counter()
    return size_mb / (t1 - t0), y  # MB/s

bw, _ = cpu_to_gpu_bw(512)
print(f"CPU→GPU bandwidth: {bw:.0f} MB/s")
print()

# ──────────────────────────────────────────────────────────────────────────────
# 4) Throughput: FP32 vs FP16 matrix-multiply
# ──────────────────────────────────────────────────────────────────────────────
def matmul_bench(dtype, steps=20):
    torch.cuda.empty_cache()
    a = torch.randn(2048, 2048, device="cuda", dtype=dtype)
    b = torch.randn(2048, 2048, device="cuda", dtype=dtype)
    # Warm-up
    for _ in range(5):
        _ = a @ b
    torch.cuda.synchronize()
    # Timed
    t0 = time.perf_counter()
    for _ in range(steps):
        c = a @ b
    torch.cuda.synchronize()
    t1 = time.perf_counter()
    t_avg = (t1 - t0) / steps
    gflops = 2 * 2048**3 / (t_avg * 1e9)
    return t_avg*1e3, gflops

t32, g32 = matmul_bench(torch.float32)
t16, g16 = matmul_bench(torch.float16)
print(f"FP32 matmul: {t32:.1f} ms ≈ {g32:.0f} GFLOPS")
print(f"FP16 matmul: {t16:.1f} ms ≈ {g16:.0f} GFLOPS (Tensor Cores)")
print()

# ──────────────────────────────────────────────────────────────────────────────
# 5) cuDNN-fused LSTM throughput
# ──────────────────────────────────────────────────────────────────────────────
batch, seq, feat, hid = 64, 100, 128, 512
lstm = nn.LSTM(feat, hid, batch_first=True).cuda()
inp  = torch.randn(batch, seq, feat, device="cuda")
torch.cuda.synchronize()
# Warm-up
for _ in range(3):
    out, _ = lstm(inp)
torch.cuda.synchronize()
# Timed
t0 = time.perf_counter()
for _ in range(10):
    out, _ = lstm(inp)
torch.cuda.synchronize()
t1 = time.perf_counter()
t_avg = (t1 - t0)/10
print(f"LSTM forward: {t_avg*1e3:.1f} ms (batch={batch}, seq={seq})")
print()

# ──────────────────────────────────────────────────────────────────────────────
# 6) Simple profiler demo
# ──────────────────────────────────────────────────────────────────────────────
print("Profiling a small mixed-precision block:")
with torch.profiler.profile(
        schedule=torch.profiler.schedule(
            wait=1, warmup=1, active=3, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler("./profiler"),
        record_shapes=True,
        with_stack=True) as prof:
    scaler = amp.GradScaler()
    optimizer = torch.optim.Adam(lstm.parameters(), lr=1e-3)
    for step in range(5):
        optimizer.zero_grad()
        with amp.autocast():
            out, _ = lstm(inp)
            loss = out.mean()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        prof.step()
print("Profile saved to ./profiler – view with TensorBoard")


PyTorch version: 2.4.0a0+3bcc3cddb5.nv24.07
Built with CUDA: 12.5
cuDNN version: 90201
Device count: 1
Device name: NVIDIA GeForce RTX 5080 Laptop GPU
Compute capability: (12, 0)

Total memory: 17.09 GB
Free memory : 16.68 GB
Used memory : 0.42 GB



NVIDIA GeForce RTX 5080 Laptop GPU with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_70 sm_72 sm_75 sm_80 sm_86 sm_87 sm_90 compute_90.
If you want to use the NVIDIA GeForce RTX 5080 Laptop GPU GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



CPU→GPU bandwidth: 24797 MB/s

FP32 matmul: 0.6 ms ≈ 30029 GFLOPS
FP16 matmul: 1.2 ms ≈ 13756 GFLOPS (Tensor Cores)

LSTM forward: 11.6 ms (batch=64, seq=100)

Profiling a small mixed-precision block:


  scaler = amp.GradScaler()
  with amp.autocast():
INFO:2025-07-03 16:00:41 58:58 init.cpp:177] If you see CUPTI_ERROR_INSUFFICIENT_PRIVILEGES, refer to https://developer.nvidia.com/nvidia-development-tools-solutions-err-nvgpuctrperm-cupti


Profile saved to ./profiler – view with TensorBoard
