In [1]:
import torch

In [5]:
import os
from pathlib import Path

# CUDA implementation of the extension, read from a path relative to the
# current working directory.
cuda_source = Path("empty_kernel.cu").read_text()
# C++ declarations of the functions that get bound to Python; presumably they
# are defined in empty_kernel.cu (not shown here) and must match it exactly.
cpp_source = """
torch::Tensor my_empty(torch::Tensor& input);
torch::Tensor my_empty_out(torch::Tensor& input, torch::Tensor output);
"""
# Point the extension builder at a CUDA toolkit. An already-exported CUDA_HOME
# is respected; the path below is only a machine-specific fallback, so adjust
# it (or export CUDA_HOME before starting the kernel) on other systems.
os.environ.setdefault("CUDA_HOME", "/public/apps/cuda/12.1")

In [6]:
from torch.utils.cpp_extension import load_inline

# JIT-compile the C++/CUDA sources into an extension module and bind the two
# listed functions, which become module.my_empty and module.my_empty_out.
module = load_inline(
    name="my_empty",
    functions=["my_empty", "my_empty_out"],
    cpp_sources=cpp_source,
    cuda_sources=cuda_source,
    with_cuda=True,
    extra_cuda_cflags=["-O2"],
    # Optional: add build_directory="./cuda_build" to choose the build workspace.
)

If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


In [7]:
# Seed the RNG so the random benchmark input is the same on every run.
torch.manual_seed(42)
# 1024 x 1024 tensor of standard-normal samples, created on the CUDA device.
x = torch.randn((1024, 1024), device="cuda")

In [8]:
# Time one extension call followed by a device-wide synchronize, so each sample
# waits for the queued GPU work instead of stopping right after the call returns.
# The same tensor `x` is passed as both the input and the output argument.
%timeit module.my_empty_out(x, x); torch.cuda.synchronize()

# Run the same call + synchronize pair under the PyTorch profiler to get a
# per-event breakdown of where the time goes.
with torch.profiler.profile() as prof:
    for i in range(10_000):
        module.my_empty_out(x, x)
        # Synchronize on every iteration, mirroring the %timeit statement above.
        torch.cuda.synchronize()
# Print the aggregated per-event averages as a plain-text table.
print(prof.key_averages().table())

14.4 µs ± 15.1 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


STAGE:2024-02-22 11:33:54 581800:581800 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-02-22 11:33:54 581800:581800 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-02-22 11:33:54 581800:581800 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


----------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
----------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                        cudaLaunchKernel        78.27%      37.170ms        78.27%      37.170ms       3.717us       0.000us         0.00%       0.000us       0.000us         10000  
    my_empty_kernel(float*, float*, int)         0.00%       0.000us         0.00%       0.000us       0.000us      70.034ms       100.00%      70.034ms       7.003us         10000  
                   cudaDeviceSynchronize        21.73%      10.317ms        21.73%  