In [1]:
import torch

In [3]:
# Tiny float tensor used to sanity-check the different ways of squaring below.
a = torch.tensor((1.0, 2.0, 3.0))
a

tensor([1., 2., 3.])

In [4]:
# Three spellings of the same squaring operation, printed one per line in the
# order: torch.square, the ** operator, and plain multiplication.
for squared in (torch.square(a), a**2, a * a):
    print(squared)

tensor([1., 4., 9.])
tensor([1., 4., 9.])
tensor([1., 4., 9.])


In [8]:
import typing as tp


def measure_square(
    func: tp.Callable,
    input: torch.Tensor,
    warmup: int = 5,
    repeats: int = 1,
) -> float:
    """Time ``func(input)`` using a pair of CUDA events.

    Args:
        func: Callable invoked as ``func(input)``; its return value is discarded.
        input: Tensor handed to ``func`` on every call.
        warmup: Number of untimed calls made before the measurement starts.
            Defaults to 5.
        repeats: Number of timed calls made between the two events. Defaults
            to 1.

    Returns:
        ``start.elapsed_time(end)`` divided by ``repeats``, i.e. the mean time
        per timed call (milliseconds, per the torch.cuda.Event documentation).

    Raises:
        ValueError: If ``warmup`` is negative or ``repeats`` is less than 1.
    """
    if warmup < 0:
        raise ValueError(f"warmup must be >= 0, got {warmup}")
    if repeats < 1:
        raise ValueError(f"repeats must be >= 1, got {repeats}")

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # Untimed calls so one-off start-up costs are not part of the measurement.
    for _ in range(warmup):
        func(input)

    start.record()
    for _ in range(repeats):
        func(input)
    end.record()
    # Wait for all queued work to finish before reading the elapsed time
    # between the two events.
    torch.cuda.synchronize()
    return start.elapsed_time(end) / repeats

In [6]:
# Benchmark input: a 10000 x 10000 random tensor created directly on the CUDA
# device, instead of being built on the host first and then copied over with
# .cuda().
b = torch.randn(size=(10000, 10000), device="cuda")

In [9]:
# Time the built-in torch.square on b; the cell displays the returned timing.
measure_square(func=torch.square, input=b)

2.4678399562835693

In [10]:
def square_as_multiply(a: torch.Tensor):
    """Square ``a`` by multiplying it with itself using the ``*`` operator."""
    product = a * a
    return product


# Time the a * a variant on b; the cell displays the returned timing.
measure_square(func=square_as_multiply, input=b)

2.4770560264587402

In [11]:
def square_as_tensor(a: torch.Tensor):
    """Square ``a`` by raising it to the power 2 with the ``**`` operator."""
    squared = a**2
    return squared


# Time the a**2 variant on b; the cell displays the returned timing.
measure_square(func=square_as_tensor, input=b)

2.4688639640808105

In [12]:
# Profile a single torch.square(b) call with CUDA timing enabled.
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    torch.square(b)

# Print the aggregated profile: at most 10 rows, sorted by total CUDA time.
print(
    prof.key_averages().table(
        sort_by="cuda_time_total",
        row_limit=10,
    )
)

-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
             aten::square        19.16%       1.032ms        51.99%       2.800ms       2.800ms       1.035ms        19.76%       5.239ms       5.239ms             1  
                aten::pow        31.25%       1.683ms        32.51%       1.751ms       1.751ms       4.174ms        79.67%       4.204ms       4.204ms             1  
        aten::result_type         0.07%       4.000us         0.07%       4.000us       4.000us      16.000us         0.31%      16.000us      16.000us        

STAGE:2024-02-19 22:06:51 4172098:4172098 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-02-19 22:06:51 4172098:4172098 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-02-19 22:06:51 4172098:4172098 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [13]:
# Profile a single square_as_multiply(b) call with CUDA timing enabled.
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    square_as_multiply(b)

# Print the aggregated profile: at most 10 rows, sorted by total CUDA time.
print(
    prof.key_averages().table(
        sort_by="cuda_time_total",
        row_limit=10,
    )
)

-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::mul         0.90%     144.000us         1.09%     174.000us     174.000us       2.648ms       100.00%       2.648ms       2.648ms             1  
          cudaEventRecord        84.02%      13.459ms        84.02%      13.459ms       6.729ms       0.000us         0.00%       0.000us       0.000us             2  
         cudaLaunchKernel         0.19%      30.000us         0.19%      30.000us      30.000us       0.000us         0.00%       0.000us       0.000us        

STAGE:2024-02-19 22:07:18 4172098:4172098 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-02-19 22:07:18 4172098:4172098 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-02-19 22:07:18 4172098:4172098 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [14]:
# Profile a single square_as_tensor(b) call with CUDA timing enabled.
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    square_as_tensor(b)

# Print the aggregated profile: at most 10 rows, sorted by total CUDA time.
print(
    prof.key_averages().table(
        sort_by="cuda_time_total",
        row_limit=10,
    )
)

-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::pow        10.99%      83.000us        17.48%     132.000us     132.000us       1.462ms        99.25%       1.473ms       1.473ms             1  
        aten::result_type         0.40%       3.000us         0.40%       3.000us       3.000us       7.000us         0.48%       7.000us       7.000us             1  
                 aten::to         0.00%       0.000us         0.00%       0.000us       0.000us       4.000us         0.27%       4.000us       4.000us        

STAGE:2024-02-19 22:07:20 4172098:4172098 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-02-19 22:07:20 4172098:4172098 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-02-19 22:07:20 4172098:4172098 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
