# PyTorch 프로파일러(Profiler)

PyTorch 프로파일러는 PyTorch 모델의 성능을 분석하는 데 사용됩니다.

## 개요

> PyTorch는 사용자가 모델 내의 연산 비용이 큰(expensive) 연산자들이 무엇인지 알고싶을 때 유용하게 사용할 수 있는 간단한 프로파일러 API를 포함하고 있습니다.
> [출처: PyTorch](https://tutorials.pytorch.kr/recipes/recipes/profiler_recipe.html#id1)

In [1]:
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

In [2]:
# Define a simple ResNet model (models.resnet18 called with no arguments).

model = models.resnet18()
# Random input batch passed to the model in the profiling cell.
# NOTE(review): the spatial size here is 244x244, while the later cells use
# 224x224 — confirm whether 244 is intentional or a typo.
inputs = torch.randn(5, 3, 244, 244)

In [8]:
# Run the model once while the profiler collects CPU activity and the
# input shapes of each operator.
with profile(
    activities=[ProfilerActivity.CPU],
    record_shapes=True,
) as prof:
    # Label the forward pass so it appears as "model_inference" in the
    # profiler results.
    with record_function("model_inference"):
        model(inputs)

STAGE:2024-02-22 23:05:29 54014:54014 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-02-22 23:05:29 54014:54014 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-02-22 23:05:29 54014:54014 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [9]:
# Print the averaged profiler results, sorted by total CPU time and
# limited to ten rows.
cpu_stats = prof.key_averages()
print(cpu_stats.table(sort_by="cpu_time_total", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  model_inference         2.74%       1.796ms       100.00%      65.526ms      65.526ms             1  
                     aten::conv2d         3.75%       2.460ms        77.10%      50.522ms       2.526ms            20  
                aten::convolution         0.28%     186.000us        76.22%      49.942ms       2.497ms            20  
               aten::_convolution         0.17%     111.000us        75.93%      49.756ms       2.488ms            20  
         aten::mkldnn_convolution        75.06%      49.182ms        75.76%      49.645ms       2.482ms            20  
                 aten::batch_norm       

In [10]:
# Print the averaged results grouped by operator input shape (the table
# gains an "Input Shapes" column), sorted by total CPU time, ten rows max.
shape_stats = prof.key_averages(group_by_input_shape=True)
print(shape_stats.table(sort_by="cpu_time_total", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls                                                                      Input Shapes  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                  model_inference         2.74%       1.796ms       100.00%      65.526ms      65.526ms             1                                                                                []  
                     aten::conv2d         0.80%     527.000us        19.48%      12.765ms      12.765ms             1                             [[5, 3, 244, 244], [64, 3, 7, 7], [], [], [], 

In [16]:
# Move a fresh ResNet-18 and a random input batch to the GPU via .cuda().
model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()

# Collect both CPU and CUDA activity, recording operator input shapes.
cuda_activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
with profile(activities=cuda_activities, record_shapes=True) as prof:
    with record_function("model_inference"):
        model(inputs)

# Print the averaged results sorted by total CUDA time, ten rows max.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  model_inference        26.79%     625.000us       100.00%       2.333ms       2.333ms             1  
                     aten::conv2d         6.17%     144.000us        28.80%     672.000us      33.600us            20  
                aten::convolution         3.17%      74.000us        27.95%     652.000us      32.600us            20  
               aten::_convolution         1.80%      42.000us        24.77%     578.000us      28.900us            20  
          aten::cudnn_convolution        22.97%     536.000us        22.97%     536.000us      26.800us            20  
                       aten::add_       

STAGE:2024-02-22 23:12:02 54014:54014 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-02-22 23:12:02 54014:54014 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-02-22 23:12:02 54014:54014 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [17]:
# Analyze memory consumption with the profiler.

model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

# profile_memory=True makes the results table include the "CPU Mem" and
# "Self CPU Mem" columns.
with profile(
    activities=[ProfilerActivity.CPU],
    profile_memory=True,
    record_shapes=True,
) as prof:
    model(inputs)

# Print the averaged results sorted by self CPU memory usage, ten rows max.
memory_stats = prof.key_averages()
print(memory_stats.table(sort_by="self_cpu_memory_usage", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                      aten::empty         0.49%     256.000us         0.49%     256.000us       1.280us      83.36 Mb      83.36 Mb           200  
    aten::max_pool2d_with_indices         5.39%       2.810ms         5.39%       2.810ms       2.810ms      11.48 Mb      11.48 Mb             1  
                 aten::empty_like         0.08%      41.000us         0.14%      71.000us       3.550us      47.37 Mb       7.66 Mb            20  
     aten::_batch_norm_impl_index         0.55%     289.000us         6.92%       3.605ms     180.250us      47.

STAGE:2024-02-22 23:12:32 54014:54014 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-02-22 23:12:32 54014:54014 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-02-22 23:12:32 54014:54014 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [18]:
# Print the averaged results sorted by CPU memory usage ("cpu_memory_usage"
# rather than the self-only key), ten rows max.
total_memory_stats = prof.key_averages()
print(total_memory_stats.table(sort_by="cpu_memory_usage", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                      aten::empty         0.49%     256.000us         0.49%     256.000us       1.280us      83.36 Mb      83.36 Mb           200  
                 aten::batch_norm        -0.28%    -144.000us         7.01%       3.652ms     182.600us      47.41 Mb      -3.83 Mb            20  
     aten::_batch_norm_impl_index         0.55%     289.000us         6.92%       3.605ms     180.250us      47.41 Mb       3.83 Mb            20  
          aten::native_batch_norm         6.41%       3.341ms         6.72%       3.501ms     175.050us      47.

In [21]:
# Move a fresh ResNet-18 and a random input batch to the GPU via .cuda().
model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()

# Profile a single forward pass, collecting CPU and CUDA activity.
trace_activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
with profile(activities=trace_activities) as prof:
    model(inputs)

# Write the collected trace to "trace.json" (relative path) using the
# profiler's Chrome trace export.
prof.export_chrome_trace("trace.json")

STAGE:2024-02-22 23:16:21 54014:54014 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-02-22 23:16:21 54014:54014 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-02-22 23:16:21 54014:54014 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [20]:
# Profile CPU and CUDA activity with with_stack=True so that stack
# information is recorded alongside each operator.
stack_activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
with profile(activities=stack_activities, with_stack=True) as prof:
    model(inputs)

# Print aggregated stats: averages are grouped with group_by_stack_n=5,
# sorted by self CUDA time, and limited to two rows.
stack_stats = prof.key_averages(group_by_stack_n=5)
print(stack_stats.table(sort_by="self_cuda_time_total", row_limit=2))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     aten::conv2d         6.63%     157.000us        35.26%     835.000us      41.750us            20  
                aten::convolution         3.46%      82.000us        33.99%     805.000us      40.250us            20  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.368ms



STAGE:2024-02-22 23:14:31 54014:54014 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-02-22 23:14:31 54014:54014 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-02-22 23:14:31 54014:54014 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
