In [1]:
import torch
from torch import nn
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity
import cProfile

In [2]:
x = torch.rand(256, 512)
linear = nn.Linear(512, 256, bias=False).eval()
linear.weight.requires_grad = False
# linear.bias.requires_grad = False

with torch.inference_mode():
    with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
        linear(x)
print(prof.key_averages().table())

----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
          aten::linear        19.75%     433.000us       100.00%       2.192ms       2.192ms             1  
               aten::t         1.14%      25.000us         2.74%      60.000us      60.000us             1  
       aten::transpose         1.37%      30.000us         1.60%      35.000us      35.000us             1  
      aten::as_strided         0.23%       5.000us         0.23%       5.000us       5.000us             1  
          aten::matmul         6.25%     137.000us        77.51%       1.699ms       1.699ms             1  
              aten::mm        70.99%       1.556ms        71.26%       1.562ms       1.562ms             1  
    aten::resolve_c

STAGE:2023-11-23 19:24:00 81240:81240 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2023-11-23 19:24:00 81240:81240 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2023-11-23 19:24:00 81240:81240 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [3]:
jit = torch.jit.script(linear)
torch.jit.save(jit, 'linear.pt')

In [4]:
from torch.ao.quantization import get_default_qconfig
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
from torch.ao.quantization import QConfigMapping

with torch.inference_mode():
    torch.backends.quantized.engine = 'x86'
    qconfig = get_default_qconfig('x86')
    qconfig_mapping = QConfigMapping().set_global(qconfig)
    example_inputs = torch.randn_like(x)
    prepared_model = prepare_fx(linear, qconfig_mapping, example_inputs)
    for _ in range(16):
        prepared_model(torch.randn_like(x))

    quantized_model = convert_fx(prepared_model)

    with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
        quantized_model(x)

print(prof.key_averages().table())
print(quantized_model.code)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        aten::quantize_per_tensor         8.48%      89.000us         9.05%      95.000us      95.000us             1  
                       aten::item         1.05%      11.000us         1.24%      13.000us       3.250us             4  
        aten::_local_scalar_dense         0.19%       2.000us         0.19%       2.000us       0.500us             4  
                quantized::linear        81.43%     855.000us        86.57%     909.000us     909.000us             1  
    aten::_empty_affine_quantized         2.57%      27.000us         2.57%      27.000us      27.000us             1  
                    aten::q_scale       

  torch.has_cuda,
  torch.has_cudnn,
  torch.has_mps,
  torch.has_mkldnn,
STAGE:2023-11-23 19:24:02 81240:81240 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2023-11-23 19:24:02 81240:81240 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2023-11-23 19:24:02 81240:81240 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [5]:
jit = torch.jit.script(quantized_model)
torch.jit.save(jit, 'x86_linear.pt')



In [6]:
from torch.ao.quantization import get_default_qconfig
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
from torch.ao.quantization import QConfigMapping

with torch.inference_mode():
    torch.backends.quantized.engine = 'fbgemm'
    qconfig = get_default_qconfig('fbgemm')
    qconfig_mapping = QConfigMapping().set_global(qconfig)
    example_inputs = torch.randn_like(x)
    prepared_model = prepare_fx(linear, qconfig_mapping, example_inputs)
    for _ in range(16):
        prepared_model(torch.randn_like(x))

    quantized_model = convert_fx(prepared_model)

    with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
        quantized_model(x)

print(prof.key_averages().table())
print(quantized_model.code)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        aten::quantize_per_tensor        49.00%     415.000us        50.06%     424.000us     424.000us             1  
                       aten::item         1.06%       9.000us         1.53%      13.000us       3.250us             4  
        aten::_local_scalar_dense         0.47%       4.000us         0.47%       4.000us       1.000us             4  
                quantized::linear        46.04%     390.000us        47.70%     404.000us     404.000us             1  
    aten::_empty_affine_quantized         0.59%       5.000us         0.59%       5.000us       5.000us             1  
                    aten::q_scale       

STAGE:2023-11-23 19:24:05 81240:81240 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2023-11-23 19:24:05 81240:81240 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2023-11-23 19:24:05 81240:81240 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [7]:
jit = torch.jit.script(quantized_model)
torch.jit.save(jit, 'fbgemm_linear.pt')

In [4]:
from torch.ao.quantization import get_default_qconfig
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
from torch.ao.quantization import QConfigMapping

with torch.inference_mode():

    qconfig = get_default_qconfig('fbgemm')
    qconfig_mapping = QConfigMapping().set_global(qconfig)
    example_inputs = torch.randn_like(x)
    prepared_model = prepare_fx(linear, qconfig_mapping, example_inputs)
    for _ in range(16):
        prepared_model(torch.randn_like(x))

    quantized_model = convert_fx(prepared_model)

    with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
        quantized_model(x)

print(prof.key_averages().table())
print(quantized_model.code)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        aten::quantize_per_tensor        85.14%       3.369ms        85.39%       3.379ms       3.379ms             1  
                       aten::item         0.38%      15.000us         0.51%      20.000us       5.000us             4  
        aten::_local_scalar_dense         0.15%       6.000us         0.15%       6.000us       1.500us             4  
                quantized::linear        12.56%     497.000us        13.19%     522.000us     522.000us             1  
    aten::_empty_affine_quantized         0.28%      11.000us         0.28%      11.000us      11.000us             1  
                    aten::q_scale       

STAGE:2023-11-20 19:58:42 112085:112085 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2023-11-20 19:58:42 112085:112085 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2023-11-20 19:58:42 112085:112085 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [5]:
jit = torch.jit.script(quantized_model)
# torch.jit.save(jit, 'fbgemm_linear.pt')



In [6]:
print(jit.graph)

graph(%self : __torch__.torch.fx.graph_module.GraphModule,
      %input.1 : Tensor):
  %8 : int = prim::Constant[value=13]() # <eval_with_key>.15:8:98
  %_input_scale_0.1 : Tensor = prim::GetAttr[name="_input_scale_0"](%self)
  %_input_zero_point_0.1 : Tensor = prim::GetAttr[name="_input_zero_point_0"](%self)
  %quantize_per_tensor.1 : Tensor = aten::quantize_per_tensor(%input.1, %_input_scale_0.1, %_input_zero_point_0.1, %8) # <eval_with_key>.15:8:26
  %_packed_weight_0.1 : __torch__.torch.classes.quantized.LinearPackedParamsBase = prim::GetAttr[name="_packed_weight_0"](%self)
  %_scale_1.1 : Tensor = prim::GetAttr[name="_scale_1"](%self)
  %_zero_point_1.1 : Tensor = prim::GetAttr[name="_zero_point_1"](%self)
  %21 : float = aten::FloatImplicit(%_scale_1.1) # <eval_with_key>.15:12:13
  %22 : int = aten::IntImplicit(%_zero_point_1.1) # <eval_with_key>.15:12:13
  %linear.1 : Tensor = quantized::linear(%quantize_per_tensor.1, %_packed_weight_0.1, %21, %22) # <eval_with_key>.15:12:13
  %

In [7]:
from torch.ao.quantization import get_default_qconfig
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
from torch.ao.quantization import QConfigMapping

conv2d = nn.Conv2d(16, 8, 3)
image = torch.randn(1, 16, 64, 64)

with torch.inference_mode():

    qconfig = get_default_qconfig('fbgemm')
    qconfig_mapping = QConfigMapping().set_global(qconfig)
    example_inputs = torch.randn_like(image)
    prepared_model = prepare_fx(conv2d, qconfig_mapping, example_inputs)
    for _ in range(16):
        prepared_model(torch.randn_like(image))

    quantized_model = convert_fx(prepared_model)

    with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
        quantized_model(image)

print(prof.key_averages().table())
print(quantized_model.code)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        aten::quantize_per_tensor         0.23%      60.000us         0.26%      68.000us      68.000us             1  
                       aten::item         0.03%       9.000us         0.04%      11.000us       2.750us             4  
        aten::_local_scalar_dense         0.01%       2.000us         0.01%       2.000us       0.500us             4  
                quantized::conv2d        35.61%       9.285ms        36.00%       9.387ms       9.387ms             1  
                 aten::contiguous         0.01%       2.000us         0.36%      95.000us      95.000us             1  
                      aten::clone       

STAGE:2023-11-20 19:58:43 112085:112085 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2023-11-20 19:58:43 112085:112085 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2023-11-20 19:58:43 112085:112085 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
