POST TRAINING STATIC QUANTIZATION

In [1]:
import copy
from FaceLandmarkDetection.src.quantization.helper import *

In [7]:
random_seed = 0
num_classes = 136
cuda_device = torch.device("cuda:0")
cpu_device = torch.device("cpu:0")

model_dir = "checkpoints"
model_filename = "resnet18_FLM.pt"
quantized_model_dir = "checkpoints_quantized"
quantized_model_filename = "resnet18_FLM_quantized.pt"
qat_quantized_model_filename = "resnet18_FLM_quantized_qat.pt"
model_filepath = os.path.join(model_dir, model_filename)
quantized_model_filepath = os.path.join(quantized_model_dir, quantized_model_filename)
qat_quantized_model_filepath = os.path.join(quantized_model_dir, qat_quantized_model_filename)
set_random_seeds(random_seed=random_seed)

In [4]:
# Create an untrained model.
model = create_model(num_classes=num_classes)
train_dataset, val_dataset = get_data()
train_loader = make_loader(train_dataset, 64)
val_loader = make_loader(val_dataset, 32)

1111


In [5]:
model = load_model(model=model, model_filepath=model_filepath, device=cpu_device)

In [6]:
# Make a copy of the model for layer fusion
fused_model = copy.deepcopy(model)

model.eval()
# The model has to be switched to evaluation mode before any layer fusion.
# Otherwise the quantization will not work correctly.
fused_model.eval()

# Fuse the model in place rather manually.
fused_model = torch.quantization.fuse_modules(fused_model, [["conv1", "bn1", "relu"]], inplace=True)
for module_name, module in fused_model.named_children():
    if "layer" in module_name:
        for basic_block_name, basic_block in module.named_children():
            torch.quantization.fuse_modules(basic_block, [["conv1", "bn1", "relu1"], ["conv2", "bn2"]], inplace=True)
            for sub_block_name, sub_block in basic_block.named_children():
                if sub_block_name == "downsample":
                    torch.quantization.fuse_modules(sub_block, [["0", "1"]], inplace=True)

In [None]:
# Print FP32 model.
print(model)
# Print fused model.
print(fused_model)

In [7]:
quantized_model = QuantizedResNet18(model_fp32=fused_model)
quantization_config = torch.quantization.get_default_qconfig("fbgemm")
quantized_model.qconfig = quantization_config

print(quantized_model.qconfig)

torch.quantization.prepare(quantized_model, inplace=True)

ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu1): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (skip_add): FloatFunctional(
        (activation_post_process): Identity()
      )
      (relu2): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    

In [8]:
quantized_model = QuantizedResNet18(model_fp32=fused_model)
quantization_config = torch.quantization.get_default_qconfig("fbgemm")
quantized_model.qconfig = quantization_config

print(quantized_model.qconfig)

torch.quantization.prepare(quantized_model, inplace=True)

QConfig(activation=functools.partial(<class 'torch.quantization.observer.HistogramObserver'>, reduce_range=True), weight=functools.partial(<class 'torch.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric))


QuantizedResNet18(
  (quant): QuantStub(
    (activation_post_process): HistogramObserver()
  )
  (dequant): DeQuantStub()
  (model_fp32): ResNet(
    (conv1): ConvReLU2d(
      (0): Conv2d(
        1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3)
        (activation_post_process): HistogramObserver()
      )
      (1): ReLU(
        inplace=True
        (activation_post_process): HistogramObserver()
      )
    )
    (bn1): Identity()
    (relu): Identity()
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): ConvReLU2d(
          (0): Conv2d(
            64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
            (activation_post_process): HistogramObserver()
          )
          (1): ReLU(
            inplace=True
            (activation_post_process): HistogramObserver()
          )
        )
        (bn1): Identity()
        (relu1): Identity()
        (conv2):

In [9]:
# Use training data for calibration.
calibrate_model(model=quantized_model, loader=val_loader, device=cpu_device)

In [10]:
quantized_model = torch.quantization.convert(quantized_model, inplace=True)
quantized_model.eval()
# Print quantized model.
print(quantized_model)

QuantizedResNet18(
  (quant): Quantize(scale=tensor([0.0157]), zero_point=tensor([64]), dtype=torch.quint8)
  (dequant): DeQuantize()
  (model_fp32): ResNet(
    (conv1): QuantizedConvReLU2d(1, 64, kernel_size=(7, 7), stride=(2, 2), scale=0.02210974507033825, zero_point=0, padding=(3, 3))
    (bn1): Identity()
    (relu): Identity()
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): QuantizedConvReLU2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.016083866357803345, zero_point=0, padding=(1, 1))
        (bn1): Identity()
        (relu1): Identity()
        (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.054224684834480286, zero_point=64, padding=(1, 1))
        (bn2): Identity()
        (skip_add): QFunctional(
          scale=0.05315326526761055, zero_point=60
          (activation_post_process): Identity()
        )
        (relu2): QuantizedReLU(

In [11]:
# Save quantized model.
save_torchscript_model(model=quantized_model, model_dir=quantized_model_dir, model_filename=quantized_model_filename)
# Load quantized model.
quantized_jit_model = load_torchscript_model(model_filepath=quantized_model_filepath, device=cpu_device)

In [12]:
fp32_eval_loss = evaluate_model(model=model, test_loader=val_loader, device=cpu_device, criterion=nn.MSELoss())
int8_eval_loss = evaluate_model(model=quantized_jit_model, test_loader=val_loader, device=cpu_device, criterion=nn.MSELoss())

In [13]:
print("FP32 evaluation loss: {:.3f}".format(fp32_eval_loss))
print("INT8 evaluation loss: {:.3f}".format(int8_eval_loss))

FP32 evaluation loss: 0.023
INT8 evaluation loss: 0.025


In [14]:
fp32_cpu_inference_latency = measure_inference_latency(model=model, device=cpu_device, input_size=(32,1,224,224), num_samples=100)
int8_cpu_inference_latency = measure_inference_latency(model=quantized_model, device=cpu_device, input_size=(32,1,224,224), num_samples=100)
int8_jit_cpu_inference_latency = measure_inference_latency(model=quantized_jit_model, device=cpu_device, input_size=(32,1,224,224), num_samples=100)

In [15]:
print("FP32 CPU Inference Latency: {:.2f} ms / sample".format(fp32_cpu_inference_latency * 1000))
print("INT8 CPU Inference Latency: {:.2f} ms / sample".format(int8_cpu_inference_latency * 1000))
print("INT8 JIT CPU Inference Latency: {:.2f} ms / sample".format(int8_jit_cpu_inference_latency * 1000))

FP32 CPU Inference Latency: 1628.89 ms / sample
INT8 CPU Inference Latency: 280.72 ms / sample
INT8 JIT CPU Inference Latency: 316.39 ms / sample


In [16]:
quantized_jit_model = load_torchscript_model(model_filepath=quantized_model_filepath, device=cpu_device)

In [17]:
print_size_of_model(quantized_jit_model)
print_size_of_model(model)

Size (MB): 11.371467
Size (MB): 45.076717


POST TRAINING DYNAMIC QUANTIZATION

In [18]:
# create a quantized model instance
model_int8 = torch.quantization.quantize_dynamic(
    model,  # the original model
    dtype=torch.qint8)  # the target dtype for quantized weights

# run the model
input_fp32 = torch.randn(32, 1, 224, 224)
res = model_int8(input_fp32)

In [19]:
print("Size of model before quantization")
print_size_of_model(model)
print("Size of model after quantization")
print_size_of_model(model_int8)

Size of model before quantization
Size (MB): 45.076717
Size of model after quantization
Size (MB): 44.871211


Dynamic quantization only helps in reducing the model size for models that use Linear and LSTM modules. For the case of resnet18, the model consists of conv layers which do not have dynamic quantization support yet.# Dynamic quantization only helps in reducing the model size for models that use Linear and LSTM modules. For the case of resnet18, the model consists of conv layers which do not have dynamic quantization support yet.

QUANTIZATION AWARE TRAINING

In [15]:
model = create_model(num_classes=136)
model = load_model(model=model, model_filepath=model_filepath, device=cpu_device)
model.to(cpu_device)
# Make a copy of the model for layer fusion
fused_model = copy.deepcopy(model)

model.train()
# The model has to be switched to training mode before any layer fusion.
# Otherwise the quantization aware training will not work correctly.
fused_model.train()

fused_model = torch.quantization.fuse_modules(model, [["conv1", "bn1", "relu"]], inplace=True)
for module_name, module in fused_model.named_children():
    if "layer" in module_name:
        for basic_block_name, basic_block in module.named_children():
            torch.quantization.fuse_modules(basic_block, [["conv1", "bn1", "relu1"], ["conv2", "bn2"]], inplace=True)
            for sub_block_name, sub_block in basic_block.named_children():
                if sub_block_name == "downsample":
                    torch.quantization.fuse_modules(sub_block, [["0", "1"]], inplace=True)


print(model)
print(fused_model)

model.eval()
fused_model.eval()                  

fused_model_w_quant = QuantizedResNet18(model_fp32=fused_model)
fused_model_w_quant.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
model_fp32_prepared = torch.quantization.prepare_qat(fused_model_w_quant)

# Train model.
model_fp32_prepared = train_model(model=model_fp32_prepared, train_loader=train_loader, test_loader=val_loader, device=cpu_device, num_epochs=10)

ResNet(
  (conv1): ConvBnReLU2d(
    (0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (bn1): Identity()
  (relu): Identity()
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): ConvBnReLU2d(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (bn1): Identity()
      (relu1): Identity()
      (conv2): ConvBn2d(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (bn2): Identity()
      (skip_add): FloatFunctional(
       

In [16]:
quantized_model = torch.quantization.convert(model_fp32_prepared.eval(), inplace=False)
quantized_model.eval()

QuantizedResNet18(
  (quant): Quantize(scale=tensor([0.0157]), zero_point=tensor([64]), dtype=torch.quint8)
  (dequant): DeQuantize()
  (model_fp32): ResNet(
    (conv1): QuantizedConvReLU2d(1, 64, kernel_size=(7, 7), stride=(2, 2), scale=0.03440048545598984, zero_point=0, padding=(3, 3))
    (bn1): Identity()
    (relu): Identity()
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): QuantizedConvReLU2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.028485286980867386, zero_point=0, padding=(1, 1))
        (bn1): Identity()
        (relu1): Identity()
        (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.07750637829303741, zero_point=76, padding=(1, 1))
        (bn2): Identity()
        (skip_add): QFunctional(
          scale=0.08231262117624283, zero_point=62
          (activation_post_process): Identity()
        )
        (relu2): QuantizedReLU(i

In [17]:
print(quantized_model)

QuantizedResNet18(
  (quant): Quantize(scale=tensor([0.0157]), zero_point=tensor([64]), dtype=torch.quint8)
  (dequant): DeQuantize()
  (model_fp32): ResNet(
    (conv1): QuantizedConvReLU2d(1, 64, kernel_size=(7, 7), stride=(2, 2), scale=0.03440048545598984, zero_point=0, padding=(3, 3))
    (bn1): Identity()
    (relu): Identity()
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): QuantizedConvReLU2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.028485286980867386, zero_point=0, padding=(1, 1))
        (bn1): Identity()
        (relu1): Identity()
        (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.07750637829303741, zero_point=76, padding=(1, 1))
        (bn2): Identity()
        (skip_add): QFunctional(
          scale=0.08231262117624283, zero_point=62
          (activation_post_process): Identity()
        )
        (relu2): QuantizedReLU(i

In [11]:
# Save quantized model.
save_torchscript_model(model=quantized_model, model_dir=quantized_model_dir, model_filename=qat_quantized_model_filename)
# Load quantized model.
quantized_jit_model = load_torchscript_model(model_filepath=qat_quantized_model_filepath, device=cpu_device)

In [13]:
int8_eval_accuracy = evaluate_model(model=quantized_jit_model, test_loader=val_loader, device=cpu_device, criterion=nn.MSELoss())
print("INT8 evaluation accuracy: {:.3f}".format(int8_eval_accuracy))

INT8 evaluation accuracy: 0.505


In [14]:
fp32_cpu_inference_latency = measure_inference_latency(model=model, device=cpu_device, input_size=(32,1,224,224), num_samples=100)
int8_cpu_inference_latency = measure_inference_latency(model=quantized_model, device=cpu_device, input_size=(32,1,224,224), num_samples=100)
int8_jit_cpu_inference_latency = measure_inference_latency(model=quantized_jit_model, device=cpu_device, input_size=(32,1,224,224), num_samples=100)
print("FP32 CPU Inference Latency: {:.2f} ms / sample".format(fp32_cpu_inference_latency * 1000))
print("INT8 CPU Inference Latency: {:.2f} ms / sample".format(int8_cpu_inference_latency * 1000))
print("INT8 JIT CPU Inference Latency: {:.2f} ms / sample".format(int8_jit_cpu_inference_latency * 1000))

FP32 CPU Inference Latency: 1625.51 ms / sample
INT8 CPU Inference Latency: 306.87 ms / sample
INT8 JIT CPU Inference Latency: 334.12 ms / sample
