In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torchvision
from Vision_transformer import VisionTransformer, VisionTransformerForPTQ, CustomDataset
import torchvision.transforms as transforms
from pprint import pprint
from torchsummary import summary
import json
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
torch.ao.quantization.quantize_dynamic

<function torch.ao.quantization.quantize.quantize_dynamic(model, qconfig_spec=None, dtype=torch.qint8, mapping=None, inplace=False)>

In [2]:

device = torch.device("cpu")
# We don't want to perform our quantization step on cuda GPU. It is not supported.
with open('config.json') as f:
    custom_config = json.load(f)
# Custom configurations for the VisionTransformer.
# Transformer can be customized with these configurations.
# Refer to documentation of the class VisionTransformer
# (`VisionTransformer.__doc__`, use pprint for cleaner display)
# for exact details of the customization.


In [3]:
# Load saved model
MNIST_ViT = VisionTransformer(**custom_config).to(device=device)
checkpoint = torch.load("model.pth")
MNIST_ViT.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [4]:
transform = transforms.Compose([
    transforms.ToTensor(),
])   # Transform object to apply on the dataset.

# train_dataset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)
# Loading/Downloading dataset. `download` can be `False` if the data is present in the root directory
# Else it will download the dataset to to the root location.

test_ds = CustomDataset(data=test_dataset, device=device)
# Made custom dataset objects from the MNIST dataset.

test_loader = torch.utils.data.DataLoader(test_ds, batch_size=64, shuffle=False)
# DataLoaders for fast implementation of loading batch-wise data.



In [5]:
def test(model : VisionTransformer):
    correct, total = 0, 0
    model.eval()
    # Setting the model in evaluation mode.
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            # Loading batch images and ground truth onto device
            outputs = model(images)
            
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
    return f"Accuracy on test set: {(100 * correct / total):.2f}%"
            


In [6]:
test(MNIST_ViT)

'Accuracy on test set: 49.74%'

In [7]:
# Weights matrix of the model before quantization
print('Weights before quantization')
print(MNIST_ViT.head.weight)
print(MNIST_ViT.head.weight.dtype)

Weights before quantization
Parameter containing:
tensor([[-7.0662e-02,  1.3499e-01, -3.8009e-01,  6.0584e-01,  4.1917e-01,
         -3.7075e-01,  2.0450e-01, -4.0412e-01,  6.2776e-02, -4.6475e-01,
         -5.9376e-01, -2.4132e-01, -4.2393e-01,  1.8389e-01,  1.8460e-01,
          2.1956e-01, -3.1886e-01, -3.3674e-01,  8.8356e-02, -4.2352e-01,
         -4.0939e-02, -1.0542e-01,  1.8793e-01, -7.2534e-02,  1.6244e-01,
         -1.2656e-01,  3.3465e-01,  8.5337e-02,  5.2124e-01, -3.7563e-01,
         -4.5943e-01,  1.7528e-01],
        [ 3.0352e-02, -5.3761e-01,  5.1218e-01,  5.0710e-01, -1.9978e-01,
          1.8169e-01,  2.8051e-01, -2.8327e-04,  1.3403e-01,  3.0515e-01,
         -1.6733e-01, -1.4994e-01, -2.9695e-01, -4.3653e-02, -2.0136e-01,
         -3.6033e-01,  3.7734e-01,  7.5984e-01, -3.7332e-01,  2.5667e-01,
         -2.6232e-01, -1.7522e-01, -3.5150e-01, -4.4539e-02,  3.4139e-02,
          2.3827e-01,  3.2483e-01, -4.8200e-01, -3.3107e-01,  6.6220e-04,
         -1.3452e-01, -4.9

In [8]:
def print_size_of_model(model):
    torch.save(model.state_dict(), "temp_delme.p")
    print('Size (KB):', os.path.getsize("temp_delme.p")/1e3)
    os.remove('temp_delme.p')

print('Size of the model before quantization')
print_size_of_model(MNIST_ViT)

Size of the model before quantization
Size (KB): 63.01


In [9]:
print(f'Accuracy of the model before quantization: ')
test(MNIST_ViT)

Accuracy of the model before quantization: 


'Accuracy on test set: 49.74%'

In [10]:
# Loading weights to the object that we have to quantize
net_quantized = VisionTransformerForPTQ(**custom_config).to(device=device)
net_quantized.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [11]:
net_quantized.eval()
max_bit_length = 4
# net_quantized.qconfig = torch.ao.quantization.default_qconfig

net_quantized.qconfig = torch.quantization.QConfig(
    activation=torch.quantization.fake_quantize.FakeQuantize.with_args(observer = torch.quantization.observer.MovingAverageMinMaxObserver.with_args(dtype=torch.quint8), quant_min = 0 ,quant_max=2**(max_bit_length)-1, dtype=torch.quint8), 
    weight=torch.quantization.fake_quantize.FakeQuantize.with_args(observer = torch.quantization.observer.MovingAverageMinMaxObserver.with_args(dtype=torch.qint8), quant_min = 0 ,quant_max=2**(max_bit_length)-1, dtype=torch.qint8)
)

# net_quantized.qconfig = torch.ao.quantization.QConfig(
#     activation=torch.ao.quantization.fake_quantize.FakeQuantize.with_args(observer = torch.ao.quantization.observer.MovingAverageMinMaxObserver.with_args(dtype=torch.quint8), quant_min =-2**(max_bit_length-1) ,quant_max=2**(max_bit_length-1)-1, dtype=torch.quint8), 
#     weight=torch.ao.quantization.fake_quantize.FakeQuantize.with_args(observer = torch.ao.quantization.observer.MovingAverageMinMaxObserver.with_args(dtype=torch.quint8), quant_min =-2**(max_bit_length-1) ,quant_max=2**(max_bit_length-1)-1, dtype=torch.quint8)
# )

In [12]:
net_quantized.qconfig

QConfig(activation=functools.partial(<class 'torch.ao.quantization.fake_quantize.FakeQuantize'>, observer=functools.partial(<class 'torch.ao.quantization.observer.MovingAverageMinMaxObserver'>, dtype=torch.quint8){}, quant_min=0, quant_max=15, dtype=torch.quint8){}, weight=functools.partial(<class 'torch.ao.quantization.fake_quantize.FakeQuantize'>, observer=functools.partial(<class 'torch.ao.quantization.observer.MovingAverageMinMaxObserver'>, dtype=torch.qint8){}, quant_min=0, quant_max=15, dtype=torch.qint8){})

In [13]:
# torch.ao.quantization.QConfig(
#     activation=torch.ao.quantization.fake_quantize.FakeQuantize.with_args(quant_min =-2**(max_bit_length-1) ,quant_max=2**(max_bit_length-1)-1, dtype=torch.qint8), 
#     weight=torch.ao.quantization.fake_quantize.FakeQuantize.with_args(quant_min =-2**(max_bit_length-1) ,quant_max=2**(max_bit_length-1)-1, dtype=torch.qint8)
# )

In [14]:
net_quantized = torch.ao.quantization.prepare(net_quantized) # Insert observers
net_quantized

VisionTransformerForPTQ(
  (quant): QuantStub(
    (activation_post_process): FakeQuantize(
      fake_quant_enabled=tensor([1], dtype=torch.uint8), observer_enabled=tensor([1], dtype=torch.uint8), quant_min=0, quant_max=15, dtype=torch.quint8, qscheme=torch.per_tensor_affine, ch_axis=-1, scale=tensor([1.]), zero_point=tensor([0], dtype=torch.int32)
      (activation_post_process): MovingAverageMinMaxObserver(min_val=inf, max_val=-inf)
    )
  )
  (patch_embed): PatchEmbed(
    (proj): Conv2d(
      1, 32, kernel_size=(4, 4), stride=(4, 4)
      (activation_post_process): FakeQuantize(
        fake_quant_enabled=tensor([1], dtype=torch.uint8), observer_enabled=tensor([1], dtype=torch.uint8), quant_min=0, quant_max=15, dtype=torch.quint8, qscheme=torch.per_tensor_affine, ch_axis=-1, scale=tensor([1.]), zero_point=tensor([0], dtype=torch.int32)
        (activation_post_process): MovingAverageMinMaxObserver(min_val=inf, max_val=-inf)
      )
    )
  )
  (pos_drop): Dropout(p=0.2, inplace=

In [15]:
test(net_quantized)

'Accuracy on test set: 15.72%'

In [16]:
print(f'Check statistics of the various layers')
net_quantized

Check statistics of the various layers


VisionTransformerForPTQ(
  (quant): QuantStub(
    (activation_post_process): FakeQuantize(
      fake_quant_enabled=tensor([1], dtype=torch.uint8), observer_enabled=tensor([1], dtype=torch.uint8), quant_min=0, quant_max=15, dtype=torch.quint8, qscheme=torch.per_tensor_affine, ch_axis=-1, scale=tensor([0.0840]), zero_point=tensor([6], dtype=torch.int32)
      (activation_post_process): MovingAverageMinMaxObserver(min_val=-0.5092379450798035, max_val=0.750572919845581)
    )
  )
  (patch_embed): PatchEmbed(
    (proj): Conv2d(
      1, 32, kernel_size=(4, 4), stride=(4, 4)
      (activation_post_process): FakeQuantize(
        fake_quant_enabled=tensor([1], dtype=torch.uint8), observer_enabled=tensor([1], dtype=torch.uint8), quant_min=0, quant_max=15, dtype=torch.quint8, qscheme=torch.per_tensor_affine, ch_axis=-1, scale=tensor([0.2216]), zero_point=tensor([7], dtype=torch.int32)
        (activation_post_process): MovingAverageMinMaxObserver(min_val=-1.6397794485092163, max_val=1.684331

In [17]:
net_quantized.to(device)

VisionTransformerForPTQ(
  (quant): QuantStub(
    (activation_post_process): FakeQuantize(
      fake_quant_enabled=tensor([1], dtype=torch.uint8), observer_enabled=tensor([1], dtype=torch.uint8), quant_min=0, quant_max=15, dtype=torch.quint8, qscheme=torch.per_tensor_affine, ch_axis=-1, scale=tensor([0.0840]), zero_point=tensor([6], dtype=torch.int32)
      (activation_post_process): MovingAverageMinMaxObserver(min_val=-0.5092379450798035, max_val=0.750572919845581)
    )
  )
  (patch_embed): PatchEmbed(
    (proj): Conv2d(
      1, 32, kernel_size=(4, 4), stride=(4, 4)
      (activation_post_process): FakeQuantize(
        fake_quant_enabled=tensor([1], dtype=torch.uint8), observer_enabled=tensor([1], dtype=torch.uint8), quant_min=0, quant_max=15, dtype=torch.quint8, qscheme=torch.per_tensor_affine, ch_axis=-1, scale=tensor([0.2216]), zero_point=tensor([7], dtype=torch.int32)
        (activation_post_process): MovingAverageMinMaxObserver(min_val=-1.6397794485092163, max_val=1.684331

In [18]:
net_quantized = torch.quantization.convert(net_quantized)



In [19]:
print(f'Check statistics of the various layers')
net_quantized

Check statistics of the various layers


VisionTransformerForPTQ(
  (quant): Quantize(scale=tensor([0.0840]), zero_point=tensor([6]), dtype=torch.quint8)
  (patch_embed): PatchEmbed(
    (proj): QuantizedConv2d(1, 32, kernel_size=(4, 4), stride=(4, 4), scale=0.22160738706588745, zero_point=7)
  )
  (pos_drop): QuantizedDropout(p=0.2, inplace=False)
  (blocks): ModuleList(
    (0): Block(
      (norm1): QuantizedLayerNorm((32,), eps=1e-07, elementwise_affine=True)
      (attn): Attention(
        (qkv): QuantizedLinear(in_features=32, out_features=96, scale=0.6339914798736572, zero_point=8, qscheme=torch.per_tensor_affine)
        (attn_drop): QuantizedDropout(p=0.2, inplace=False)
        (proj): QuantizedLinear(in_features=32, out_features=32, scale=0.4078219532966614, zero_point=7, qscheme=torch.per_tensor_affine)
        (proj_drop): QuantizedDropout(p=0.2, inplace=False)
        (operation): QFunctional(
          scale=3.126173496246338, zero_point=7
          (activation_post_process): Identity()
        )
        (sm):

In [20]:
# Print the weights matrix of the model after quantization
print('Weights after quantization')
print(net_quantized.head)

Weights after quantization
QuantizedLinear(in_features=32, out_features=10, scale=1.3499046564102173, zero_point=8, qscheme=torch.per_tensor_affine)


In [21]:
print('Size of the model after quantization')
print_size_of_model(net_quantized)

Size of the model after quantization
Size (KB): 44.77


In [22]:
print('Testing the model after quantization')
test(net_quantized)

Testing the model after quantization


NotImplementedError: Could not run 'aten::_softmax.out' with arguments from the 'QuantizedCPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::_softmax.out' is only available for these backends: [CPU, CUDA, Meta, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradHIP, AutogradXLA, AutogradMPS, AutogradIPU, AutogradXPU, AutogradHPU, AutogradVE, AutogradLazy, AutogradMTIA, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, AutogradMeta, AutogradNestedTensor, Tracer, AutocastCPU, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

CPU: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\build\aten\src\ATen\RegisterCPU.cpp:31357 [kernel]
CUDA: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\build\aten\src\ATen\RegisterCUDA.cpp:44411 [kernel]
Meta: registered at /dev/null:228 [kernel]
BackendSelect: fallthrough registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\core\BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\core\PythonFallbackKernel.cpp:154 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\functorch\DynamicLayer.cpp:498 [backend fallback]
Functionalize: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\build\aten\src\ATen\RegisterFunctionalization_1.cpp:25069 [kernel]
Named: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\core\NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\ConjugateFallback.cpp:17 [backend fallback]
Negative: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\native\NegateFallback.cpp:19 [backend fallback]
ZeroTensor: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\ZeroTensorFallback.cpp:86 [backend fallback]
ADInplaceOrView: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\ADInplaceOrViewType_1.cpp:5216 [kernel]
AutogradOther: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradCPU: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradCUDA: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradHIP: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradXLA: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradMPS: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradIPU: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradXPU: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradHPU: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradVE: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradLazy: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradMTIA: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradPrivateUse1: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradPrivateUse2: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradPrivateUse3: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradMeta: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
AutogradNestedTensor: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\VariableType_2.cpp:19039 [autograd kernel]
Tracer: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\generated\TraceType_3.cpp:14672 [kernel]
AutocastCPU: fallthrough registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\autocast_mode.cpp:378 [backend fallback]
AutocastCUDA: fallthrough registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\autocast_mode.cpp:244 [backend fallback]
FuncTorchBatched: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\functorch\LegacyBatchingRegistrations.cpp:720 [backend fallback]
BatchedNestedTensor: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\functorch\LegacyBatchingRegistrations.cpp:746 [backend fallback]
FuncTorchVmapMode: fallthrough registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\functorch\VmapModeRegistrations.cpp:28 [backend fallback]
Batched: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\LegacyBatchingRegistrations.cpp:1075 [backend fallback]
VmapMode: fallthrough registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\functorch\TensorWrapper.cpp:203 [backend fallback]
PythonTLSSnapshot: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\core\PythonFallbackKernel.cpp:162 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\functorch\DynamicLayer.cpp:494 [backend fallback]
PreDispatch: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\core\PythonFallbackKernel.cpp:166 [backend fallback]
PythonDispatcher: registered at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\core\PythonFallbackKernel.cpp:158 [backend fallback]


In [None]:
torch.__version__

'2.2.1+cu121'

In [None]:
for i, (name, param) in enumerate(MNIST_ViT.named_parameters()):
    print(i, name,param.shape)

0 cls_token torch.Size([1, 1, 32])
1 pos_embed torch.Size([1, 50, 32])
2 patch_embed.proj.weight torch.Size([32, 1, 4, 4])
3 patch_embed.proj.bias torch.Size([32])
4 blocks.0.norm1.weight torch.Size([32])
5 blocks.0.norm1.bias torch.Size([32])
6 blocks.0.attn.qkv.weight torch.Size([96, 32])
7 blocks.0.attn.qkv.bias torch.Size([96])
8 blocks.0.attn.proj.weight torch.Size([32, 32])
9 blocks.0.attn.proj.bias torch.Size([32])
10 blocks.0.norm2.weight torch.Size([32])
11 blocks.0.norm2.bias torch.Size([32])
12 blocks.0.mlp.fc1.weight torch.Size([12, 32])
13 blocks.0.mlp.fc1.bias torch.Size([12])
14 blocks.0.mlp.fc2.weight torch.Size([32, 12])
15 blocks.0.mlp.fc2.bias torch.Size([32])
16 blocks.1.norm1.weight torch.Size([32])
17 blocks.1.norm1.bias torch.Size([32])
18 blocks.1.attn.qkv.weight torch.Size([96, 32])
19 blocks.1.attn.qkv.bias torch.Size([96])
20 blocks.1.attn.proj.weight torch.Size([32, 32])
21 blocks.1.attn.proj.bias torch.Size([32])
22 blocks.1.norm2.weight torch.Size([32])
23

In [None]:
for i, (name, param) in enumerate(net_quantized.named_parameters()):
    print(i, name,param.shape)

0 cls_token torch.Size([1, 1, 32])
1 pos_embed torch.Size([1, 50, 32])
2 blocks.0.norm1.weight torch.Size([32])
3 blocks.0.norm1.bias torch.Size([32])
4 blocks.0.norm2.weight torch.Size([32])
5 blocks.0.norm2.bias torch.Size([32])
6 blocks.1.norm1.weight torch.Size([32])
7 blocks.1.norm1.bias torch.Size([32])
8 blocks.1.norm2.weight torch.Size([32])
9 blocks.1.norm2.bias torch.Size([32])
10 norm.weight torch.Size([32])
11 norm.bias torch.Size([32])
