# Linear quantization to int8 

- [Original Source: Deeplearning.ai Quantization Fundamentals Course](https://learn.deeplearning.ai/courses/quantization-fundamentals/lesson/5/quantization-theory)

In [15]:
#!pip install transformers quanto==0.0.11 torch

In [16]:
import torch
from quanto import quantize, freeze
from transformers import AutoModelForCausalLM, AutoTokenizer

In [17]:
def named_module_tensors(module, recurse=False):
    for named_parameter in module.named_parameters(recurse=recurse):
      name, val = named_parameter
      flag = True
      if hasattr(val,"_data") or hasattr(val,"_scale"):
        if hasattr(val,"_data"):
          yield name + "._data", val._data
        if hasattr(val,"_scale"):
          yield name + "._scale", val._scale
      else:
        yield named_parameter

    for named_buffer in module.named_buffers(recurse=recurse):
      yield named_buffer

def dtype_byte_size(dtype):
    """
    Returns the size (in bytes) occupied by one parameter of type `dtype`.
    """
    import re
    if dtype == torch.bool:
        return 1 / 8
    bit_search = re.search(r"[^\d](\d+)$", str(dtype))
    if bit_search is None:
        raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
    bit_size = int(bit_search.groups()[0])
    return bit_size // 8


def compute_module_sizes(model):
    """
    Compute the size of each submodule of a given model.
    """
    from collections import defaultdict
    module_sizes = defaultdict(int)
    for name, tensor in named_module_tensors(model, recurse=True):
      size = tensor.numel() * dtype_byte_size(tensor.dtype)
      name_parts = name.split(".")
      for idx in range(len(name_parts) + 1):
        module_sizes[".".join(name_parts[:idx])] += size

    return module_sizes

In [18]:
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True)
module_sizes = compute_module_sizes(model)
print(f"The model size is {module_sizes[''] * 1e-9} GB")
print(model.gpt_neox)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


The model size is 1.6402112960000002 GB
GPTNeoXModel(
  (embed_in): Embedding(50304, 1024)
  (emb_dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-23): 24 x GPTNeoXLayer(
      (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (post_attention_dropout): Dropout(p=0.0, inplace=False)
      (post_mlp_dropout): Dropout(p=0.0, inplace=False)
      (attention): GPTNeoXAttention(
        (rotary_emb): GPTNeoXRotaryEmbedding()
        (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
        (dense): Linear(in_features=1024, out_features=1024, bias=True)
        (attention_dropout): Dropout(p=0.0, inplace=False)
      )
      (mlp): GPTNeoXMLP(
        (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
        (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
        (act): GELUActivation(

In [19]:
input_text = "Today my plans are "
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Today my plans are 
to go to the beach with my family and watch the sun set over


In [20]:
quantize(model, weights=torch.int8, activations=None)
module_sizes = compute_module_sizes(model)
print(f"The model size is {module_sizes[''] * 1e-9} GB")
print(model.gpt_neox)
print(model.gpt_neox.layers[0].attention.dense.weight)

The model size is 1.6402120720000002 GB
GPTNeoXModel(
  (embed_in): Embedding(50304, 1024)
  (emb_dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-23): 24 x GPTNeoXLayer(
      (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (post_attention_dropout): Dropout(p=0.0, inplace=False)
      (post_mlp_dropout): Dropout(p=0.0, inplace=False)
      (attention): GPTNeoXAttention(
        (rotary_emb): GPTNeoXRotaryEmbedding()
        (query_key_value): QLinear(in_features=1024, out_features=3072, bias=True)
        (dense): QLinear(in_features=1024, out_features=1024, bias=True)
        (attention_dropout): Dropout(p=0.0, inplace=False)
      )
      (mlp): GPTNeoXMLP(
        (dense_h_to_4h): QLinear(in_features=1024, out_features=4096, bias=True)
        (dense_4h_to_h): QLinear(in_features=4096, out_features=1024, bias=True)
        (act): GELUActivat

In [21]:
freeze(model)
module_sizes = compute_module_sizes(model)
print(f"The model size is {module_sizes[''] * 1e-9} GB")
print(model.gpt_neox)
print(model.gpt_neox.layers[0].attention.dense.weight)

The model size is 0.580794472 GB
GPTNeoXModel(
  (embed_in): Embedding(50304, 1024)
  (emb_dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-23): 24 x GPTNeoXLayer(
      (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (post_attention_dropout): Dropout(p=0.0, inplace=False)
      (post_mlp_dropout): Dropout(p=0.0, inplace=False)
      (attention): GPTNeoXAttention(
        (rotary_emb): GPTNeoXRotaryEmbedding()
        (query_key_value): QLinear(in_features=1024, out_features=3072, bias=True)
        (dense): QLinear(in_features=1024, out_features=1024, bias=True)
        (attention_dropout): Dropout(p=0.0, inplace=False)
      )
      (mlp): GPTNeoXMLP(
        (dense_h_to_4h): QLinear(in_features=1024, out_features=4096, bias=True)
        (dense_4h_to_h): QLinear(in_features=4096, out_features=1024, bias=True)
        (act): GELUActivation()
 

In [22]:
input_text = "Today my plans are "
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Today my plans are 
to go to the beach with my family and watch the sun set over
