# Install required libraries

In [None]:
!pip install -q -U transformers peft accelerate optimum

In [None]:
!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/

# Quantize a transformers model using auto-gptq

 Quantize a model by passing a supported dataset:

 To quantize a model with auto-gptq, we need to pass a calibration dataset to the quantizer. This can be either one of the supported default datasets (['wikitext2','c4','c4-new','ptb','ptb-new']) or a list of strings that will be used as the dataset.

 Supported precisions are [2, 3, 4, 8] bits.



In [None]:
from transformers import (
    AutoModelForCausalLM, # AutoModelForCausalLM class for loading pre-trained language models.
    AutoTokenizer,        # AutoTokenizer class for tokenizing text input.
    GPTQConfig            # GPTQConfig class for configuring model quantization.
)
import torch              # PyTorch library for working with tensors and neural networks.

# Hugging Face Hub ID of the full-precision checkpoint that will be quantized.
model_id = "facebook/opt-125m"

# The tokenizer does not depend on the quantization settings, so load it first.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# GPTQ settings:
#   bits       - target weight precision (4-bit here).
#   group_size - group size used for quantization.
#   dataset    - name of the built-in dataset used for calibration.
#   desc_act   - "act-order": quantize columns in order of decreasing activation
#                size. Left disabled here.
quantization_config = GPTQConfig(bits=4, group_size=128, dataset="ptb-new", desc_act=False)

# Passing a GPTQConfig to from_pretrained loads the checkpoint AND quantizes it in
# the same call (this is the slow step that downloads the calibration data and
# walks the decoder blocks). device_map="auto" lets the library place the weights
# on the available device(s).
quant_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)


Downloading data:   0%|          | 0.00/2.96M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/262k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/236k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/42068 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3761 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3370 [00:00<?, ? examples/s]

Quantizing model.decoder.layers blocks :   0%|          | 0/12 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Check the attributes of the quantized linear layers.

Each one should contain `qweight` and `qzeros` attributes, both of dtype `torch.int32`.

In [None]:
# Show the instance dictionary of the first decoder layer's query projection;
# its `_buffers` entry holds the packed quantized tensors.
vars(quant_model.model.decoder.layers[0].self_attn.q_proj)

{'training': True,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict([('qweight',
               tensor([[ 1711760090, -1248295259, -2025411892,  ..., -1486452502,
                         2019142072, -1735820810],
                       [-2000132747,  -578262345,  1484081337,  ..., -1230600537,
                        -2019252040, -2023311003],
                       [ -710293850, -1153090188,  1431922298,  ..., -1768449094,
                         2042194587, -2004125258],
                       ...,
                       [-1183500120, -1493527382, -1771730232,  ..., -1518687826,
                         -411714696,  -393894231],
                       [-1722191245,  1217685577,  1737246908,  ...,  1471776933,
                        -1732560250, -1754854792],
                       [ 2015980135,  1771801461,  2006620761,  ..., -1211525767,
                        -1718835608, -1756087700]], device='cuda:0', dtype=torch.int32)),
              ('qzeros',
               tensor(

 Inference on the quantized model

In [None]:
# Load the tokenizer associated with the quantized checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Prompt to be completed by the model.
text = "Hello my name is"

# Tokenize into PyTorch tensors and move them to the device the model lives on.
# The model was loaded with device_map='auto', so ask the model for its device
# instead of hard-coding GPU index 0.
inputs = tokenizer(text, return_tensors="pt").to(quant_model.device)

# Generate a continuation with the default generation settings;
# `**inputs` unpacks the encoded tensors as keyword arguments.
out = quant_model.generate(**inputs)

# Decode the first generated sequence, dropping special tokens for readability.
print(tokenizer.decode(out[0], skip_special_tokens=True))



Hello my name is James and I am a student at the University of California, San Diego.


# Share quantized models on the Hub

In [None]:
# Log in to the Hugging Face Hub from inside the notebook before pushing the
# quantized model and tokenizer.
from huggingface_hub import notebook_login

notebook_login()  # Trigger the interactive login process, prompting for Hugging Face credentials.

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Name of the Hub repository that receives both the weights and the tokenizer.
hub_repo_name = "opt-125m-gptq-4bit-54"

# Upload the quantized model first ...
quant_model.push_to_hub(hub_repo_name)

# ... then the tokenizer, into the same repository. This stays the last
# expression of the cell so its commit info is what gets displayed.
tokenizer.push_to_hub(hub_repo_name)

model.safetensors:   0%|          | 0.00/125M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/begangowsik/opt-125m-gptq-4bit-54/commit/684c0c0b6dd72caf81ee9014288dfdcbd9ff4da9', commit_message='Upload tokenizer', commit_description='', oid='684c0c0b6dd72caf81ee9014288dfdcbd9ff4da9', pr_url=None, pr_revision=None, pr_num=None)

You can load quantized models from the Hub