In [2]:
import torch
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load pre-trained model and tokenizer (here, GPT-2 for example)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Test the model before quantization
text = "Once upon a time,"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print("Original Model Output:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Apply dynamic quantization to the model
quantized_model = torch.quantization.quantize_dynamic(
    model,  # the model to quantize
    {torch.nn.Linear},  # layers to quantize (e.g., only Linear layers in transformers)
    dtype=torch.qint8  # dtype for quantization (int8)
)

# Test the model after quantization
outputs_quantized = quantized_model.generate(**inputs, max_new_tokens=50)
print("\nQuantized Model Output:")
print(tokenizer.decode(outputs_quantized[0], skip_special_tokens=True))

# Compare model sizes
def print_size_of_model(model, model_name=""):
    torch.save(model.state_dict(), f"{model_name}.pt")
    print(f"Model size of {model_name}: {os.path.getsize(f'{model_name}.pt') / 1e6} MB")

print_size_of_model(model, "Original_Model")
print_size_of_model(quantized_model, "Quantized_Model")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Original Model Output:
Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Quantized Model Output:
Once upon a time, the "a" and "b" in the "-" and "-" in the "-" and "-" in the "-" and, behold, the "-" and "-" in the "-" and "
Model size of Original_Model: 497.813618 MB
Model size of Quantized_Model: 536.412188 MB


## Dynamic quantization

In [3]:
import os  # Fix: Import os to use for file size checking
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load pre-trained model and tokenizer (e.g., GPT-2)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Test the model before quantization
text = "Once upon a time,"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=10)
print("Original Model Output:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Apply dynamic quantization to the model
quantized_model = torch.quantization.quantize_dynamic(
    model,  # the model to quantize
    {torch.nn.Linear},  # layers to quantize (focusing on Linear layers)
    dtype=torch.qint8  # dtype for quantization (int8)
)

# Test the quantized model's output
outputs_quantized = quantized_model.generate(**inputs, max_new_tokens=10)
print("\nQuantized Model Output:")
print(tokenizer.decode(outputs_quantized[0], skip_special_tokens=True))

# Function to print and compare model sizes
def print_size_of_model(model, model_name=""):
    torch.save(model.state_dict(), f"{model_name}.pt")
    size_mb = os.path.getsize(f'{model_name}.pt') / 1e6
    print(f"Model size of {model_name}: {size_mb:.2f} MB")

# Check the sizes of the original and quantized models
print_size_of_model(model, "Original_Model")
print_size_of_model(quantized_model, "Quantized_Model")

# Clean up saved files after checking size
os.remove("Original_Model.pt")
os.remove("Quantized_Model.pt")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Original Model Output:
Once upon a time, the world was a place of great beauty and great


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Quantized Model Output:
Once upon a time, the "a" and "b" in the
Model size of Original_Model: 497.81 MB
Model size of Quantized_Model: 536.41 MB


## Static quantization

In [6]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load pre-trained model and tokenizer (e.g., GPT-2)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Ensure the model is in evaluation mode
model.eval()

# Define a calibration function to run the model with sample inputs
def calibrate_model(model, tokenizer):
    sample_texts = [
        "Once upon a time,",
        "The quick brown fox",
        "In a galaxy far far away",
        "Machine learning is fascinating"
    ]
    for text in sample_texts:
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            model.generate(**inputs, max_new_tokens=10)

# Specify quantization configuration
# Use float_qparams_weight_only_qconfig for the embedding layers
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Embedding):
        module.qconfig = torch.quantization.float_qparams_weight_only_qconfig

# Use the default QConfig for the rest of the model
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')

# Prepare the model for static quantization
model_prepared = torch.quantization.prepare(model)

# Calibrate the model with representative data
calibrate_model(model_prepared, tokenizer)

# Convert to quantized model
model_quantized = torch.quantization.convert(model_prepared)

# Test the quantized model's output
text = "Once upon a time,"
inputs = tokenizer(text, return_tensors="pt")
outputs_quantized = model_quantized.generate(**inputs, max_new_tokens=10)
print("\nQuantized Model Output:")
print(tokenizer.decode(outputs_quantized[0], skip_special_tokens=True))

# Function to print and compare model sizes
def print_size_of_model(model, model_name=""):
    torch.save(model.state_dict(), f"{model_name}.pt")
    size_mb = os.path.getsize(f'{model_name}.pt') / 1e6
    print(f"Model size of {model_name}: {size_mb:.2f} MB")

# Compare sizes of original and quantized models
print_size_of_model(model, "Original_Model")
print_size_of_model(model_quantized, "Quantized_Model")

# Clean up saved files after checking size
os.remove("Original_Model.pt")
os.remove("Quantized_Model.pt")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


NotImplementedError: Could not run 'quantized::layer_norm' with arguments from the 'CPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'quantized::layer_norm' is only available for these backends: [Meta, QuantizedCPU, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradMPS, AutogradXPU, AutogradHPU, AutogradLazy, AutogradMeta, Tracer, AutocastCPU, AutocastXPU, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

Meta: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\MetaFallbackKernel.cpp:23 [backend fallback]
QuantizedCPU: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\native\quantized\cpu\qnormalization.cpp:133 [kernel]
BackendSelect: fallthrough registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\PythonFallbackKernel.cpp:153 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\functorch\DynamicLayer.cpp:497 [backend fallback]
Functionalize: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\FunctionalizeFallbackKernel.cpp:349 [backend fallback]
Named: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\ConjugateFallback.cpp:17 [backend fallback]
Negative: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\native\NegateFallback.cpp:18 [backend fallback]
ZeroTensor: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\ZeroTensorFallback.cpp:86 [backend fallback]
ADInplaceOrView: fallthrough registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\VariableFallbackKernel.cpp:86 [backend fallback]
AutogradOther: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\VariableFallbackKernel.cpp:53 [backend fallback]
AutogradCPU: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\VariableFallbackKernel.cpp:57 [backend fallback]
AutogradCUDA: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\VariableFallbackKernel.cpp:65 [backend fallback]
AutogradXLA: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\VariableFallbackKernel.cpp:69 [backend fallback]
AutogradMPS: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\VariableFallbackKernel.cpp:77 [backend fallback]
AutogradXPU: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\VariableFallbackKernel.cpp:61 [backend fallback]
AutogradHPU: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\VariableFallbackKernel.cpp:90 [backend fallback]
AutogradLazy: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\VariableFallbackKernel.cpp:73 [backend fallback]
AutogradMeta: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\VariableFallbackKernel.cpp:81 [backend fallback]
Tracer: registered at C:\cb\pytorch_1000000000000\work\torch\csrc\autograd\TraceTypeManual.cpp:297 [backend fallback]
AutocastCPU: fallthrough registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\autocast_mode.cpp:209 [backend fallback]
AutocastXPU: fallthrough registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\autocast_mode.cpp:351 [backend fallback]
AutocastCUDA: fallthrough registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\autocast_mode.cpp:165 [backend fallback]
FuncTorchBatched: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\functorch\LegacyBatchingRegistrations.cpp:731 [backend fallback]
BatchedNestedTensor: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\functorch\LegacyBatchingRegistrations.cpp:758 [backend fallback]
FuncTorchVmapMode: fallthrough registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\functorch\VmapModeRegistrations.cpp:27 [backend fallback]
Batched: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\LegacyBatchingRegistrations.cpp:1075 [backend fallback]
VmapMode: fallthrough registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\functorch\TensorWrapper.cpp:207 [backend fallback]
PythonTLSSnapshot: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\PythonFallbackKernel.cpp:161 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\functorch\DynamicLayer.cpp:493 [backend fallback]
PreDispatch: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\PythonFallbackKernel.cpp:165 [backend fallback]
PythonDispatcher: registered at C:\cb\pytorch_1000000000000\work\aten\src\ATen\core\PythonFallbackKernel.cpp:157 [backend fallback]
