## Preparing a Custom Model for Quantization

In [1]:
# Fetch the project repository and switch into it so that its local packages
# (base_model, quantization, fine_tune) can be imported by the cells that follow.
!git clone https://github.com/aashu-0/FineTuning_GPT2.git
%cd FineTuning_GPT2

Cloning into 'FineTuning_GPT2'...
remote: Enumerating objects: 277, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 277 (delta 4), reused 13 (delta 4), pack-reused 264 (from 1)[K
Receiving objects: 100% (277/277), 1.02 MiB | 8.73 MiB/s, done.
Resolving deltas: 100% (153/153), done.
/kaggle/working/FineTuning_GPT2


In [2]:
# Sanity check: print the current working directory to confirm the %cd into
# the cloned repository took effect.
import os
print(os.getcwd())

/kaggle/working/FineTuning_GPT2


In [3]:
!pip -q install tiktoken

In [4]:
# Instantiate the project's GPT-2 configuration with its default values; this
# object is passed to prepare_model_for_quantization when the model is built.
from base_model.config import GPT2Config
config = GPT2Config()

In [5]:
import torch
import torch.nn as nn
from torch.quantization import QuantStub, DeQuantStub
import torch.nn.functional as F
import math
from quantization.gpt2_quantization_prep import prepare_model_for_quantization

In [7]:
# Build the quantization-ready GPT-2. According to the helper's log output it
# creates the model with quantization stubs, freezes the base parameters,
# injects LoRA layers and then loads the fine-tuned checkpoint.
# NOTE(review): lora_rank / lora_alpha presumably have to match the values used
# when the checkpoint was fine-tuned — confirm against the training run.
gpt2 = prepare_model_for_quantization(
    config = config,
    lora_rank = 16,
    lora_alpha = 16,
    model_path = "/kaggle/input/gpt2/pytorch/lora-finetuned/1/gpt2_lorafinetuned.pt"
)

Initializing GPT2 model with quantization stubs...
Initial total parameters: 163,037,184
Initial trainable parameters: 163,037,184

Freezing base model parameters...
Trainable parameters after freezing: 0

Injecting LoRA layers (rank=16, alpha=16)...
Trainable LoRA parameters: 3,175,696

Loading pre-trained weights from: /kaggle/input/gpt2/pytorch/lora-finetuned/1/gpt2_lorafinetuned.pt
Successfully loaded pre-trained weights!

Model preparation complete!
The model is now ready for post-training quantization.


In [8]:
# Display the module tree to see where the QuantStub / DeQuantStub modules sit.
gpt2

GPTModelQuantized(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (emb_dropout): Dropout(p=0.1, inplace=False)
  (main_quant): QuantStub()
  (main_dequant): DeQuantStub()
  (trf_blocks): ModuleList(
    (0-11): 12 x TransformerBlock(
      (ln1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): CausalMultiHeadAttention(
        (qkv): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=2304, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (res_dropout): Dropout(p=0.1, inplace=False)
        (input_quant): QuantStub()
        (input_dequant): DeQuantStub()
        (output_quant): QuantStub()
        (output_dequant): DeQuantStub()
      )
      (ln2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (quant): QuantStub()
        (dequant

In [9]:
# Put the model in evaluation mode before calibration. eval() returns the
# module itself, which is why this cell also displays the model repr.
gpt2.eval()

GPTModelQuantized(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (emb_dropout): Dropout(p=0.1, inplace=False)
  (main_quant): QuantStub()
  (main_dequant): DeQuantStub()
  (trf_blocks): ModuleList(
    (0-11): 12 x TransformerBlock(
      (ln1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): CausalMultiHeadAttention(
        (qkv): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=2304, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (res_dropout): Dropout(p=0.1, inplace=False)
        (input_quant): QuantStub()
        (input_dequant): DeQuantStub()
        (output_quant): QuantStub()
        (output_dequant): DeQuantStub()
      )
      (ln2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (quant): QuantStub()
        (dequant

In [10]:
# Baseline for the later comparison: print the first block's QKV projection
# weight and its dtype while the layer is still an ordinary float Linear.
print('Before Quantization')
print(gpt2.trf_blocks[0].attn.qkv.linear.weight)
print(gpt2.trf_blocks[0].attn.qkv.linear.weight.dtype)

Before Quantization
Parameter containing:
tensor([[-0.4738,  0.0874,  0.0039,  ..., -0.2592,  0.1517, -0.4100],
        [-0.2614,  0.1473,  0.0695,  ..., -0.0164,  0.2170, -0.1924],
        [-0.0978,  0.2387,  0.3668,  ...,  0.1991,  0.1043, -0.2400],
        ...,
        [ 0.0513, -0.0525,  0.1143,  ...,  0.0095,  0.0293, -0.0046],
        [-0.0584, -0.0113,  0.0363,  ..., -0.0516, -0.0429,  0.0070],
        [ 0.0250, -0.0156, -0.0318,  ...,  0.0319, -0.0475,  0.0198]])
torch.float32


In [12]:
model_path = "/kaggle/input/gpt2/pytorch/lora-finetuned/1/gpt2_lorafinetuned.pt"
import os
def get_size(path):
    """Return the size of the file at ``path`` in megabytes (1 MB = 10**6 bytes)."""
    size_in_bytes = os.stat(path).st_size
    return size_in_bytes / 1e6
    
# Report the on-disk size of the fine-tuned checkpoint as the pre-quantization baseline.
# NOTE(review): "qunatization" in the printed label is a typo for "quantization".
print(f"Model size before qunatization: {get_size(model_path):.2f} MB")

Model size before qunatization: 664.95 MB


## Post Training Quantization
Workflow
1. original model -> model (in `float32`)
2. calibration:
   - run a few batches of data through the model to capture **activation statistics** (min/max) which are then used for quantization of model
   - these statistics are used to calculate quantized weights and activations
3. Quantization mapping:
   - for each layer/tensor, conversion from float to int using scale and zero-point.

**Types of PTQ**
1. Static
2. Dynamic

### How to quantize?
1. insert min-max observers in the model using *quantization-aware* modules like `QuantStub` and `DeQuantStub`
2. copy weights from the unquantized model
3. specify the quantization configuration
4. prepare the model for calibration
5. convert to quantized version

From pytorch docs: https://pytorch.org/blog/introduction-to-quantization-on-pytorch/
1. set quantization config for server (x86) deployment
`myModel.qconfig = torch.quantization.get_default_qconfig('fbgemm')`
2. insert observers
`torch.quantization.prepare(myModel, inplace=True)`
3. Calibrate the model and collect statistics by running representative data through it
4. Convert to the quantized version
`torch.quantization.convert(myModel, inplace=True)`

In [13]:
from torch.ao.quantization.qconfig import float_qparams_weight_only_qconfig
from torch.ao.quantization import get_default_qconfig

# Model-wide quantization configuration: the library's default qconfig.
gpt2.qconfig = torch.ao.quantization.default_qconfig

# Embedding layers are the exception: they get the weight-only
# float-qparams qconfig instead of the model-wide default.
embedding_layers = [
    layer for layer in gpt2.modules() if isinstance(layer, torch.nn.Embedding)
]
for embedding in embedding_layers:
    embedding.qconfig = float_qparams_weight_only_qconfig

# Insert observers; `gpt2` is rebound to the model returned by prepare().
gpt2 = torch.quantization.prepare(gpt2)
gpt2

GPTModelQuantized(
  (tok_emb): Embedding(
    50257, 768
    (activation_post_process): PlaceholderObserver(dtype=torch.float32, is_dynamic=False)
  )
  (pos_emb): Embedding(
    1024, 768
    (activation_post_process): PlaceholderObserver(dtype=torch.float32, is_dynamic=False)
  )
  (emb_dropout): Dropout(p=0.1, inplace=False)
  (main_quant): QuantStub(
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (main_dequant): DeQuantStub()
  (trf_blocks): ModuleList(
    (0-11): 12 x TransformerBlock(
      (ln1): LayerNorm(
        (768,), eps=1e-05, elementwise_affine=True
        (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
      )
      (attn): CausalMultiHeadAttention(
        (qkv): LinearWithLoRA(
          (linear): Linear(
            in_features=768, out_features=2304, bias=True
            (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
          )
          (lora): LoRALayer()
        )
        (out_proj): 

In [14]:
# Calibrate using the test set: load the fine-tuning data and build the dataloaders.
import tiktoken
from fine_tune.config import TrainingConfig
from fine_tune.dataset import download_dataset,load_subset,train_test_split, create_dataloader

# The training configuration gets its own name so that the GPT2Config bound to
# `config` earlier in the notebook is not replaced by an object of a different
# type (which would break re-running the model-preparation cell).
train_config = TrainingConfig()
full_dataset = download_dataset(train_config)
sub_dataset = load_subset(full_dataset, train_config)
# split
train_data, test_data, val_data = train_test_split(sub_dataset, train_config)

tokenizer = tiktoken.get_encoding('gpt2')

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# dataloaders
train_loader, test_loader, val_loader = create_dataloader(
    train_data, test_data, val_data, tokenizer, train_config, device=device
)

Data Loaded Successfully
Number of entries in dataset: 51760
Subset Data loaded successfully
Number of entries in subset dataset: 3000
Train set size: 2550
Test set size: 150
Validation set size: 300


In [15]:
# Peek at a single (input, target) batch from the test loader, then stop.
for X, y in test_loader:
    print(f'Input: {X} \n Target: {y}')
    break

Input: tensor([[21106,   318,   281,  ...,  2095,  7138,    13],
        [21106,   318,   281,  ..., 50256, 50256, 50256],
        [21106,   318,   281,  ..., 50256, 50256, 50256],
        ...,
        [21106,   318,   281,  ..., 50256, 50256, 50256],
        [21106,   318,   281,  ..., 50256, 50256, 50256],
        [21106,   318,   281,  ..., 50256, 50256, 50256]], device='cuda:0') 
 Target: tensor([[  318,   281, 12064,  ...,  7138,    13, 50256],
        [  318,   281, 12064,  ...,  -100,  -100,  -100],
        [  318,   281, 12064,  ...,  -100,  -100,  -100],
        ...,
        [  318,   281, 12064,  ...,  -100,  -100,  -100],
        [  318,   281, 12064,  ...,  -100,  -100,  -100],
        [  318,   281, 12064,  ...,  -100,  -100,  -100]], device='cuda:0')


In [16]:
import torch
from tqdm import tqdm
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def calibrate_model(model, dataloader, n_batches= 50):
    """Run calibration batches through ``model`` without tracking gradients.

    The model is switched to evaluation mode and up to ``n_batches`` batches
    from ``dataloader`` are passed through it; the outputs are discarded.

    Args:
        model: The module to calibrate. Its first parameter (if any) decides
            which device the inputs are moved to.
        dataloader: Iterable yielding ``(inputs, labels)`` pairs; the labels
            are ignored.
        n_batches: Maximum number of batches to run. Values <= 0 run none.

    Returns:
        None.
    """
    from itertools import islice

    model.eval()

    # Resolve the target device once instead of on every batch. A model with
    # no parameters leaves the inputs where they are instead of raising
    # StopIteration.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    limit = max(n_batches, 0)

    # Size the progress bar to the batches that will actually run; iterables
    # without a length fall back to an open-ended bar.
    try:
        total = min(limit, len(dataloader))
    except TypeError:
        total = None

    with torch.no_grad():
        # islice stops after `limit` batches without fetching an extra one.
        batches = islice(dataloader, limit)
        for batch_input, _batch_label in tqdm(batches, total=total, desc= "Calibrating"):
            input_ids = batch_input
            if target_device is not None:
                input_ids = input_ids.to(target_device)
            _ = model(input_ids)

In [17]:
# Run calibration over the test dataloader (n_batches is left at its default of 50).
calibrate_model(gpt2, test_loader)

Calibrating: 100%|██████████| 19/19 [02:04<00:00,  6.56s/it]


In [18]:
# Re-display the model after calibration: the observers should now report
# recorded min/max values instead of the initial inf / -inf.
print(f'Check statistics of the various layers')
gpt2

Check statistics of the various layers


GPTModelQuantized(
  (tok_emb): Embedding(
    50257, 768
    (activation_post_process): PlaceholderObserver(dtype=torch.float32, is_dynamic=False)
  )
  (pos_emb): Embedding(
    1024, 768
    (activation_post_process): PlaceholderObserver(dtype=torch.float32, is_dynamic=False)
  )
  (emb_dropout): Dropout(p=0.1, inplace=False)
  (main_quant): QuantStub(
    (activation_post_process): MinMaxObserver(min_val=-4.535839557647705, max_val=3.9612929821014404)
  )
  (main_dequant): DeQuantStub()
  (trf_blocks): ModuleList(
    (0): TransformerBlock(
      (ln1): LayerNorm(
        (768,), eps=1e-05, elementwise_affine=True
        (activation_post_process): MinMaxObserver(min_val=-0.7718712091445923, max_val=0.8626592755317688)
      )
      (attn): CausalMultiHeadAttention(
        (qkv): LinearWithLoRA(
          (linear): Linear(
            in_features=768, out_features=2304, bias=True
            (activation_post_process): MinMaxObserver(min_val=-10.400775909423828, max_val=11.13712501

In [19]:
# Quantize the model using the statistics collected during calibration.
# inplace=False converts a copy, so `gpt2` stays the calibrated float model
# instead of becoming an alias of the quantized one; cells that inspect `gpt2`
# therefore remain re-runnable after this step.
quantized_gpt2 = torch.quantization.convert(gpt2, inplace=False)

In [20]:
# Display the converted model: modules now appear as their quantized
# counterparts, with scale / zero_point shown in the repr.
# NOTE(review): the printed label is the same one used in the calibration cell.
print(f'Check statistics of the various layers')
quantized_gpt2

Check statistics of the various layers


GPTModelQuantized(
  (tok_emb): QuantizedEmbedding(num_embeddings=50257, embedding_dim=768, dtype=torch.quint8, qscheme=torch.per_channel_affine_float_qparams)
  (pos_emb): QuantizedEmbedding(num_embeddings=1024, embedding_dim=768, dtype=torch.quint8, qscheme=torch.per_channel_affine_float_qparams)
  (emb_dropout): QuantizedDropout(p=0.1, inplace=False)
  (main_quant): Quantize(scale=tensor([0.0669]), zero_point=tensor([68]), dtype=torch.quint8)
  (main_dequant): DeQuantize()
  (trf_blocks): ModuleList(
    (0): TransformerBlock(
      (ln1): QuantizedLayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): CausalMultiHeadAttention(
        (qkv): LinearWithLoRA(
          (linear): QuantizedLinear(in_features=768, out_features=2304, scale=0.16958977282047272, zero_point=61, qscheme=torch.per_tensor_affine)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): QuantizedLinear(in_features=768, out_features=768, scale=0.274868309497833

In [21]:
# Same QKV projection as in the "before" cell. On the quantized layer, weight
# is accessed by calling it, and int_repr shows the stored integer values.
print('After Quantization')
print(torch.int_repr(quantized_gpt2.trf_blocks[0].attn.qkv.linear.weight()))

After Quantization
tensor([[-21,   4,   0,  ..., -12,   7, -18],
        [-12,   7,   3,  ...,  -1,  10,  -9],
        [ -4,  11,  16,  ...,   9,   5, -11],
        ...,
        [  2,  -2,   5,  ...,   0,   1,   0],
        [ -3,  -1,   2,  ...,  -2,  -2,   0],
        [  1,  -1,  -1,  ...,   1,  -2,   1]], dtype=torch.int8)


In [22]:
def print_size_of_model(model):
    """Print the serialized size of ``model``'s state dict in megabytes.

    The state dict is saved with ``torch.save`` to a throwaway file, the file
    size is printed as ``Size (MB): <value>`` (1 MB = 10**6 bytes), and the
    file is removed again.

    Args:
        model: Object exposing ``state_dict()`` whose result ``torch.save``
            can serialize.

    Returns:
        None.
    """
    import tempfile

    # A private temporary directory keeps the scratch file from clobbering a
    # same-named file in the working directory, and it is cleaned up even when
    # saving or measuring raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_path = os.path.join(scratch_dir, "temp_delme.p")
        torch.save(model.state_dict(), scratch_path)
        print('Size (MB):', os.path.getsize(scratch_path)/1e6)

In [23]:
# Measure the serialized size of the quantized model's state dict.
print_size_of_model(quantized_gpt2)

Size (MB): 176.690735
