<a href="https://colab.research.google.com/github/adithyab100/smoothquant-mixedprecision/blob/main/examples/smoothquant_group_size_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SmoothQuant Group Size Analysis

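In this notebook, we analyze how the group size used for per-group quantization in SmoothQuant affects model perplexity. We quantize the weights and activations of `facebook/opt-1.3b` to 4 bits with several group sizes and measure perplexity on the WikiText-2 test split for each setting.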

In [1]:
# Fetch the project sources and work from the repository root.
!git clone https://github.com/adithyab100/smoothquant-mixedprecision.git
%cd smoothquant-mixedprecision
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whatever `pip` the subshell resolves to.
%pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113
%pip install transformers==4.36.0 accelerate datasets zstandard
# Install the cloned package through pip; invoking `setup.py install` directly
# is deprecated by setuptools.
%pip install .

Cloning into 'smoothquant-mixedprecision'...
remote: Enumerating objects: 485, done.[K
remote: Counting objects: 100% (290/290), done.[K
remote: Compressing objects: 100% (137/137), done.[K
remote: Total 485 (delta 206), reused 187 (delta 153), pack-reused 195 (from 1)[K
Receiving objects: 100% (485/485), 6.94 MiB | 22.15 MiB/s, done.
Resolving deltas: 100% (292/292), done.
/content/smoothquant-mixedprecision
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Collecting torch==1.12.1+cu113
  Downloading https://download.pytorch.org/whl/cu113/torch-1.12.1%2Bcu113-cp310-cp310-linux_x86_64.whl (1837.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 GB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.13.1+cu113
  Downloading https://download.pytorch.org/whl/cu113/torchvision-0.13.1%2Bcu113-cp310-cp310-linux_x86_64.whl (23.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.4/23.4

In [10]:
%reload_ext autoreload
%autoreload 2

import os

# GPU selection is done through environment variables, assigned here before
# `torch` is imported further down in this cell.
# CUDA_DEVICE_ORDER=PCI_BUS_ID requests device numbering by PCI bus id;
# CUDA_VISIBLE_DEVICES="0" restricts the process to device 0 under that ordering.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaMLP,
)
from transformers import LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from smoothquant.smooth import smooth_lm
from smoothquant.fake_quant import quantize_llama_like, quantize_opt
import tqdm
import gc
from functools import partial
from datasets import load_dataset


In [5]:
class Evaluator:
    """Perplexity evaluator over fixed-length windows of a tokenized corpus.

    The corpus is joined with blank lines and tokenized once at construction
    time. ``evaluate`` then slices it into consecutive, non-overlapping windows
    of ``seq_len`` tokens and scores each window with the given model.

    Args:
        dataset: Mapping-like object whose ``"text"`` entry is an iterable of
            strings.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result exposes ``input_ids``.
        device: Device on which the tokenized corpus is stored.
        n_samples: Number of windows to score. A falsy value (``None``/``0``)
            means "every full window available". Values larger than the number
            of available windows are clamped.
        seq_len: Window length in tokens. Defaults to 2048.

    Raises:
        ValueError: If ``seq_len`` is smaller than 2 (no next-token pair fits).
    """

    def __init__(self, dataset, tokenizer, device, n_samples=10, seq_len=2048):
        if seq_len < 2:
            raise ValueError("seq_len must be at least 2 to form next-token pairs")

        self.tokenizer = tokenizer
        self.device = device
        self.n_samples = n_samples
        self.seq_len = seq_len

        # From here on ``self.dataset`` is the (1, total_tokens) tensor of ids.
        self.dataset = tokenizer(
            "\n\n".join(dataset["text"]), return_tensors="pt"
        ).input_ids.to(device)

    @torch.no_grad()
    def evaluate(self, model):
        """Return the perplexity of ``model`` as a 0-dim tensor.

        Args:
            model: Module exposing ``eval()``, ``device`` and a forward call
                that returns an object with a ``logits`` attribute.

        Raises:
            ValueError: If the corpus does not contain a single full window.
        """
        model.eval()
        seq_len = self.seq_len

        # Never read past the end of the corpus: a partial or empty window
        # would skew (or crash) the loss computation.
        n_available = self.dataset.size(1) // seq_len
        n_samples = min(self.n_samples, n_available) if self.n_samples else n_available
        if n_samples <= 0:
            raise ValueError(
                f"Need at least one full window of {seq_len} tokens to evaluate; "
                f"corpus has {self.dataset.size(1)} tokens"
            )

        loss_fct = nn.CrossEntropyLoss()
        nlls = []
        for i in tqdm.tqdm(range(n_samples), desc="Evaluating..."):
            batch = self.dataset[:, (i * seq_len) : ((i + 1) * seq_len)].to(model.device)
            lm_logits = model(batch).logits

            # Position t predicts token t + 1, so drop the last logit and the
            # first label.
            shift_logits = lm_logits[:, :-1, :].contiguous().float()
            # Take labels from the batch and move them next to the logits; the
            # stored corpus may live on a different device than the model.
            shift_labels = batch[:, 1:].to(shift_logits.device)

            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.reshape(-1)
            )
            # Scale the mean loss by the window length; the same factor is
            # divided out again below.
            nlls.append(loss.float() * seq_len)

        return torch.exp(torch.stack(nlls).sum() / (n_samples * seq_len))

In [12]:
def get_calib_dataset(tokenizer=None, n_samples=256, block_size=512):
    """Build fixed-size calibration blocks from the WikiText-2 validation split.

    Lines are drawn from the split after shuffling with a fixed seed (42),
    stripped, and encoded with ``tokenizer.encode``. Lines whose encoding is
    longer than ``block_size`` tokens, or empty, are skipped. Collection stops
    once ``n_samples`` lines have been kept. The kept encodings are
    concatenated and cut into consecutive blocks of ``block_size`` tokens; a
    trailing partial block is dropped.

    Args:
        tokenizer: Object providing ``encode(str) -> list[int]``. Required,
            despite the ``None`` default kept for signature compatibility.
        n_samples: Number of encoded lines to collect before stopping.
        block_size: Length in tokens of each returned block.

    Returns:
        A list of tensors, each of shape ``(1, block_size)``.

    Raises:
        ValueError: If ``tokenizer`` is ``None`` or no usable line was found.
    """
    # Fail fast with a clear message instead of an AttributeError on
    # ``None.encode`` after the dataset has already been downloaded.
    if tokenizer is None:
        raise ValueError("get_calib_dataset requires a tokenizer")

    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split="validation")
    dataset = dataset.shuffle(seed=42)

    samples = []
    for data in dataset:
        line_encoded = tokenizer.encode(data["text"].strip())
        if len(line_encoded) > block_size or len(line_encoded) == 0:
            continue
        samples.append(torch.tensor([line_encoded]))
        if len(samples) == n_samples:
            break

    if not samples:
        raise ValueError("No calibration samples could be collected from the dataset")

    # now concatenate all samples and split according to block size
    cat_samples = torch.cat(samples, dim=1)
    n_split = cat_samples.shape[1] // block_size
    print(f" * Split into {n_split} blocks")
    return [cat_samples[:, i*block_size:(i+1)*block_size] for i in range(n_split)]

@torch.no_grad()
def get_calib_feat(model, tokenizer):
    """Collect per-channel input statistics for every ``nn.Linear`` in ``model``.

    A forward hook is attached to each linear layer. For every forward pass
    over the calibration blocks returned by ``get_calib_dataset(tokenizer)``,
    the hook records the mean absolute value of the layer input along its last
    dimension (note: a mean, although the hook is named "max").

    Args:
        model: Module to calibrate; called as ``model(input_ids)``.
        tokenizer: Tokenizer forwarded to ``get_calib_dataset``.

    Returns:
        Dict mapping each linear layer's module name to a list of 1-D CPU
        tensors, one entry per forward call of that layer.
    """
    input_dict = dict()

    def stat_input_max_hook(m, x, y, name):
        if isinstance(x, tuple):
            x = x[0]
        # reshape (not view) so non-contiguous activations are handled too.
        x_stat = x.reshape(-1, x.shape[-1]).abs().mean(dim=0).cpu().detach()
        input_dict.setdefault(name, []).append(x_stat)

    print("Collecting activation scales...")
    samples = get_calib_dataset(tokenizer)

    # Feed inputs on the device the model actually lives on; fall back to the
    # CUDA-if-available choice when that cannot be determined from parameters.
    first_param = next(model.parameters(), None)
    if first_param is None or first_param.device.type == "meta":
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = first_param.device

    hooks = [
        m.register_forward_hook(partial(stat_input_max_hook, name=name))
        for name, m in model.named_modules()
        if isinstance(m, nn.Linear)
    ]
    try:
        for input_ids in tqdm.tqdm(samples):
            model(input_ids.to(device))
    finally:
        # Always detach the hooks, even if a forward pass fails, so the model
        # is not left instrumented.
        for hook in hooks:
            hook.remove()

    return input_dict

In [13]:
def evaluate_group_size(model_path, group_sizes, device="cuda" if torch.cuda.is_available() else "cpu"):
    """Measure perplexity of a 4-bit per-group quantized model for each group size.

    The model is loaded once to collect activation statistics with
    ``get_calib_feat``. For each group size a fresh copy is then loaded,
    quantized with ``quantize_opt`` (per-group weights and activations,
    ``quant_bits=4``) and scored on 40 windows of the WikiText-2 test split.

    Args:
        model_path: Identifier or path passed to ``from_pretrained``.
        group_sizes: Iterable of group sizes to evaluate.
        device: Device on which the tokenized evaluation corpus is stored.

    Returns:
        List of 0-dim CPU tensors, one perplexity per entry of ``group_sizes``,
        in the same order.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    evaluator = Evaluator(dataset, tokenizer, device, n_samples=40)

    # Calibrate on an unquantized copy, then release it right away so it does
    # not sit in memory next to the first quantized model.
    calib_model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
    input_feat = get_calib_feat(calib_model, tokenizer)
    del calib_model
    gc.collect()
    torch.cuda.empty_cache()

    perplexities = []
    for group_size in group_sizes:
        print(f"\nTesting group size: {group_size}")

        # Start from fresh weights for every configuration.
        model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
        # NOTE(review): salient_prop=0 presumably disables any higher-precision
        # "salient" channels -- confirm against smoothquant.fake_quant.
        model_w4a4 = quantize_opt(
            model,
            weight_quant="per_group",
            act_quant="per_group",
            input_feat=input_feat,
            salient_prop=0,
            quant_bits=4,
            group_size=group_size,
        )

        # Keep the result on the CPU: a list of device tensors cannot be
        # converted by NumPy/matplotlib and would pin accelerator memory.
        ppl = evaluator.evaluate(model_w4a4).detach().cpu()
        perplexities.append(ppl)
        print(f"Perplexity for group size {group_size}: {ppl}")

        # Clear memory before loading the next copy.
        del model
        del model_w4a4
        gc.collect()
        torch.cuda.empty_cache()

    return perplexities

In [8]:
def plot_results(group_sizes, perplexities, output_path="group_size_perplexity.png"):
    """Save a log-log line plot of perplexity against group size.

    Args:
        group_sizes: Sequence of group sizes (x values).
        perplexities: Sequence of perplexities (y values); entries may be
            Python numbers or 0-dim tensors on any device.
        output_path: File the figure is written to.

    Returns:
        None. The figure is written to ``output_path`` and then closed.
    """
    # Imported here so the function does not depend on a notebook-level `plt`
    # binding being present.
    import matplotlib.pyplot as plt

    # float() accepts plain numbers and 0-dim tensors alike (including ones on
    # a GPU), which matplotlib cannot convert by itself.
    ppl_values = [float(p) for p in perplexities]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(list(group_sizes), ppl_values, 'bo-')
    ax.set_xlabel('Group Size')
    ax.set_ylabel('Perplexity')
    ax.set_title('Model Perplexity vs. Group Size')
    ax.grid(True)
    ax.set_yscale('log')  # Log scale for perplexity
    ax.set_xscale('log')  # Log scale for group size
    fig.savefig(output_path)
    plt.close(fig)

In [None]:
# Driver cell: sweep the group sizes, save the plot, and print a summary.
# Model identifier handed to evaluate_group_size (which loads it via from_pretrained).
model_path = "facebook/opt-1.3b"
group_sizes = [8, 16, 32, 64, 128]  # Different group sizes to test

# Run evaluation: returns one perplexity per entry of group_sizes, in order.
perplexities = evaluate_group_size(model_path, group_sizes)

# Plot results: written to plot_results' default output_path
# ("group_size_perplexity.png"); nothing is displayed inline.
plot_results(group_sizes, perplexities)

# Print final results
print("\nFinal Results:")
for size, ppl in zip(group_sizes, perplexities):
    print(f"Group Size: {size}, Perplexity: {ppl}")



Collecting activation scales...
 * Split into 33 blocks


100%|██████████| 33/33 [00:07<00:00,  4.57it/s]



Testing group size: 8


Quantizing OPT model: 270it [00:54,  4.93it/s]
Evaluating...: 100%|██████████| 40/40 [00:51<00:00,  1.28s/it]


Perplexity for group size 8: 15.688404083251953

Testing group size: 16


Quantizing OPT model: 18it [00:03,  5.07it/s]