<a href="https://colab.research.google.com/github/adithyab100/smoothquant-mixedprecision/blob/main/examples/smoothquant_group_size_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SmoothQuant Group Size Analysis

In this notebook, we analyze the impact of different group sizes on model perplexity when using group quantization in SmoothQuant.

In [1]:
# Fetch the project sources and make the repository root the working directory.
# NOTE(review): re-running this cell in the same session will try to clone
# into an already-existing directory.
!git clone https://github.com/adithyab100/smoothquant-mixedprecision.git
%cd smoothquant-mixedprecision
# Pinned PyTorch stack, pulled from the CUDA 11.3 wheel index.
!pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113
# transformers is pinned; accelerate, datasets and zstandard are left unpinned.
!pip install transformers==4.36.0 accelerate datasets zstandard
# Install the cloned package itself (presumably what makes the `smoothquant`
# imports in the next cell resolve).
!python setup.py install

Cloning into 'smoothquant-mixedprecision'...
remote: Enumerating objects: 529, done.[K
remote: Counting objects: 100% (333/333), done.[K
remote: Compressing objects: 100% (178/178), done.[K
remote: Total 529 (delta 234), reused 198 (delta 155), pack-reused 196 (from 1)[K
Receiving objects: 100% (529/529), 6.96 MiB | 13.10 MiB/s, done.
Resolving deltas: 100% (321/321), done.
/content/smoothquant-mixedprecision
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Collecting torch==1.12.1+cu113
  Downloading https://download.pytorch.org/whl/cu113/torch-1.12.1%2Bcu113-cp310-cp310-linux_x86_64.whl (1837.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 GB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.13.1+cu113
  Downloading https://download.pytorch.org/whl/cu113/torchvision-0.13.1%2Bcu113-cp310-cp310-linux_x86_64.whl (23.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.4/23.4

In [2]:
%reload_ext autoreload
%autoreload 2

import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaMLP,
)
import matplotlib.pyplot as plt
from transformers import LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from smoothquant.smooth import smooth_lm
from smoothquant.fake_quant import quantize_llama_like, quantize_opt
from smoothquant.model_size import get_model_size

import tqdm
import gc
from functools import partial
from datasets import load_dataset

# Size units expressed in bits (one byte = 8 bits); each step up is x1024.
# Used below to convert a model size into MiB.
Byte = 8
KiB, MiB, GiB = (Byte * 1024 ** power for power in (1, 2, 3))


In [3]:
class Evaluator:
    """Perplexity evaluator over fixed-length windows of a tokenized corpus.

    The whole corpus (``dataset["text"]`` joined with blank lines) is
    tokenized once at construction time; ``evaluate`` then scores consecutive,
    non-overlapping windows of ``seq_len`` tokens.

    Args:
        dataset: Mapping-like object whose ``"text"`` entry is an iterable of
            strings.
        tokenizer: Callable returning an object with ``input_ids`` when called
            as ``tokenizer(text, return_tensors="pt")``.
        device: Device on which the tokenized corpus is stored.
        n_samples: Maximum number of windows to score. A falsy value
            (``0``/``None``) means "every full window in the corpus".
        seq_len: Window length in tokens (defaults to the previously
            hard-coded 2048).
    """

    def __init__(self, dataset, tokenizer, device, n_samples=10, seq_len=2048):
        self.tokenizer = tokenizer
        self.device = device
        self.n_samples = n_samples
        self.seq_len = seq_len

        # Token ids of the whole corpus; indexed below as [:, start:stop],
        # i.e. expected to be a (1, n_tokens) tensor.
        self.dataset = tokenizer(
            "\n\n".join(dataset["text"]), return_tensors="pt"
        ).input_ids.to(device)

    @torch.no_grad()
    def evaluate(self, model):
        """Return the perplexity of ``model`` as a 0-dim tensor.

        Raises:
            ValueError: If the corpus does not contain a single full window.
        """
        model.eval()
        seq_len = self.seq_len

        # Never ask for more windows than the corpus holds: a window past the
        # end is empty and would turn the whole result into NaN.
        available = self.dataset.size(1) // seq_len
        n_samples = min(self.n_samples, available) if self.n_samples else available
        if n_samples <= 0:
            raise ValueError(
                f"Corpus has {self.dataset.size(1)} tokens, which is not enough "
                f"for one window of {seq_len} tokens"
            )

        loss_fct = nn.CrossEntropyLoss()
        nlls = []
        for i in tqdm.tqdm(range(n_samples), desc="Evaluating..."):
            batch = self.dataset[:, (i * seq_len) : ((i + 1) * seq_len)].to(model.device)
            lm_logits = model(batch).logits
            shift_logits = lm_logits[:, :-1, :].contiguous().float()
            # Take the labels from the batch that was actually fed to the
            # model and keep them on the logits' device, so corpus and model
            # may live on different devices.
            shift_labels = batch[:, 1:].to(shift_logits.device)
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.reshape(-1)
            )
            # The mean loss is scaled by seq_len and the total is divided by
            # n_samples * seq_len below, so the result is exp(mean window loss).
            nlls.append(loss.float() * seq_len)

        return torch.exp(torch.stack(nlls).sum() / (n_samples * seq_len))

In [4]:
def get_calib_dataset(tokenizer=None, n_samples=256, block_size=512):
    """Build fixed-size calibration blocks from the WikiText-2 validation split.

    Lines are visited in a seeded (42) shuffled order, stripped and encoded
    with ``tokenizer.encode``. Lines longer than ``block_size`` tokens and
    lines that encode to nothing are skipped. Collection stops once
    ``n_samples`` lines have been kept. The kept lines are concatenated along
    the token axis and cut into consecutive ``block_size``-token blocks; any
    trailing remainder is dropped.

    Returns:
        A list of tensors, each of shape ``(1, block_size)``.
    """
    shuffled = load_dataset('wikitext', 'wikitext-2-raw-v1', split="validation").shuffle(seed=42)

    kept = []
    for record in shuffled:
        token_ids = tokenizer.encode(record["text"].strip())
        if len(token_ids) > block_size:
            continue
        encoded = torch.tensor([token_ids])
        if encoded.numel() == 0:
            continue
        kept.append(encoded)
        if len(kept) == n_samples:
            break

    # Join everything on the token axis, then slice into whole blocks.
    joined = torch.cat(kept, dim=1)
    n_split = joined.shape[1] // block_size
    print(f" * Split into {n_split} blocks")
    block_starts = range(0, n_split * block_size, block_size)
    return [joined[:, start:start + block_size] for start in block_starts]

@torch.no_grad()
def get_calib_feat(model, tokenizer):
    """Record per-channel input statistics of every ``nn.Linear`` in ``model``.

    A forward hook is attached to each ``nn.Linear`` module. For every
    calibration block produced by ``get_calib_dataset(tokenizer)`` the hook
    stores the mean absolute value of the layer input per last-dimension
    channel (despite the ``max`` in the hook's name, it is a mean).

    Args:
        model: Module to calibrate; called as ``model(input_ids)``.
        tokenizer: Forwarded to ``get_calib_dataset``.

    Returns:
        dict mapping module name -> list of CPU tensors, one per forward call
        of that module.
    """
    input_dict = dict()

    def stat_input_max_hook(m, x, y, name):
        if isinstance(x, tuple):
            x = x[0]
        # Flatten all leading dims, then average |x| over them per channel.
        x_max = x.view(-1, x.shape[-1]).abs().mean(dim=0).cpu().detach()
        input_dict.setdefault(name, []).append(x_max)

    hooks = [
        m.register_forward_hook(partial(stat_input_max_hook, name=name))
        for name, m in model.named_modules()
        if isinstance(m, nn.Linear)
    ]

    print("Collecting activation scales...")
    # Feed inputs to wherever the model's (materialised) parameters live; fall
    # back to the previous "cuda if available" choice when that is unknown.
    device = next(
        (p.device for p in model.parameters() if p.device.type != "meta"), None
    )
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    try:
        samples = get_calib_dataset(tokenizer)
        for input_ids in tqdm.tqdm(samples):
            model(input_ids.to(device))
    finally:
        # Always detach the hooks, even if a forward pass fails, so the model
        # is not left instrumented.
        for hook in hooks:
            hook.remove()
    return input_dict

In [5]:
def evaluate_group_size(model_path, group_sizes, salient_prop=0, device="cuda" if torch.cuda.is_available() else "cpu"):
    """Measure perplexity and model size for several quantization group sizes.

    Activation statistics are collected once from the fp16 model. Then, for
    each entry of ``group_sizes``, a fresh fp16 model is loaded, quantized
    with ``quantize_llama_like`` (per-group weights and activations, 4 bits)
    and scored on the WikiText-2 test split with ``Evaluator``.

    Args:
        model_path: Identifier/path passed to ``from_pretrained`` for both the
            model and the tokenizer.
        group_sizes: Iterable of group sizes to test.
        salient_prop: Forwarded unchanged to ``quantize_llama_like`` and
            ``get_model_size``; its exact meaning is defined by those helpers.
            Defaults to 0.
        device: Device holding the evaluation corpus and the quantized model.

    Returns:
        ``(perplexities, model_sizes)``: two lists aligned with
        ``group_sizes``. Perplexities are Python floats. Model sizes are the
        value returned by ``get_model_size`` divided by ``MiB``.
    """
    tokenizer = LlamaTokenizer.from_pretrained(model_path)
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    evaluator = Evaluator(dataset, tokenizer, device, n_samples=40)

    # Calibration pass. The calibration model is released before the loop so
    # two full fp16 copies are never resident at the same time.
    calib_model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")
    try:
        input_feat = get_calib_feat(calib_model, tokenizer)
    finally:
        del calib_model
        gc.collect()
        torch.cuda.empty_cache()

    perplexities = []
    model_sizes = []

    for group_size in group_sizes:
        print(f"\nTesting group size: {group_size}")

        model_fp16 = None
        model_w4a4 = None
        try:
            model_fp16 = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map=None)
            # Honour the caller's `device` rather than re-deriving it here.
            model_fp16 = model_fp16.to(device)
            model_w4a4 = quantize_llama_like(model_fp16, weight_quant="per_group", act_quant="per_group", input_feat=input_feat, salient_prop=salient_prop, quant_bits=4, group_size=group_size)

            # get model size
            model_sz = get_model_size(model_w4a4, data_width=4, salient_prop=salient_prop, group_size=group_size)
            model_sizes.append(model_sz / MiB)  # can change depending on unit

            # Evaluate; store a plain float so results can be printed and
            # plotted without keeping device tensors alive.
            ppl = float(evaluator.evaluate(model_w4a4))
            perplexities.append(ppl)
            print(f"Perplexity for group size {group_size}: {ppl}")
        finally:
            # Clear memory even if this group size failed.
            del model_fp16
            del model_w4a4
            gc.collect()
            torch.cuda.empty_cache()

    return perplexities, model_sizes

In [7]:
model_path = "NousResearch/Llama-2-7b-hf"
group_sizes = [4, 8, 16, 32, 64, 128, 256, 512, 1024]  # Different group sizes to test

# Run evaluation. evaluate_group_size returns a (perplexities, model_sizes)
# pair, so unpack both instead of binding the whole tuple to `perplexities`.
perplexities, model_sizes = evaluate_group_size(model_path, group_sizes, 0)

# Print final results
print("\nFinal Results:")
for size, ppl, model_sz in zip(group_sizes, perplexities, model_sizes):
    print(f"Group Size: {size}, Perplexity: {float(ppl)}, Model Size (MiB): {model_sz}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Collecting activation scales...
 * Split into 39 blocks


100%|██████████| 39/39 [00:08<00:00,  4.41it/s]



Testing group size: 4


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [None]:
# Recorded OPT values (presumably OPT-1.3B, the model named in the plot title
# in the next cell — TODO confirm). One perplexity per entry of group_sizes.
group_sizes = [4, 8, 16, 32, 64, 128, 256]
# salient weight prop = 0
perplexity_000 = [15.254583358764648, 15.688404083251953, 16.14592170715332, 17.27976417541504, 28.903629302978516, 71.420654296875, 2041.39453125]
# salient weight prop = 0.01
perplexity_001 = [15.390620231628418, 15.739761352539062, 16.02465057373047, 16.346904754638672, 16.42510414123535, 18.289968490600586, 27.982759475708008]
# salient weight prop = 0.05
perplexity_005= [15.335508346557617, 15.843849182128906, 16.056209564208984, 16.3746395111084, 16.626380920410156, 17.268110275268555, 17.88892364501953]
# salient weight prop = 0.1
perplexity_01 = [15.396620750427246, 15.694783210754395, 15.999234199523926, 16.205415725708008, 16.466209411621094, 16.95526695251465, 17.6525936126709]

In [None]:
import matplotlib.pyplot as plt

def plot_results(group_sizes, perplexity_000, perplexity_001, perplexity_005, perplexity_01, output_path="group_size_perplexity_pretty.png"):
    """Plot perplexity against group size for four salient proportions.

    Draws one curve per salient proportion (0, 0.01, 0.05, 0.1) on log-log
    axes, writes the figure to ``output_path`` and closes it. Nothing is
    returned and the figure is not displayed.

    Args:
        group_sizes: x values shared by all four curves.
        perplexity_000, perplexity_001, perplexity_005, perplexity_01:
            y values for salient proportion 0, 0.01, 0.05 and 0.1.
        output_path: Destination passed to ``plt.savefig``.
    """
    # One row per curve: (y values, marker, legend label, colour, line style).
    curves = (
        (perplexity_000, 'o', 'Salient Prop = 0', 'darkblue', '-'),
        (perplexity_001, 's', 'Salient Prop = 0.01', 'mediumseagreen', '--'),
        (perplexity_005, '^', 'Salient Prop = 0.05', 'orange', '-.'),
        (perplexity_01, 'd', 'Salient Prop = 0.1', 'darkviolet', ':'),
    )
    axis_label_font = dict(fontsize=12)
    tick_font = dict(fontsize=10)

    plt.figure(figsize=(10, 6))

    # Distinct marker/colour/line style per curve, shared size and width.
    for values, marker, label, color, linestyle in curves:
        plt.plot(
            group_sizes,
            values,
            marker=marker,
            label=label,
            color=color,
            linestyle=linestyle,
            markersize=8,
            linewidth=2,
        )

    # Labels and title.
    plt.xlabel('Group Size', **axis_label_font)
    plt.ylabel('Perplexity', **axis_label_font)
    plt.title('Model (OPT-1.3B) Perplexity vs. Group Size for Different Salient Proportions', fontsize=14)

    # Log scale on both axes (perplexity first, then group size).
    plt.yscale('log')
    plt.xscale('log')

    # Grid on major and minor ticks, then tick label sizes.
    plt.grid(True, which='both', linestyle='--', linewidth=0.5, alpha=0.7)
    plt.xticks(**tick_font)
    plt.yticks(**tick_font)

    plt.legend(title="Salient Proportion", loc='upper left', fontsize=10)

    # Write to disk and release the figure.
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()

# Provided values. These repeat, number for number, the lists assigned in the
# earlier results cell, so this cell can run on its own.
group_sizes = [4, 8, 16, 32, 64, 128, 256]
perplexity_000 = [15.254583358764648, 15.688404083251953, 16.14592170715332, 17.27976417541504, 28.903629302978516, 71.420654296875, 2041.39453125]
perplexity_001 = [15.390620231628418, 15.739761352539062, 16.02465057373047, 16.346904754638672, 16.42510414123535, 18.289968490600586, 27.982759475708008]
perplexity_005 = [15.335508346557617, 15.843849182128906, 16.056209564208984, 16.3746395111084, 16.626380920410156, 17.268110275268555, 17.88892364501953]
perplexity_01 = [15.396620750427246, 15.694783210754395, 15.999234199523926, 16.205415725708008, 16.466209411621094, 16.95526695251465, 17.6525936126709]

# Call the function. The figure is written to the given path and then closed,
# so nothing is rendered inline.
plot_results(group_sizes, perplexity_000, perplexity_001, perplexity_005, perplexity_01, output_path="group_size_perplexity_no_labels.png")


In [None]:
# Run evaluation for a single group size. salient_prop is passed explicitly
# (0, the same value as the full sweep) and the returned
# (perplexities, model_sizes) pair is unpacked.
perplexities, model_sizes = evaluate_group_size(model_path, [4], 0)

