Support Weight-Only quantization on CPU device with QBits backend (#437)
Signed-off-by: Cheng Penghui <penghui.cheng@intel.com>
Signed-off-by: Cheng, Penghui <penghui.cheng@intel.com>
Co-authored-by: Casper Hansen <casperbh.96@gmail.com>
PenghuiCheng and casper-hansen committed Jun 8, 2024
1 parent 6a46ad6 commit 2627364
Showing 15 changed files with 415 additions and 53 deletions.
23 changes: 23 additions & 0 deletions README.md
@@ -232,6 +232,29 @@ GPU: 2x NVIDIA GeForce RTX 4090
| Mixtral | 46.7B | 🔵GEMM | 1 | 2048 | 2048 | 2446.15 | 77.0516 | 27.98 GB (59.15%) |
| Mixtral | 46.7B | 🔵GEMM | 1 | 4096 | 4096 | 1985.78 | 77.5689 | 34.65 GB (73.26%) |

### CPU

- CPU: INTEL(R) XEON(R) PLATINUM 8592+ with 8-channel 4800MT/s memory.
- Command: `python examples/benchmark.py --model_path <hf_model> --batch_size 1`

| Model | Size | Batch Size | Prefill Length | Decode Length | Prefill tokens/s | Decode tokens/s | Memory (RAM) |
|--------:|------:|-----------:|-------------:|-----------------:|----------------:|---------------:|:------------------|
| Mixtral | 7B | 1 | 64 | 64 | 389.24 | 16.01 | 5.59 GB (0.02%) |
| Mixtral | 7B | 1 | 2048 | 2048 | 1412 | 17.76 | 6.29 GB (0.03%) |
| Vicuna | 7B | 1 | 64 | 64 | 346 | 18.13 | 8.18 GB (0.03%) |
| Vicuna | 7B | 1 | 2048 | 2048 | 1023.4 | 18.18 | 8.80 GB (0.04%) |
| LLaMA2 | 13B | 1 | 64 | 64 | 160.24 | 9.87 | 14.65 GB (0.06%) |
| LLaMA2 | 13B | 1 | 2048 | 2048 | 592.35 | 9.93 | 16.87 GB (0.07%) |
| Mosaicml | 7B | 1 | 64 | 64 | 433.17 | 18.79 | 4.60 GB (0.02%) |
| Mosaicml | 7B | 1 | 2048 | 2048 | 404.25 | 19.91 | 4.75 GB (0.02%) |
| Falcon | 7B | 1 | 64 | 64 | 303.16 | 14.41 | 5.18 GB (0.02%) |
| Falcon | 7B | 1 | 2048 | 2048 | 634.57 | 15.55 | 5.80 GB (0.02%) |
| CodeLlama | 34B | 1 | 64 | 64 | 153.73 | 4.23 | 29.00 GB (0.12%) |
| CodeLlama | 34B | 1 | 2048 | 2048 | 274.25 | 4.38 | 35.21 GB (0.15%) |
| Deepseek-coder | 33B | 1 | 64 | 64 | 83.08 | 4.07 | 22.16 GB (0.09%) |
| Deepseek-coder | 33B | 1 | 2048 | 2048 | 296.04 | 4.33 | 37.05 GB |


## Reference

If you find AWQ useful or relevant to your research, you can cite their [paper](https://arxiv.org/abs/2306.00978):
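A minimal loading sketch for the CPU path exercised by the benchmarks above, assuming an already-quantized GEMM checkpoint and the usual AutoAWQ loading and generation flow; `<quantized_model_path>` is a placeholder, not a real repository:

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = "<quantized_model_path>"  # placeholder for any AWQ GEMM checkpoint

# use_qbits=True maps the quantized layers to WQLinear_QBits; fuse_layers is
# forced off on CPU anyway, so it is passed explicitly here only for clarity.
model = AutoAWQForCausalLM.from_quantized(quant_path, use_qbits=True, fuse_layers=False)
tokenizer = AutoTokenizer.from_pretrained(quant_path)

inputs = tokenizer("AWQ on CPU with the QBits backend", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```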
2 changes: 2 additions & 0 deletions awq/models/auto.py
@@ -82,6 +82,7 @@ def from_quantized(
fuse_layers=True,
use_exllama=False,
use_exllama_v2=False,
use_qbits=False,
batch_size=1,
safetensors=True,
device_map="balanced",
@@ -109,6 +110,7 @@ def from_quantized(
fuse_layers=fuse_layers,
use_exllama=use_exllama,
use_exllama_v2=use_exllama_v2,
use_qbits=use_qbits,
safetensors=safetensors,
device_map=device_map,
max_memory=max_memory,
54 changes: 43 additions & 11 deletions awq/models/base.py
@@ -1,6 +1,7 @@
import os
import gc
import json
import logging
import torch
import transformers
import torch.nn as nn
@@ -15,19 +16,22 @@
from awq.modules.linear import (
WQLinear_GEMM,
WQLinear_GEMV,
WQLinear_QBits,
WQLinear_Marlin,
WQLinear_Exllama,
WQLinear_ExllamaV2,
WQLinear_GEMVFast,
marlin_post_init,
exllama_post_init,
exllamav2_post_init,
qbits_post_init,
)
from awq.utils.module import (
get_named_linears,
set_op_by_name,
exclude_layers_to_not_quantize,
)
from awq.utils.utils import get_best_device, qbits_available
from transformers import (
AutoConfig,
PreTrainedModel,
@@ -46,6 +50,10 @@
from awq.quantize.quantizer import AwqQuantizer
from awq.utils.module import get_named_linears, set_op_by_name

if qbits_available:
from intel_extension_for_transformers.qbits import check_isa_supported


# Since we support different `AutoModelForxxx` from transformers
# we need to define a custom mapping dict as below:
TRANSFORMERS_AUTO_MAPPING_DICT = {
@@ -389,6 +397,9 @@ def from_quantized(
use_exllama_v2: Annotated[
bool, Doc("Whether to map the weights to ExLlamaV2 kernels.")
] = False,
use_qbits: Annotated[
bool, Doc("Whether to map the weights to qbits kernels for CPU device.")
] = False,
device_map: Annotated[
Union[str, Dict],
Doc(
@@ -439,6 +450,14 @@
trust_remote_code=trust_remote_code,
)

use_cpu_qbits = use_qbits or get_best_device() == "cpu"
if use_cpu_qbits:
if not qbits_available:
raise ImportError("Please install intel-extension-for-transformers with "
"`pip install intel-extension-for-transformers` for 'qbits' kernel!")

fuse_layers = False
logging.warn("Unsupport fuse_layers featrue for CPU device with QBits backend!")
# Prepare WQLinear layers, replace nn.Linear
self._load_quantized_modules(
self,
@@ -447,6 +466,7 @@
quant_config.version,
use_exllama=use_exllama,
use_exllama_v2=use_exllama_v2,
use_qbits=use_cpu_qbits,
)

model.tie_weights()
@@ -467,9 +487,13 @@
if fuse_layers:
self.fuse_layers(model)

if quant_config.version == "marlin":
if use_cpu_qbits:
dtype = torch.bfloat16 if check_isa_supported("AMX") else torch.float32
model.to(dtype=dtype, device="cpu")
# repack qweight to match the QBits kernel.
model = qbits_post_init(model)
elif quant_config.version == "marlin":
model = marlin_post_init(model)

elif use_exllama:
# creates q4 handle
model = exllama_post_init(model)
@@ -507,10 +531,10 @@ def _load_config(
ignore_patterns.extend(["*.pt*", "*.bin*", "consolidated*"])
else:
ignore_patterns.append("*.safetensors*")

if download_kwargs is None:
download_kwargs = {}

if "ignore_patterns" in download_kwargs:
download_kwargs_ignore_patterns = download_kwargs.pop("ignore_patterns")

@@ -551,11 +575,11 @@ def _load_config(
return model_weights_path, config, quant_config

def _load_quantized_modules(
self, model, quant_config, version, use_exllama, use_exllama_v2
self, model, quant_config, version, use_exllama, use_exllama_v2, use_qbits=False
):
# Real quantization of weights
assert not (
version == "gemv" and (use_exllama or use_exllama_v2)
version == "gemv" and (use_exllama or use_exllama_v2 or use_qbits)
), "Exllama kernels only support GEMM version."

# Get blocks of model
@@ -577,7 +601,9 @@ def _load_quantized_modules(

# Replace nn.Linear with WQLinear
for name, module in named_linears.items():
if version == "marlin":
if use_qbits:
q_linear_module = WQLinear_QBits
elif version == "marlin":
q_linear_module = WQLinear_Marlin
elif use_exllama:
q_linear_module = WQLinear_Exllama
Expand All @@ -590,13 +616,19 @@ def _load_quantized_modules(
elif version == "gemv_fast":
q_linear_module = WQLinear_GEMVFast

q_linear = q_linear_module.from_linear(
module, quant_config.w_bit, quant_config.q_group_size, True
)
if use_qbits:
q_linear = q_linear_module.from_linear(module,
quant_config.w_bit,
quant_config.q_group_size,
True,
has_zero_points=quant_config.zero_point)
else:
q_linear = q_linear_module.from_linear(module, quant_config.w_bit, quant_config.q_group_size, True)
q_linear.to(next(layer.parameters()).device)
set_op_by_name(layer, name, q_linear)

torch.cuda.empty_cache()
if not use_qbits:
torch.cuda.empty_cache()
gc.collect()

@staticmethod
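
Taken together, the CPU branch in `from_quantized` above does four things: it disables layer fusion, swaps each eligible `nn.Linear` for an empty `WQLinear_QBits` shell, casts the loaded model to bfloat16 when AMX is available (float32 otherwise), and calls `qbits_post_init` to repack the weights for the QBits kernel. A condensed, illustrative sketch of that flow, which skips the checkpoint loading and the not-quantized-layer filtering the real method performs:

```python
import torch
from awq.modules.linear import WQLinear_QBits, qbits_post_init
from awq.utils.module import get_named_linears, set_op_by_name
from awq.utils.utils import qbits_available


def load_on_cpu_with_qbits(model, quant_config):
    """Illustrative only: mirrors the CPU branch of from_quantized."""
    assert qbits_available, "intel-extension-for-transformers is required for QBits"
    from intel_extension_for_transformers.qbits import check_isa_supported

    # 1) Replace every nn.Linear with an empty WQLinear_QBits shell
    #    (init_only=True), as _load_quantized_modules does per block.
    for name, module in get_named_linears(model).items():
        q_linear = WQLinear_QBits.from_linear(
            module,
            quant_config.w_bit,
            quant_config.q_group_size,
            True,  # init_only: just allocate buffers for the checkpoint to fill
            has_zero_points=quant_config.zero_point,
        )
        set_op_by_name(model, name, q_linear)

    # 2) After the quantized checkpoint has been loaded into those shells,
    #    prefer bfloat16 when AMX is available, fall back to float32 otherwise,
    #    then repack the weights into the layout the QBits kernel expects.
    dtype = torch.bfloat16 if check_isa_supported("AMX") else torch.float32
    model.to(dtype=dtype, device="cpu")
    return qbits_post_init(model)
```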
1 change: 1 addition & 0 deletions awq/modules/linear/__init__.py
@@ -1,6 +1,7 @@
from .exllama import WQLinear_Exllama, exllama_post_init
from .exllamav2 import WQLinear_ExllamaV2, exllamav2_post_init
from .gemm import WQLinear_GEMM
from .gemm_qbits import WQLinear_QBits, qbits_post_init
from .gemv import WQLinear_GEMV
from .marlin import WQLinear_Marlin, marlin_post_init
from .gemv_fast import WQLinear_GEMVFast
155 changes: 155 additions & 0 deletions awq/modules/linear/gemm_qbits.py
@@ -0,0 +1,155 @@
import torch
import torch.nn as nn
from ...utils.packing_utils import reverse_awq_order, unpack_awq

try:
from intel_extension_for_transformers import qbits  # QBits weight-only kernels

QBITS_INSTALLED = True
except Exception:
QBITS_INSTALLED = False

BITS_DTYPE_MAPPING = {
4: "int4_clip",
8: "int8",
}


def convert_dtype_torch2str(dtype):
if dtype == torch.int8:
return "int8"
elif dtype == torch.float:
return "fp32"
elif dtype == torch.float16:
return "fp16"
elif dtype == torch.bfloat16:
return "bf16"
elif isinstance(dtype, str) and dtype in ["int8", "fp32", "fp16", "bf16"]:
return dtype
else:
assert False, "Unsupported pytorch dtype {} to str dtype".format(dtype)


class WQLinear_QBits(nn.Module):

def __init__(self, w_bit, group_size, in_features, out_features, bias, zero_point, dev):
super().__init__()
assert QBITS_INSTALLED, \
"Please install ITREX qbits package with `pip install intel-extension-for-transformers`."

self.use_bf16 = qbits.check_isa_supported("AMX")

if w_bit not in [2, 3, 4, 8]:
raise NotImplementedError("Only 2, 3, 4, 8 bits are supported for now.")

self.in_features = in_features
self.out_features = out_features
self.w_bit = w_bit
self.group_size = group_size if group_size != -1 else in_features
self.zero_point = zero_point
self.scale_dtype = torch.float32

# quick sanity check: group size and packing must divide the dimensions evenly
assert self.in_features % self.group_size == 0
assert out_features % (32 // self.w_bit) == 0
self.pack_num = 32 // self.w_bit

self.register_buffer(
"qzeros",
torch.zeros(
(in_features // self.group_size, out_features // self.pack_num),
dtype=torch.int8,
device=dev,
) if self.zero_point else None,
)
self.register_buffer(
"scales",
torch.zeros(
(in_features // self.group_size, out_features),
dtype=torch.bfloat16 if self.use_bf16 else torch.float32,
device=dev,
))
if bias:
self.register_buffer(
"bias",
torch.zeros((out_features), dtype=torch.bfloat16 if self.use_bf16 else torch.float32, device=dev),
)
else:
self.register_buffer(
"bias",
None,
)
qweight = torch.zeros((in_features, out_features // self.pack_num), dtype=torch.int32, device=dev)
self.register_buffer("qweight", qweight)

def post_init(self):
assert self.qweight.device.type == "cpu"

intweight, zeros = unpack_awq(self.qweight, self.qzeros, self.w_bit) # weight: k x n zeros: k / group_size x n
intweight, zeros = reverse_awq_order(intweight, zeros, self.w_bit) # weight: k x n zeros: k / group_size x n
if self.zero_point:
intweight = torch.bitwise_and(intweight, (2**self.w_bit) - 1) - (2**(self.w_bit - 1))
zeros = torch.bitwise_and(zeros, (2**self.w_bit) - 1) - (2**(self.w_bit - 1))
else:
intweight = torch.bitwise_and(intweight, (2**self.w_bit) - 1)
g_idx = torch.empty(0, dtype=torch.int32)

self.qweight = qbits.repack_quantized_weight(intweight, self.scales.float(), zeros, g_idx,
BITS_DTYPE_MAPPING[self.w_bit],
convert_dtype_torch2str(self.scale_dtype),
convert_dtype_torch2str(self.scales.dtype), self.zero_point,
self.group_size)

@classmethod
def from_linear(cls, linear, w_bit, group_size, init_only=False, scales=None, zeros=None, has_zero_points=False):
awq_linear = cls(
w_bit,
group_size,
linear.in_features,
linear.out_features,
linear.bias is not None,
has_zero_points,
linear.weight.device,
)
if init_only: # just prepare for loading sd
return awq_linear

raise NotImplementedError("Only inference is supported for Exllama kernels")

@torch.no_grad()
def forward(self, x):
assert QBITS_INSTALLED, (
"QBits kernels could not be loaded. "
"Please install with `pip install intel-extension-for-transformers` and "
"refer to the detial https://github.com/intel/intel-extension-for-transformers/blob/main/docs/qbits.md")

input_dtype = x.dtype
out_shape = x.shape[:-1] + (self.out_features,)
x = x.view(-1, x.shape[-1]) # flatten leading dims so the input is 2D
out_2d_shape = x.shape[:-1] + (self.out_features,)

outputs = torch.zeros(out_2d_shape, dtype=input_dtype)
bias = self.bias if self.bias is not None else torch.empty(
0, dtype=torch.bfloat16 if self.use_bf16 else torch.float32)

qbits.woq_linear(x, self.qweight, bias, outputs, convert_dtype_torch2str(input_dtype),
BITS_DTYPE_MAPPING[self.w_bit], convert_dtype_torch2str(self.scale_dtype), self.zero_point)

return outputs.view(out_shape)

def extra_repr(self) -> str:
return ("in_features={}, out_features={}, bias={}, w_bit={}, group_size={}".format(
self.in_features,
self.out_features,
self.bias is not None,
self.w_bit,
self.group_size,
))


def qbits_post_init(model):
for _, submodule in model.named_modules():
if isinstance(submodule, WQLinear_QBits):
submodule.post_init()

return model
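
A minimal single-layer sketch of how `WQLinear_QBits` is meant to be used, assuming `intel-extension-for-transformers` is installed; in real use the packed `qweight`/`qzeros`/`scales` buffers are loaded from an AWQ checkpoint rather than left at the zero values that `from_linear(init_only=True)` allocates:

```python
import torch
import torch.nn as nn
from awq.modules.linear.gemm_qbits import WQLinear_QBits

# Wrap one full-precision layer in the QBits module (shapes only, no real weights).
linear = nn.Linear(4096, 4096, bias=False)
qlinear = WQLinear_QBits.from_linear(
    linear, w_bit=4, group_size=128, init_only=True, has_zero_points=True
)

# post_init() unpacks the AWQ-packed qweight/qzeros, undoes the AWQ ordering,
# and repacks everything with qbits.repack_quantized_weight for woq_linear.
qlinear.post_init()

x = torch.randn(1, 4096, dtype=torch.bfloat16 if qlinear.use_bf16 else torch.float32)
with torch.no_grad():
    y = qlinear(x)  # dispatches to qbits.woq_linear
print(y.shape)  # torch.Size([1, 4096])
```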
3 changes: 2 additions & 1 deletion awq/quantize/quantizer.py
@@ -198,7 +198,8 @@ def _apply_quant(self, module, named_linears: Dict[str, nn.Linear]):

if self.version == "gemm":
scales = scales.t().contiguous()
zeros = zeros.t().contiguous()
if zeros is not None:
zeros = zeros.t().contiguous()
q_linear_module = WQLinear_GEMM

elif self.version == "gemv":