From 012c660699ccb92bfa97916f1a18bed0980eae00 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Sat, 14 Sep 2024 07:00:20 -0400
Subject: [PATCH 01/37] enable new ipex API

ipex weight is 4D so we cannot transpose

fix dequant

check require grad
---
 bitsandbytes/autograd/_functions.py     |  5 +++-
 bitsandbytes/backends/cpu_xpu_common.py | 17 ++++++-----
 bitsandbytes/nn/modules.py              | 26 ++++++++--------
 bitsandbytes/utils.py                   | 40 ++++++++++++-------------
 4 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 59e26ad09..0abd6b6df 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -583,7 +583,10 @@ def matmul_4bit(
             )
             return MatMul4Bit.apply(A, B, out, bias, quant_state)
         else:
-            out = F.gemv_4bit(A, B.t(), out, state=quant_state)
+            if getattr(quant_state, "ipex", False):
+                out = F.gemv_4bit(A, B, out, state=quant_state)
+            else:
+                out = F.gemv_4bit(A, B.t(), out, state=quant_state)
             if bias is not None:
                 out += bias
             return out
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 0d865b541..78473bdc4 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -438,11 +438,11 @@ def dequantize_4bit_impl(
     if quant_state.nested:
         raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU")
 
-    if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and hasattr(quant_state, "op_context"):
-        assert quant_state.op_context is not None
-        A = quant_state.op_context.to_public(quant_state.op_context.get_weight())
-        A = A.reshape(-1)
-        absmax = quant_state.op_context.get_scales().reshape(-1)
+    if ipex_cpu and _ipex_cpu_version_prereq(2, 5) and getattr(quant_state, "ipex", False):
+        A = torch.ops.ipex_prepack.woq_linear_unpack_weight(
+                A, "nf4", quant_state.shape, 2
+            )
+        quant_state.ipex = False
 
     if out is None:
         out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
@@ -510,9 +510,10 @@ def gemm_4bit_impl(
     torch.Tensor:
         GEMM output tensor.
     """
-    if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and hasattr(state, "op_context"):
-        assert state.op_context is not None
-        output = torch.ops.torch_ipex.ipex_woq_linear(A, state.op_context.get_data_handle())
+    if ipex_cpu and _ipex_cpu_version_prereq(2, 5) and getattr(state, "ipex", False):
+        output = torch.ops.torch_ipex.woq_linear(A, B, "nf4", state.shape,
+                    state.new_scales, state.new_zeros, None, None, state.blocksize,
+                    ipex_cpu.quantization.WoqLowpMode.BF16, 1, state.compensation)
     else:
         dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize).t()
         output = torch.matmul(A, dqB.to(A.dtype))
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index ad424a6f4..0635c653d 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -447,20 +447,17 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
         """
         if (
             getattr(self.weight, "quant_state", None) is not None
-            and getattr(self.weight.quant_state, "op_context", None) is not None
+            and getattr(self.weight.quant_state, "ipex", False)
         ):
-            context = self.weight.quant_state.op_context
-            self.weight.data = context.to_public(context.get_weight()).reshape([1, -1])
+            original_weight = torch.ops.ipex_prepack.woq_linear_unpack_weight(
+                self.weight, "nf4", self.weight.quant_state.shape, 2
+            )
+            self.weight.data = original_weight.data
+            self.weight.quant_state.ipex = False
 
         super()._save_to_state_dict(destination, prefix, keep_vars)  # saving weight and bias
 
         if getattr(self.weight, "quant_state", None) is not None:
-            if (
-                self.weight.quant_state.absmax.shape.numel() == 0
-                and getattr(self.weight.quant_state, "op_context", None) is not None
-            ):
-                self.weight.quant_state.absmax = context.get_scales().reshape(-1)
-                delattr(self.weight.quant_state, "op_context")
             for k, v in self.weight.quant_state.as_dict(packed=True).items():
                 destination[prefix + "weight." + k] = v if keep_vars else v.detach()
 
@@ -468,11 +465,12 @@ def forward(self, x: torch.Tensor):
         # Check if ipex fusion can be used
         if (
             x.device.type == "cpu"
-            and not hasattr(self.weight.quant_state, "op_context")
+            and not getattr(self.weight.quant_state, "ipex", False)
             and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
             and self.weight.quant_state.quant_type == "nf4"
+            and x.requires_grad == False
         ):
-            enable_ipex_fusion(self.weight, self.weight.quant_state)
+            enable_ipex_fusion(self)
 
         # weights are cast automatically as Int8Params, but the bias has to be cast manually
         if self.bias is not None and self.bias.dtype != x.dtype:
@@ -499,7 +497,11 @@ def forward(self, x: torch.Tensor):
             x = x.to(self.compute_dtype)
 
         bias = None if self.bias is None else self.bias.to(self.compute_dtype)
-        out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state)
+        if getattr(self.weight.quant_state, "ipex", False):
+            out = bnb.matmul_4bit(x, self.weight, bias=bias, quant_state=self.weight.quant_state)
+        else:
+            out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state)
+
 
         out = out.to(inp_dtype)
 
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index 9e52c915d..b89edd828 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -200,28 +200,28 @@ def unpack_tensor_to_dict(tensor_data):
     return unpacked_dict
 
 
-def enable_ipex_fusion(weight, quant_state):
+def enable_ipex_fusion(linear):
     from bitsandbytes.backends.cpu_xpu_common import _ipex_cpu_version_prereq
 
-    if _ipex_cpu_version_prereq(2, 3):
-        import intel_extension_for_pytorch as ipex
-
-        lowp_mode = ipex.quantization.WoqLowpMode.BF16
-        quant_state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack(
-            weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
-            ipex.quantization.WoqWeightDtype.NF4,
-            quant_state.shape,  # weight shape
-            quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize),  # scales
-            None,  # zero_points
-            None,  # bias
-            None,  # g_idx
-            None,  # batch_size
-            quant_state.blocksize,
-            int(lowp_mode),
-            -1,  # act_quant_mode. -1 means don't quant activation
-        )
-        quant_state.absmax = torch.Tensor()
-        weight.data = torch.empty([1, 0], dtype=torch.uint8)
+    if _ipex_cpu_version_prereq(2, 5):
+        quant_state = linear.weight.quant_state
+        new_weight, new_scales, new_zeros, _, compensation = \
+                torch.ops.ipex_prepack.woq_linear_pack_weight(
+                    linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
+                    "nf4",
+                    quant_state.shape,  # weight shape
+                    quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize),  # scales
+                    None,  # zero_points
+                    None,  # bias
+                    None,  # batch_size
+                    quant_state.blocksize,
+                    2,
+                )
+        linear.weight.data = new_weight.data
+        setattr(linear.weight.quant_state, "ipex", True)
+        setattr(linear.weight.quant_state, "new_scales", new_scales)
+        setattr(linear.weight.quant_state, "new_zeros", new_zeros)
+        setattr(linear.weight.quant_state, "compensation", compensation)
 
 
 class QuantState:

From b8df1aad9414a669e188678b36be304400987a72 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Mon, 23 Sep 2024 10:26:22 -0400
Subject: [PATCH 02/37] use ipex op in backward

---
 bitsandbytes/autograd/_functions.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 0abd6b6df..35c2b45de 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -517,7 +517,10 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState]
 
         # 1. Dequantize
         # 2. MatmulnN
-        output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
+        if getattr(quant_state, "ipex", False):
+            output = F.gemv_4bit(A, B, out, state=quant_state)
+        else:
+            output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
 
         # 3. Save state
         ctx.state = quant_state
@@ -548,7 +551,10 @@ def backward(ctx, grad_output):
         # not supported by PyTorch. TODO: create work-around
         # if req_gradB: grad_B = torch.matmul(grad_output.t(), A)
         if req_gradA:
-            grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t())
+            if getattr(ctx.state, "ipex", False):
+                grad_A = F.gemv_4bit(grad_output, B, None, state=ctx.state)
+            else:
+                grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t())
 
         return grad_A, grad_B, None, grad_bias, None
 
@@ -575,7 +581,7 @@ def matmul_4bit(
     bias=None,
 ):
     assert quant_state is not None
-    if (A.numel() == A.shape[-1] or A.device.type == "cpu") and A.requires_grad == False:
+    if A.numel() == A.shape[-1] and A.device.type != "cpu" and A.requires_grad == False:
         # CPU backend does not require A to be a vector
         if A.shape[-1] % quant_state.blocksize != 0:
             warn(

From cd7bf2145807932c8a8a499ddb6bb14e47eb24fc Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Fri, 27 Sep 2024 12:58:25 -0400
Subject: [PATCH 03/37] enable backward

---
 bitsandbytes/autograd/_functions.py     |  2 +-
 bitsandbytes/backends/cpu.py            |  3 ++-
 bitsandbytes/backends/cpu_xpu_common.py | 12 ++++++++---
 bitsandbytes/functional.py              | 28 ++++++++++++++++++-------
 bitsandbytes/nn/modules.py              |  3 +--
 bitsandbytes/utils.py                   | 24 ++++++++++++++++++---
 6 files changed, 54 insertions(+), 18 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 35c2b45de..06683690c 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -552,7 +552,7 @@ def backward(ctx, grad_output):
         # if req_gradB: grad_B = torch.matmul(grad_output.t(), A)
         if req_gradA:
             if getattr(ctx.state, "ipex", False):
-                grad_A = F.gemv_4bit(grad_output, B, None, state=ctx.state)
+                grad_A = F.gemv_4bit(grad_output, B, None, state=ctx.state, backward=True)
             else:
                 grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t())
 
diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
index 5d38171d5..549808c82 100644
--- a/bitsandbytes/backends/cpu.py
+++ b/bitsandbytes/backends/cpu.py
@@ -163,12 +163,13 @@ def gemv_4bit(
         transposed_A=False,
         transposed_B=False,
         state: QuantState = None,
+        backward=False,
     ) -> torch.Tensor:
         assert_on_cpu([A, B, out])
         if state is None:
             raise ValueError("state cannot be None. gemv_4bit() requires the state from quantize_4bit()")
 
-        return gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state)
+        return gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state, backward)
 
     def dequantize_blockwise(
         self,
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 78473bdc4..c298962a2 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -486,6 +486,7 @@ def gemm_4bit_impl(
     transposed_A=False,
     transposed_B=False,
     state: QuantState = None,
+    backward=False,
 ) -> torch.Tensor:
     """
     Matrix-matrix multiplication with 4-bit quantization.
@@ -511,9 +512,14 @@ def gemm_4bit_impl(
         GEMM output tensor.
     """
     if ipex_cpu and _ipex_cpu_version_prereq(2, 5) and getattr(state, "ipex", False):
-        output = torch.ops.torch_ipex.woq_linear(A, B, "nf4", state.shape,
-                    state.new_scales, state.new_zeros, None, None, state.blocksize,
-                    ipex_cpu.quantization.WoqLowpMode.BF16, 1, state.compensation)
+        if backward:
+            output = torch.ops.torch_ipex.woq_linear(A, state.backward_weight, "nf4", torch.Size([state.shape[1], state.shape[0]]),
+                        state.backward_new_scales, state.backward_new_zeros, None, None, state.blocksize,
+                        ipex_cpu.quantization.WoqLowpMode.BF16, 1, state.backward_compensation)
+        else:
+            output = torch.ops.torch_ipex.woq_linear(A, B, "nf4", state.shape,
+                        state.new_scales, state.new_zeros, None, None, state.blocksize,
+                        ipex_cpu.quantization.WoqLowpMode.BF16, 1, state.compensation)
     else:
         dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize).t()
         output = torch.matmul(A, dqB.to(A.dtype))
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 6cf64df28..b53212bfd 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1530,16 +1530,28 @@ def gemv_4bit(
     transposed_A=False,
     transposed_B=False,
     state=None,
+    backward=False,
 ):
     ensure_backend_is_available(A.device.type)
-    return backends[A.device.type].gemv_4bit(
-        A,
-        B,
-        out=out,
-        transposed_A=transposed_A,
-        transposed_B=transposed_B,
-        state=state,
-    )
+    if A.device.type == "cpu":
+        return backends[A.device.type].gemv_4bit(
+            A,
+            B,
+            out=out,
+            transposed_A=transposed_A,
+            transposed_B=transposed_B,
+            state=state,
+            backward=backward,
+        )
+    else:
+        return backends[A.device.type].gemv_4bit(
+            A,
+            B,
+            out=out,
+            transposed_A=transposed_A,
+            transposed_B=transposed_B,
+            state=state,
+        )
 
 
 def igemm(
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 0635c653d..dc00acdaf 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -468,9 +468,8 @@ def forward(self, x: torch.Tensor):
             and not getattr(self.weight.quant_state, "ipex", False)
             and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
             and self.weight.quant_state.quant_type == "nf4"
-            and x.requires_grad == False
         ):
-            enable_ipex_fusion(self)
+            enable_ipex_fusion(self, x.requires_grad)
 
         # weights are cast automatically as Int8Params, but the bias has to be cast manually
         if self.bias is not None and self.bias.dtype != x.dtype:
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index b89edd828..e0810a6e8 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -200,23 +200,41 @@ def unpack_tensor_to_dict(tensor_data):
     return unpacked_dict
 
 
-def enable_ipex_fusion(linear):
+def enable_ipex_fusion(linear, grad=False):
     from bitsandbytes.backends.cpu_xpu_common import _ipex_cpu_version_prereq
 
     if _ipex_cpu_version_prereq(2, 5):
         quant_state = linear.weight.quant_state
         new_weight, new_scales, new_zeros, _, compensation = \
+            torch.ops.ipex_prepack.woq_linear_pack_weight(
+                linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
+                "nf4",
+                quant_state.shape,  # weight shape
+                quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize),  # scales
+                None,  # zero_points
+                None,  # bias
+                None,  # batch_size
+                quant_state.blocksize,
+                2,
+            )
+        if grad or True:
+            backward_new_weight, backward_new_scales, backward_new_zeros, _, backward_compensation = \
                 torch.ops.ipex_prepack.woq_linear_pack_weight(
-                    linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
+                    linear.weight.t().data.reshape([quant_state.shape[1], quant_state.shape[0] // 2]),
                     "nf4",
                     quant_state.shape,  # weight shape
-                    quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize),  # scales
+                    quant_state.absmax.view(quant_state.shape[1], quant_state.shape[0] // quant_state.blocksize),  # scales
                     None,  # zero_points
                     None,  # bias
                     None,  # batch_size
                     quant_state.blocksize,
                     2,
                 )
+            setattr(linear.weight.quant_state, "backward_weight", backward_new_weight)
+            setattr(linear.weight.quant_state, "backward_new_scales", backward_new_scales)
+            setattr(linear.weight.quant_state, "backward_new_zeros", backward_new_zeros)
+            setattr(linear.weight.quant_state, "backward_compensation", backward_compensation)
+
         linear.weight.data = new_weight.data
         setattr(linear.weight.quant_state, "ipex", True)
         setattr(linear.weight.quant_state, "new_scales", new_scales)

From 5e1901967d6796f192d9817a35b5880498b787fa Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Tue, 15 Oct 2024 11:02:57 +0800
Subject: [PATCH 04/37] Multi backend refactor (#8)

* AMD: Clarify diagnostic messages; free up disk space for CI build

* Add build job for rocm

* Add rocm build script

* Copy shared obj file into output_dir

* upload build artifacts and enable wheels build

* Remove cuda build temporarily

* Add ROCm version to .so filename

* Add rocm_version to whls build

* Revert "Remove cuda build temporarily"

This reverts commit 1413c5f3a2aed51140b86daa8ee9283c67cce738.

* Add rocm_version env var

* Remove thrush header files

* Print node info

* print cuda node info

* Revert "print cuda node info"

This reverts commit cdb209a2eb896d9c4166f53e9b2aa580c10e42c0.

* Revert "Print node info"

This reverts commit 7e9a65c33f66fffcb14ee2438170718777c06022.

* Add rocm arch to compile command

* Rename .so files to rocm

* Update default gpu arch

* Skip cpu based igemmlt int tests on ROCm

* Update Documentation

* Update upstream repo name

* Update docs

* Update string format

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Remove pre-release option for torch install

* Update pytorch install path

Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>

* Add messages for Heuristics error

* Remove toolcache for disk space

* print disk usage

* Clean disk space for linux

* Fix for ubuntu

* Add sudo for apt clean

* Update clean up disk list

* remove disk usage print

* Add BNB_BACKEND variable

* Update diagnostic functions for ROCm

* Fix tuple error

* Fix library detection bug for recursive and symlink cases

* fix pre-commit errors

* Remove recursive path lib search

* Create function for runtime lib patterns

* Update logger format

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Remove commented code

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

* Create hip diagnostics functions

* Fix Typo

* Fix pre-commit checks

---------

Co-authored-by: Aarni Koskela <akx@iki.fi>
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>

* check grad before using ipex (#1358)

* Enable packaging for ROCm 6.2 (#1367)

* Enable 6.2 build

* Update documentation for 6.2.0 pip install

* Update for VS2022 17.11 compatibility with CUDA < 12.4 (#1341)

* Update for VS2022 17.11 compatibility with CUDA < 12.4

* Try again

* Enable continuous releases for multi-backend-refactor branch

* Update release workflow

* Publish continuous release for multi-backend

* continuous release: revert wheel renaming due to install err

* Revert "continuous release: revert wheel renaming due to install err"

This reverts commit 0a2b5392ff079645fdc9ff887f80d327f9e874f7.

* add dynamic tag-based versioning + git hash for dev vers

* docs: update w/ changes from `main`

* get tags for dynamic versioning

* fine-tune continuous release params

* reduce the pkg size + build times for the preview release

* refine docs for multi-backend alpha release (#1380)

* refine docs for multi-backend alpha release

* docs: further tweaks to multi-backend alpha docs

* docs: further tweaks to multi-backend alpha docs

* docs: further tweaks to multi-backend alpha docs

* docs: add multi-backend feedback links

* docs: add request for contributions

* docs: small fixes

* docs: small fixes

* docs: add info about `main` continuous build

* docs: further tweaks to multi-backend alpha docs

* docs: further tweaks to multi-backend alpha docs

* docs: remove 2 obsolete lines

---------

Co-authored-by: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Co-authored-by: Aarni Koskela <akx@iki.fi>
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Co-authored-by: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
---
 .github/workflows/python-package.yml |  75 ++++++++-
 .gitignore                           |   2 +
 CMakeLists.txt                       |   5 +
 bitsandbytes/__init__.py             |   5 +-
 bitsandbytes/cextension.py           |  11 +-
 bitsandbytes/diagnostics/cuda.py     |  89 +++++++++--
 bitsandbytes/diagnostics/main.py     |  31 ++--
 bitsandbytes/nn/modules.py           |   1 +
 csrc/ops.hip                         |  26 ++--
 docs/source/contributing.mdx         |   5 +-
 docs/source/installation.mdx         | 225 +++++++++++++++++++++------
 docs/source/non_cuda_backends.mdx    |  19 ++-
 setup.py                             |  32 +++-
 tests/test_functional.py             |   1 +
 14 files changed, 432 insertions(+), 95 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 91e6d82a6..6a2b3f63e 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -58,6 +58,7 @@ jobs:
   # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
   ##
   build-shared-libs-cuda:
+    if: github.ref_name != 'multi-backend-refactor'
     strategy:
       matrix:
         os: [ubuntu-latest, windows-latest]
@@ -107,7 +108,7 @@ jobs:
         os: [ubuntu-latest]
         arch: [x86_64]
         rocm_version:
-          ["6.1.2"]
+          ["6.1.2", "6.2"]
     runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
     steps:
       - uses: actions/checkout@v4
@@ -116,10 +117,23 @@ jobs:
         uses: docker/setup-qemu-action@v2
       - name: Clean up disk space
         run: |
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf "/usr/local/share/boost"
-          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+          sudo rm -rf \
+              /usr/share/dotnet \
+              /opt/ghc \
+              "/usr/local/share/boost" \
+              "$AGENT_TOOLSDIRECTORY" \
+              /opt/hostedtoolcache \
+              /opt/google/chrome \
+              /opt/microsoft/msedge \
+              /opt/microsoft/powershell \
+              /opt/pipx \
+              /usr/lib/mono \
+              /usr/local/julia* \
+              /usr/local/lib/android \
+              /usr/local/lib/node_modules \
+              /usr/local/share/chromium \
+              /usr/local/share/powershell \
+              /usr/share/swift
       - name: Build C++
         run: bash .github/scripts/build-rocm.sh
         env:
@@ -135,7 +149,7 @@ jobs:
   build-wheels:
     needs:
       - build-shared-libs
-      - build-shared-libs-cuda
+      # - build-shared-libs-cuda  reduce the pkg size + build times for the preview release
       - build-shared-libs-rocm
     strategy:
       matrix:
@@ -153,6 +167,13 @@ jobs:
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 1  # shallow clone
+      - name: Fetch tags for dynamic versioning in setup.py
+        run: |
+          git fetch --depth=1 origin --tags
+          echo "Available Git tags:"
+          git tag -n
       - name: Download build artifact
         uses: actions/download-artifact@v4
         with:
@@ -170,7 +191,8 @@ jobs:
           python-version: ${{ matrix.python-version }}
           cache: pip
       - run: pip install build wheel
-      - run: python -m build .
+      # for now need to do the below instead of prior `python -m build .`, which didn't allow us to access git tags
+      - run: python -m build --sdist && python -m build --wheel
       - name: Determine and Set Platform Tag, then Tag Wheel
         shell: bash
         run: |
@@ -184,6 +206,45 @@ jobs:
           path: dist/bitsandbytes-*.whl
           retention-days: 7
 
+  upload-pre-release-wheels:
+    name: Create release and upload artifacts
+    runs-on: ubuntu-latest
+    if: github.ref_name == 'multi-backend-refactor'
+    permissions:
+      contents: write
+    needs:
+      - build-wheels
+    steps:
+      - name: Download and rename artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: tmp/
+          pattern: "bdist_wheel_*"
+          merge-multiple: true
+      - name: Inspect tmp directory after downloading artifacts
+        run: ls -alFR tmp/
+      - name: Move and rename wheel files with pattern replacement
+        run: |
+          mkdir -p wheels/
+          find tmp/ -type f -name '*.whl' -print0 | while IFS= read -r -d '' wheel; do
+            wheel_filename=$(basename "$wheel")
+            # Remove the gith hash, e.g. `+1234567`, for a stable download link on the multi-backend pre-release
+            cleaned_filename=$(echo "$wheel_filename" | sed -E 's/\+[0-9a-f]{7}-/-/g')
+            mv "$wheel" "wheels/$cleaned_filename"
+          done
+      - name: Inspect wheels directory after renaming files
+        run: ls -alFR wheels/
+      - name: Create release and upload artifacts
+        uses: softprops/action-gh-release@v2.0.8
+        with:
+          files: wheels/*.whl
+          prerelease: true
+          name: Multi-Backend Preview
+          tag_name: continuous-release_multi-backend-refactor
+          make_latest: false
+          draft: false
+          target_commitish: ${{ github.sha }}
+
   audit-wheels:
     needs: build-wheels
     runs-on: ubuntu-latest
diff --git a/.gitignore b/.gitignore
index 22f5a6cd6..cd1b797bb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -151,6 +151,8 @@ dmypy.json
 # vim
 *.swp
 
+# BNB-specific stuff
 dependencies
 cuda_build
 output/
+bitsandbytes/_version.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index eac72fe52..315e0ff1b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -82,6 +82,11 @@ if(BUILD_CUDA)
     # This needs to be added *before* we try to enable the CUDA language so CMake's compiler check passes.
     if(MSVC AND MSVC_VERSION VERSION_GREATER_EQUAL 1940)
         string(APPEND CMAKE_CUDA_FLAGS " --allow-unsupported-compiler")
+
+        # This is needed to build with VS2022 17.11+ and CUDA < 12.4.
+        if (MSVC_VERSION VERSION_GREATER_EQUAL 1941)
+            string(APPEND CMAKE_CUDA_FLAGS " -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH")
+        endif()
     endif()
 
     enable_language(CUDA) # This will fail if CUDA is not found
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 1e638eb79..25ec8a79a 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -3,6 +3,9 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
+# Import the dynamically generated version from _version.py  (see setup.py)
+from ._version import __version__  # isort: skip # type: ignore
+
 import torch
 
 from . import research, utils
@@ -73,5 +76,3 @@
     "optim.optimizer.Optimizer8bit": False,
     "optim.optimizer.MockArgs": False,
 }
-
-__version__ = "0.43.3.dev"
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 6c18275c6..cc5d8deff 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -99,7 +99,7 @@ def get_native_library() -> BNBNativeLibrary:
         if cuda_binary_path.exists():
             binary_path = cuda_binary_path
         else:
-            logger.warning("Could not find the bitsandbytes CUDA binary at %r", cuda_binary_path)
+            logger.warning("Could not find the bitsandbytes %s binary at %r", BNB_BACKEND, cuda_binary_path)
     logger.debug(f"Loading bitsandbytes native library from: {binary_path}")
     dll = ct.cdll.LoadLibrary(str(binary_path))
 
@@ -116,21 +116,24 @@ def get_native_library() -> BNBNativeLibrary:
         hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
         HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
         BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"
+        BNB_BACKEND = "ROCm"
     else:
         HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
         BNB_HIP_VERSION_SHORT = ""
+        BNB_BACKEND = "CUDA"
+
     lib = get_native_library()
 except Exception as e:
     lib = None
     logger.error(f"Could not load bitsandbytes native library: {e}", exc_info=True)
     if torch.cuda.is_available():
         logger.warning(
-            """
-CUDA Setup failed despite CUDA being available. Please run the following command to get more information:
+            f"""
+{BNB_BACKEND} Setup failed despite {BNB_BACKEND} being available. Please run the following command to get more information:
 
 python -m bitsandbytes
 
-Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
+Inspect the output of the command and see if you can locate {BNB_BACKEND} libraries. You might need to add them
 to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
 and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues
 """,
diff --git a/bitsandbytes/diagnostics/cuda.py b/bitsandbytes/diagnostics/cuda.py
index 8974c6400..014b753a9 100644
--- a/bitsandbytes/diagnostics/cuda.py
+++ b/bitsandbytes/diagnostics/cuda.py
@@ -5,7 +5,7 @@
 
 import torch
 
-from bitsandbytes.cextension import get_cuda_bnb_library_path
+from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
 from bitsandbytes.consts import NONPYTORCH_DOC_URL
 from bitsandbytes.cuda_specs import CUDASpecs
 from bitsandbytes.diagnostics.utils import print_dedented
@@ -32,15 +32,20 @@
     "_",  # current Python interpreter
 }
 
-CUDA_RUNTIME_LIB_PATTERNS = (
-    "cudart64*.dll",  # Windows
-    "libcudart*.so*",  # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
-    "nvcuda*.dll",  # Windows
-)
-
 logger = logging.getLogger(__name__)
 
 
+def get_runtime_lib_patterns() -> tuple:
+    if HIP_ENVIRONMENT:
+        return ("libamdhip64.so*",)
+    else:
+        return (
+            "cudart64*.dll",  # Windows
+            "libcudart*.so*",  # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
+            "nvcuda*.dll",  # Windows
+        )
+
+
 def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path]:
     for dir_string in paths_list_candidate.split(os.pathsep):
         if not dir_string:
@@ -55,9 +60,9 @@ def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path
                     continue
             except OSError:  # Assume an esoteric error trying to poke at the directory
                 pass
-            for lib_pattern in CUDA_RUNTIME_LIB_PATTERNS:
+            for lib_pattern in get_runtime_lib_patterns():
                 for pth in dir.glob(lib_pattern):
-                    if pth.is_file():
+                    if pth.is_file() and not pth.is_symlink():
                         yield pth
         except (OSError, PermissionError):
             pass
@@ -104,7 +109,7 @@ def find_cudart_libraries() -> Iterator[Path]:
         yield from find_cuda_libraries_in_path_list(value)
 
 
-def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
+def _print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
     print(
         f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, "
         f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.",
@@ -149,10 +154,40 @@ def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
     # (2) Multiple CUDA versions installed
 
 
-def print_cuda_runtime_diagnostics() -> None:
+def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None:
+    print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")
+
+    binary_path = get_cuda_bnb_library_path(cuda_specs)
+    if not binary_path.exists():
+        print_dedented(
+            f"""
+        Library not found: {binary_path}.
+        Maybe you need to compile it from source? If you compiled from source, check that ROCM_VERSION
+        in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version
+        and rebuild bitsandbytes.
+        """,
+        )
+
+    hip_major, hip_minor = cuda_specs.cuda_version_tuple
+    if (hip_major, hip_minor) < (6, 1):
+        print_dedented(
+            """
+            WARNING: bitsandbytes is fully supported only from ROCm 6.1.
+            """,
+        )
+
+
+def print_diagnostics(cuda_specs: CUDASpecs) -> None:
+    if HIP_ENVIRONMENT:
+        _print_hip_diagnostics(cuda_specs)
+    else:
+        _print_cuda_diagnostics(cuda_specs)
+
+
+def _print_cuda_runtime_diagnostics() -> None:
     cudart_paths = list(find_cudart_libraries())
     if not cudart_paths:
-        print("CUDA SETUP: WARNING! CUDA runtime files not found in any environmental path.")
+        print("WARNING! CUDA runtime files not found in any environmental path.")
     elif len(cudart_paths) > 1:
         print_dedented(
             f"""
@@ -174,3 +209,33 @@ def print_cuda_runtime_diagnostics() -> None:
         )
         for pth in cudart_paths:
             print(f"* Found CUDA runtime at: {pth}")
+
+
+def _print_hip_runtime_diagnostics() -> None:
+    cudart_paths = list(find_cudart_libraries())
+    if not cudart_paths:
+        print("WARNING! ROCm runtime files not found in any environmental path.")
+    elif len(cudart_paths) > 1:
+        print_dedented(
+            f"""
+            Found duplicate ROCm runtime files (see below).
+
+            We select the PyTorch default ROCm runtime, which is {torch.version.hip},
+            but this might mismatch with the ROCm version that is needed for bitsandbytes.
+
+            To resolve it, install PyTorch built for the ROCm version you want to use
+
+            and set LD_LIBRARY_PATH to your ROCm install path, e.g.
+            export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm-6.1.2/lib,
+            """,
+        )
+
+        for pth in cudart_paths:
+            print(f"* Found ROCm runtime at: {pth}")
+
+
+def print_runtime_diagnostics() -> None:
+    if HIP_ENVIRONMENT:
+        _print_hip_runtime_diagnostics()
+    else:
+        _print_cuda_runtime_diagnostics()
diff --git a/bitsandbytes/diagnostics/main.py b/bitsandbytes/diagnostics/main.py
index 1ce096f69..8dc43ed2a 100644
--- a/bitsandbytes/diagnostics/main.py
+++ b/bitsandbytes/diagnostics/main.py
@@ -3,11 +3,12 @@
 
 import torch
 
+from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT
 from bitsandbytes.consts import PACKAGE_GITHUB_URL
 from bitsandbytes.cuda_specs import get_cuda_specs
 from bitsandbytes.diagnostics.cuda import (
-    print_cuda_diagnostics,
-    print_cuda_runtime_diagnostics,
+    print_diagnostics,
+    print_runtime_diagnostics,
 )
 from bitsandbytes.diagnostics.utils import print_dedented, print_header
 
@@ -16,12 +17,13 @@ def sanity_check():
     from bitsandbytes.cextension import lib
 
     if lib is None:
+        compute_backend = "cuda" if not HIP_ENVIRONMENT else "hip"
         print_dedented(
-            """
+            f"""
             Couldn't load the bitsandbytes library, likely due to missing binaries.
             Please ensure bitsandbytes is properly installed.
 
-            For source installations, compile the binaries with `cmake -DCOMPUTE_BACKEND=cuda -S .`.
+            For source installations, compile the binaries with `cmake -DCOMPUTE_BACKEND={compute_backend} -S .`.
             See the documentation for more details if needed.
 
             Trying a simple check anyway, but this will likely fail...
@@ -49,19 +51,24 @@ def main():
 
     print_header("OTHER")
     cuda_specs = get_cuda_specs()
-    print("CUDA specs:", cuda_specs)
+    if HIP_ENVIRONMENT:
+        rocm_specs = f" rocm_version_string='{cuda_specs.cuda_version_string}',"
+        rocm_specs += f" rocm_version_tuple={cuda_specs.cuda_version_tuple}"
+        print(f"{BNB_BACKEND} specs:{rocm_specs}")
+    else:
+        print(f"{BNB_BACKEND} specs:{cuda_specs}")
     if not torch.cuda.is_available():
-        print("Torch says CUDA is not available. Possible reasons:")
-        print("1. CUDA driver not installed")
-        print("2. CUDA not installed")
-        print("3. You have multiple conflicting CUDA libraries")
+        print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
+        print(f"1. {BNB_BACKEND} driver not installed")
+        print(f"2. {BNB_BACKEND} not installed")
+        print(f"3. You have multiple conflicting {BNB_BACKEND} libraries")
     if cuda_specs:
-        print_cuda_diagnostics(cuda_specs)
-    print_cuda_runtime_diagnostics()
+        print_diagnostics(cuda_specs)
+    print_runtime_diagnostics()
     print_header("")
     print_header("DEBUG INFO END")
     print_header("")
-    print("Checking that the library is importable and CUDA is callable...")
+    print(f"Checking that the library is importable and {BNB_BACKEND} is callable...")
     try:
         sanity_check()
         print("SUCCESS!")
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index dc00acdaf..e8fc53253 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -468,6 +468,7 @@ def forward(self, x: torch.Tensor):
             and not getattr(self.weight.quant_state, "ipex", False)
             and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
             and self.weight.quant_state.quant_type == "nf4"
+            and x.requires_grad == False
         ):
             enable_ipex_fusion(self, x.requires_grad)
 
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 157e84629..4fdc3cbfa 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -576,6 +576,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
       if (returnedAlgoCount == 0)
       {
         has_error = 1;
+        fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
       }
       else
       {
@@ -614,18 +615,25 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
                                                             heuristicResult,
                                                             &returnedAlgoCount));
 
-      if(!SCALE_ROWS)
+      if (returnedAlgoCount == 0)
       {
-        float alpha = 1.0f, beta = 0.0f;
-
-        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
+        has_error = 1;
+        fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
       }
       else
       {
-        //has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, hipblasLt_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
-	float beta = 0.0f;
-
-        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
+        if(!SCALE_ROWS)
+        {
+          float alpha = 1.0f, beta = 0.0f;
+
+          has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
+        }
+        else
+        {
+          float beta = 0.0f;
+
+          has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
+        }
       }
     }
 
@@ -635,7 +643,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
     if (Adesc) has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(Adesc));
     if (matmulDesc) has_error |= checkHipblasStatus(hipblasLtMatmulDescDestroy(matmulDesc));
     if(has_error == 1)
-      printf("error detected");
+      fprintf(stderr, "error detected\n");
 
     return has_error;
 #endif // NO_HIPBLASLT
diff --git a/docs/source/contributing.mdx b/docs/source/contributing.mdx
index 4fe6b7541..5da42961e 100644
--- a/docs/source/contributing.mdx
+++ b/docs/source/contributing.mdx
@@ -5,8 +5,9 @@
 
 ### Setup pre-commit hooks
 - Install pre-commit hooks with `pip install pre-commit`.
-- Run `pre-commit autoupdate` once to configure the hooks.
-- Re-run `pre-commit autoupdate` every time a new hook got added.
+- Run `pre-commit install` once to install the hooks, so they will be run on every commit.
+- If the hooks introduce changes, they'll be visible with `git diff`. Review them and `git add` them if everything is fine, then re-execute the before commit, it should pass now.
+- If you want to manually trigger the hooks, you may do `pre-commit run --all-files`
 
 Now all the pre-commit hooks will be automatically run when you try to commit and if they introduce some changes, you need to re-add the changed files before being able to commit and push.
 
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 60419b38a..609865436 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -1,29 +1,45 @@
-# Installation
+# Installation Guide
 
-## CUDA
+Welcome to the installation guide for the `bitsandbytes` library! This document provides step-by-step instructions to install `bitsandbytes` across various platforms and hardware configurations. The library primarily supports CUDA-based GPUs, but the team is actively working on enabling support for additional backends like AMD ROCm, Intel, and Apple Silicon.
 
-bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's a multi-backend effort under way which is currently in alpha release, check [the respective section below in case you're interested to help us with early feedback](#multi-backend).
+> [!TIP]
+> For a high-level overview of backend support and compatibility, see the [Multi-backend Support](#multi-backend) section.
 
-The latest version of bitsandbytes builds on:
+## Table of Contents
 
-| OS | CUDA | Compiler |
-|---|---|---|
-| Linux | 11.7 - 12.3 | GCC 11.4 |
-|  | 12.4+ | GCC 13.2 |
-| Windows | 11.7 - 12.4 | MSVC 19.38+ (VS2022 17.8.0+) |
+- [CUDA](#cuda)
+  - [Installation via PyPI](#cuda-pip)
+  - [Compile from Source](#cuda-compile)
+- [Multi-backend Support (Alpha Release)](#multi-backend)
+  - [Supported Backends](#multi-backend-supported-backends)
+  - [Pre-requisites](#multi-backend-pre-requisites)
+  - [Installation](#multi-backend-pip)
+  - [Compile from Source](#multi-backend-compile)
+- [PyTorch CUDA Versions](#pytorch-cuda-versions)
 
-> [!TIP]
-> MacOS support is still a work in progress! Subscribe to this [issue](https://github.com/TimDettmers/bitsandbytes/issues/1020) to get notified about discussions and to track the integration progress.
+## CUDA[[cuda]]
 
-For Linux systems, make sure your hardware meets the following requirements to use bitsandbytes features.
+`bitsandbytes` is currently only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out [the multi-backend section below](#multi-backend).
 
-| **Feature** | **Hardware requirement** |
-|---|---|
-| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or Ampere (RTX 30 series, A4-A100) GPUs |
-| 8-bit optimizers/quantization | NVIDIA Kepler (GTX 780 or newer) |
+### Supported CUDA Configurations[[cuda-pip]]
+
+The latest version of `bitsandbytes` builds on the following configurations:
+
+| **OS**      | **CUDA Version** | **Compiler**         |
+|-------------|------------------|----------------------|
+| **Linux**   | 11.7 - 12.3      | GCC 11.4             |
+|             | 12.4+            | GCC 13.2             |
+| **Windows** | 11.7 - 12.4      | MSVC 19.38+ (VS2022) |
+
+For Linux systems, ensure your hardware meets the following requirements:
+
+| **Feature**                     | **Hardware Requirement**                                           |
+|---------------------------------|--------------------------------------------------------------------|
+| LLM.int8()                      | NVIDIA Turing (RTX 20 series, T4) or Ampere (RTX 30 series, A4-A100) GPUs |
+| 8-bit optimizers/quantization   | NVIDIA Kepler (GTX 780 or newer)                                    |
 
 > [!WARNING]
-> bitsandbytes >= 0.39.1 no longer includes Kepler binaries in pip installations. This requires manual compilation, and you should follow the general steps and use `cuda11x_nomatmul_kepler` for Kepler-targeted compilation.
+> `bitsandbytes >= 0.39.1` no longer includes Kepler binaries in pip installations. This requires [manual compilation using](#cuda-compile) the `cuda11x_nomatmul_kepler` configuration.
 
 To install from PyPI.
 
@@ -31,14 +47,41 @@ To install from PyPI.
 pip install bitsandbytes
 ```
 
-### Compile from source[[compile]]
+### `pip install` pre-built wheel from latest `main` commit
+
+If you would like to use new feature even before they are officially released and help us test them, feel free to install the wheel directly from our CI (*the wheel links will remain stable!*):
+
+<hfoptions id="OS">
+<hfoption id="Linux">
+
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-0.44.2.dev0-py3-none-manylinux_2_24_x86_64.whl'
+```
+
+</hfoption>
+<hfoption id="Windows">
+
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-macosx_13_1_arm64.whl'
+```
+</hfoption>
+</hfoptions>
+
+### Compile from source[[cuda-compile]]
+
+> [!TIP]
+> Don't hesitate to compile from source! The process is pretty straight forward and resilient. This might be needed for older CUDA versions or other less common configurations, which we don't support out of the box due to package size.
 
-For Linux and Windows systems, you can compile bitsandbytes from source. Installing from source allows for more build options with different CMake configurations.
+For Linux and Windows systems, compiling from source allows you to customize the build configurations. See below for detailed platform-specific instructions (see the `CMakeLists.txt` if you want to check the specifics and explore some additional options):
 
 <hfoptions id="source">
 <hfoption id="Linux">
 
-To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu:
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.).
+
+For example, to install a compiler and CMake on Ubuntu:
 
 ```bash
 apt-get install -y build-essential cmake
@@ -48,16 +91,16 @@ You should also install CUDA Toolkit by following the [NVIDIA CUDA Installation
 
 Refer to the following table if you're using another CUDA Toolkit version.
 
-| CUDA Toolkit | GCC |
-|---|---|
-| >= 11.4.1 | >= 11 |
-| >= 12.0 | >= 12 |
-| >= 12.4 | >= 13 |
+| CUDA Toolkit |  GCC  |
+|--------------|-------|
+| >= 11.4.1    | >= 11 |
+| >= 12.0      | >= 12 |
+| >= 12.4      | >= 13 |
 
 Now to install the bitsandbytes package from source, run the following commands:
 
 ```bash
-git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
 pip install -r requirements-dev.txt
 cmake -DCOMPUTE_BACKEND=cuda -S .
 make
@@ -81,7 +124,7 @@ Refer to the following table if you're using another CUDA Toolkit version.
 | >= 11.6 | 19.30+ (VS2022) |
 
 ```bash
-git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
 pip install -r requirements-dev.txt
 cmake -DCOMPUTE_BACKEND=cuda -S .
 cmake --build . --config Release
@@ -93,7 +136,7 @@ Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com
 </hfoption>
 </hfoptions>
 
-### PyTorch CUDA versions
+### PyTorch CUDA versions[[pytorch-cuda-versions]]
 
 Some bitsandbytes features may need a newer CUDA version than the one currently supported by PyTorch binaries from Conda and pip. In this case, you should follow these instructions to load a precompiled bitsandbytes binary.
 
@@ -105,7 +148,7 @@ Some bitsandbytes features may need a newer CUDA version than the one currently
 Then locally install the CUDA version you need with this script from bitsandbytes:
 
 ```bash
-wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
+wget https://raw.githubusercontent.com/bitsandbytes-foundation/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
 #   CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125}
 #   EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
@@ -134,28 +177,62 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
 
 3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded.
 
-## Multi-backend[[multi-backend]]
+## Multi-backend Support (Alpha Release)[[multi-backend]]
 
 > [!TIP]
-> This functionality is currently in preview and therefore not yet production-ready!
+> This functionality is currently in preview and not yet production-ready. We very much welcome community feedback, contributions and leadership on topics like Apple Silicon as well as other less common accellerators! For more information, see [this guide on multi-backend support](./non_cuda_backends).
+
+**Link to give us feedback** (bugs, install issues, perf results, requests, etc.)**:**
+
+<hfoptions id="platform">
+<hfoption id="ROCm">
+
+[**Multi-backend refactor: Alpha release (AMD ROCm ONLY)**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1339)
+
+</hfoption>
+<hfoption id="Intel CPU+GPU">
+
+[**Multi-backend refactor: Alpha release (INTEL ONLY)**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1338)
+
+</hfoption>
+<hfoption id="Apple Silicon / Metal (MPS)">
 
-Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA:
+[**Github Discussion space on coordinating the kickoff of MPS backend development**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1340)
 
-### Pip install the pre-built wheel (recommended for most)
+</hfoption>
+</hfoptions>
 
-WIP (will be added in the coming days)
+### Supported Backends[[multi-backend-supported-backends]]
 
-### Compilation
+| **Backend** | **Supported Versions** | **Python versions** | **Architecture Support** | **Status** |
+|-------------|------------------------|---------------------------|-------------------------|------------|
+| **AMD ROCm** | 6.1+                   | 3.10+                     | minimum CDNA - `gfx90a`, RDNA - `gfx1100` | Alpha      |
+| **Apple Silicon (MPS)** | WIP                        | 3.10+                     | M1/M2 chips                    | Planned    |
+| **Intel CPU** | v2.4.0+ (`ipex`)         | 3.10+                     | Intel CPU | Alpha |
+| **Intel GPU** | v2.4.0+ (`ipex`)         | 3.10+                     | Intel GPU | Experimental |
+
+For each supported backend, follow the respective instructions below:
+
+### Pre-requisites[[multi-backend-pre-requisites]]
+
+To use bitsandbytes non-CUDA backends, be sure to install:
+
+```
+pip install "transformers>=4.45.1"
+```
 
 <hfoptions id="backend">
 <hfoption id="AMD ROCm">
 
-#### AMD GPU
-
-bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
+> [!WARNING]
+> Pre-compiled binaries are only built for ROCm versions `6.1.0`/`6.1.1`/`6.1.2`/`6.2.0` and `gfx90a`, `gfx942`, `gfx1100` GPU architectures. [Find the pip install instructions here](#multi-backend-pip).
+>
+> Other supported versions that don't come with pre-compiled binaries [can be compiled for with these instructions](#multi-backend-compile).
+>
+> **Windows is not supported for the ROCm backend**; also not WSL2 to our knowledge.
 
 > [!TIP]
-> If you would like to install ROCm and PyTorch on bare metal, skip Docker steps and refer to our official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Please make sure to get PyTorch wheel for the installed ROCm version.
+> If you would like to install ROCm and PyTorch on bare metal, skip the Docker steps and refer to ROCm's official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Special note: please make sure to get the respective ROCm-specific PyTorch wheel for the installed ROCm version, e.g. `https://download.pytorch.org/whl/nightly/rocm6.2/`!
 
 ```bash
 # Create a docker container with latest ROCm image, which includes ROCm libraries
@@ -165,12 +242,70 @@ apt-get update && apt-get install -y git && cd home
 
 # Install pytorch compatible with above ROCm version
 pip install torch --index-url https://download.pytorch.org/whl/rocm6.1/
+```
 
-# Install bitsandbytes from PyPI
-# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2 and gpu arch - gfx90a, gfx942, gfx1100
-# Please install from source if your configuration doesn't match with these)
-pip install bitsandbytes
+</hfoption>
+<hfoption id="Intel CPU + GPU">
+
+Compatible hardware and functioning `import intel_extension_for_pytorch as ipex` capable environment with Python `3.10` as the minimum requirement.
+
+Please refer to [the official Intel installations instructions](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.4.0%2bcpu&os=linux%2fwsl2) for guidance on how to pip install the necessary `intel_extension_for_pytorch` dependency.
+
+</hfoption>
+<hfoption id="Apple Silicon (MPS)">
+
+> [!TIP]
+> Apple Silicon support is still a WIP. Please visit and write us in [this Github Discussion space on coordinating the kickoff of MPS backend development](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1340) and coordinate a community-led effort to implement this backend.
+
+</hfoption>
+</hfoptions>
+
+### Installation
+
+You can install the pre-built wheels for each backend, or compile from source for custom configurations.
+
+#### Pre-built Wheel Installation (recommended)[[multi-backend-pip]]
+
+<hfoptions id="platform">
+<hfoption id="Linux">
+
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl'
+```
+
+</hfoption>
+<hfoption id="Windows">
 
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-win_amd64.whl'
+```
+
+</hfoption>
+<hfoption id="Mac">
+
+> [!WARNING]
+> bitsandbytes does not yet support Apple Silicon / Metal with a dedicated backend. However, the build infrastructure is in place and the below pip install will eventually provide Apple Silicon support as it becomes available on the `multi-backend-refactor` branch based on community contributions.
+
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-macosx_13_1_arm64.whl'
+```
+
+</hfoption>
+</hfoptions>
+
+#### Compile from Source[[multi-backend-compile]]
+
+<hfoptions id="backend">
+<hfoption id="AMD ROCm">
+
+#### AMD GPU
+
+bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
+
+```bash
 # Install bitsandbytes from source
 # Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
 git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
@@ -195,10 +330,10 @@ pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise
 
 Similar to the CUDA case, you can compile bitsandbytes from source for Linux and Windows systems.
 
-The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#compile).
+The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#cuda-compile).
 
 ```
-git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
 pip install intel_extension_for_pytorch
 pip install -r requirements-dev.txt
 cmake -DCOMPUTE_BACKEND=cpu -S .
diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
index fca586534..728606b7b 100644
--- a/docs/source/non_cuda_backends.mdx
+++ b/docs/source/non_cuda_backends.mdx
@@ -1,5 +1,8 @@
 # Multi-backend support (non-CUDA backends)
 
+> [!Tip]
+> If you feel these docs need some additional info, please consider submitting a PR or respectfully request the missing info in one of the below mentioned Github discussion spaces.
+
 As part of a recent refactoring effort, we will soon offer official multi-backend support. Currently, this feature is available in a preview alpha release, allowing us to gather early feedback from users to improve the functionality and identify any bugs.
 
 At present, the Intel CPU and AMD ROCm backends are considered fully functional. The Intel XPU backend has limited functionality and is less mature.
@@ -24,4 +27,18 @@ Thank you for your support!
 
 ### Intel
 
-### AMD
+The following performance data is collected from Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf).
+
+#### Inference (CPU)
+
+| Data Type | BF16 | INT8 | NF4 | FP4 |
+|---|---|---|---|---|
+| Speed-Up (vs BF16) | 1.0x | 0.6x | 2.3x | 0.03x |
+| Memory (GB) | 13.1 | 7.6 | 5.0 | 4.6 |
+
+#### Fine-Tuning (CPU)
+
+| Data Type | AMP BF16 | INT8 | NF4 | FP4 |
+|---|---|---|---|---|
+| Speed-Up (vs AMP BF16) | 1.0x | 0.38x | 0.07x | 0.07x |
+| Memory (GB) | 40 | 9 | 6.6 | 6.6 |
diff --git a/setup.py b/setup.py
index 18de0fe5b..2b1c1aff3 100644
--- a/setup.py
+++ b/setup.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 import glob
 import os
+import subprocess
 
 from setuptools import find_packages, setup
 from setuptools.dist import Distribution
@@ -13,6 +14,35 @@
 print("libs:", libs)
 
 
+def get_git_commit_hash():
+    return subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode("utf-8").strip()
+
+
+def is_git_tagged_commit():
+    tags = subprocess.check_output(["git", "tag", "--points-at", "HEAD"]).decode("utf-8").strip()
+    return bool(tags)
+
+
+def get_latest_semver_tag():
+    tags = subprocess.check_output(["git", "tag"], text=True).splitlines()
+    semver_tags = [tag for tag in tags if tag.count(".") == 2 and all(part.isdigit() for part in tag.split("."))]
+    if not semver_tags:
+        raise ValueError("No valid semantic version tags found")
+    return sorted(semver_tags, key=lambda s: list(map(int, s.split("."))))[-1]
+
+
+def write_version_file(version, filepath="bitsandbytes/_version.py"):
+    with open(filepath, "w") as f:
+        f.write(f'__version__ = "{version}"\n')
+
+
+def get_version_and_write_to_file():
+    latest_semver_tag = get_latest_semver_tag()
+    version = latest_semver_tag if is_git_tagged_commit() else f"{latest_semver_tag}.dev+{get_git_commit_hash()}"
+    write_version_file(version)
+    return version
+
+
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 
@@ -25,7 +55,7 @@ def has_ext_modules(self):
 
 setup(
     name="bitsandbytes",
-    version="0.43.3.dev",
+    version=get_version_and_write_to_file(),
     author="Tim Dettmers",
     author_email="dettmers@cs.washington.edu",
     description="k-bit optimizers and matrix multiplication routines.",
diff --git a/tests/test_functional.py b/tests/test_functional.py
index a9d926b89..35187db78 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -2303,6 +2303,7 @@ def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind):
             assert maxratio < 1.02 and maxratio > 0.98
 
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"])
 @pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)

From dd3b745a576f1b55749ff71d18e9631fd69474dd Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Tue, 15 Oct 2024 06:59:47 -0400
Subject: [PATCH 05/37] Revert "enable backward"

This reverts commit cd7bf2145807932c8a8a499ddb6bb14e47eb24fc.
---
 bitsandbytes/autograd/_functions.py     |  2 +-
 bitsandbytes/backends/cpu.py            |  3 +--
 bitsandbytes/backends/cpu_xpu_common.py | 12 +++--------
 bitsandbytes/functional.py              | 28 +++++++------------------
 bitsandbytes/nn/modules.py              |  2 +-
 bitsandbytes/utils.py                   | 24 +++------------------
 6 files changed, 17 insertions(+), 54 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 06683690c..35c2b45de 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -552,7 +552,7 @@ def backward(ctx, grad_output):
         # if req_gradB: grad_B = torch.matmul(grad_output.t(), A)
         if req_gradA:
             if getattr(ctx.state, "ipex", False):
-                grad_A = F.gemv_4bit(grad_output, B, None, state=ctx.state, backward=True)
+                grad_A = F.gemv_4bit(grad_output, B, None, state=ctx.state)
             else:
                 grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t())
 
diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
index 549808c82..5d38171d5 100644
--- a/bitsandbytes/backends/cpu.py
+++ b/bitsandbytes/backends/cpu.py
@@ -163,13 +163,12 @@ def gemv_4bit(
         transposed_A=False,
         transposed_B=False,
         state: QuantState = None,
-        backward=False,
     ) -> torch.Tensor:
         assert_on_cpu([A, B, out])
         if state is None:
             raise ValueError("state cannot be None. gemv_4bit() requires the state from quantize_4bit()")
 
-        return gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state, backward)
+        return gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state)
 
     def dequantize_blockwise(
         self,
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index c298962a2..78473bdc4 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -486,7 +486,6 @@ def gemm_4bit_impl(
     transposed_A=False,
     transposed_B=False,
     state: QuantState = None,
-    backward=False,
 ) -> torch.Tensor:
     """
     Matrix-matrix multiplication with 4-bit quantization.
@@ -512,14 +511,9 @@ def gemm_4bit_impl(
         GEMM output tensor.
     """
     if ipex_cpu and _ipex_cpu_version_prereq(2, 5) and getattr(state, "ipex", False):
-        if backward:
-            output = torch.ops.torch_ipex.woq_linear(A, state.backward_weight, "nf4", torch.Size([state.shape[1], state.shape[0]]),
-                        state.backward_new_scales, state.backward_new_zeros, None, None, state.blocksize,
-                        ipex_cpu.quantization.WoqLowpMode.BF16, 1, state.backward_compensation)
-        else:
-            output = torch.ops.torch_ipex.woq_linear(A, B, "nf4", state.shape,
-                        state.new_scales, state.new_zeros, None, None, state.blocksize,
-                        ipex_cpu.quantization.WoqLowpMode.BF16, 1, state.compensation)
+        output = torch.ops.torch_ipex.woq_linear(A, B, "nf4", state.shape,
+                    state.new_scales, state.new_zeros, None, None, state.blocksize,
+                    ipex_cpu.quantization.WoqLowpMode.BF16, 1, state.compensation)
     else:
         dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize).t()
         output = torch.matmul(A, dqB.to(A.dtype))
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index b53212bfd..6cf64df28 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1530,28 +1530,16 @@ def gemv_4bit(
     transposed_A=False,
     transposed_B=False,
     state=None,
-    backward=False,
 ):
     ensure_backend_is_available(A.device.type)
-    if A.device.type == "cpu":
-        return backends[A.device.type].gemv_4bit(
-            A,
-            B,
-            out=out,
-            transposed_A=transposed_A,
-            transposed_B=transposed_B,
-            state=state,
-            backward=backward,
-        )
-    else:
-        return backends[A.device.type].gemv_4bit(
-            A,
-            B,
-            out=out,
-            transposed_A=transposed_A,
-            transposed_B=transposed_B,
-            state=state,
-        )
+    return backends[A.device.type].gemv_4bit(
+        A,
+        B,
+        out=out,
+        transposed_A=transposed_A,
+        transposed_B=transposed_B,
+        state=state,
+    )
 
 
 def igemm(
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index e8fc53253..0635c653d 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -470,7 +470,7 @@ def forward(self, x: torch.Tensor):
             and self.weight.quant_state.quant_type == "nf4"
             and x.requires_grad == False
         ):
-            enable_ipex_fusion(self, x.requires_grad)
+            enable_ipex_fusion(self)
 
         # weights are cast automatically as Int8Params, but the bias has to be cast manually
         if self.bias is not None and self.bias.dtype != x.dtype:
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index e0810a6e8..b89edd828 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -200,41 +200,23 @@ def unpack_tensor_to_dict(tensor_data):
     return unpacked_dict
 
 
-def enable_ipex_fusion(linear, grad=False):
+def enable_ipex_fusion(linear):
     from bitsandbytes.backends.cpu_xpu_common import _ipex_cpu_version_prereq
 
     if _ipex_cpu_version_prereq(2, 5):
         quant_state = linear.weight.quant_state
         new_weight, new_scales, new_zeros, _, compensation = \
-            torch.ops.ipex_prepack.woq_linear_pack_weight(
-                linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
-                "nf4",
-                quant_state.shape,  # weight shape
-                quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize),  # scales
-                None,  # zero_points
-                None,  # bias
-                None,  # batch_size
-                quant_state.blocksize,
-                2,
-            )
-        if grad or True:
-            backward_new_weight, backward_new_scales, backward_new_zeros, _, backward_compensation = \
                 torch.ops.ipex_prepack.woq_linear_pack_weight(
-                    linear.weight.t().data.reshape([quant_state.shape[1], quant_state.shape[0] // 2]),
+                    linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
                     "nf4",
                     quant_state.shape,  # weight shape
-                    quant_state.absmax.view(quant_state.shape[1], quant_state.shape[0] // quant_state.blocksize),  # scales
+                    quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize),  # scales
                     None,  # zero_points
                     None,  # bias
                     None,  # batch_size
                     quant_state.blocksize,
                     2,
                 )
-            setattr(linear.weight.quant_state, "backward_weight", backward_new_weight)
-            setattr(linear.weight.quant_state, "backward_new_scales", backward_new_scales)
-            setattr(linear.weight.quant_state, "backward_new_zeros", backward_new_zeros)
-            setattr(linear.weight.quant_state, "backward_compensation", backward_compensation)
-
         linear.weight.data = new_weight.data
         setattr(linear.weight.quant_state, "ipex", True)
         setattr(linear.weight.quant_state, "new_scales", new_scales)

From 8422f632bee671de639c2c47fcc49036b11bbc85 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Tue, 15 Oct 2024 06:59:55 -0400
Subject: [PATCH 06/37] Revert "use ipex op in backward"

This reverts commit b8df1aad9414a669e188678b36be304400987a72.
---
 bitsandbytes/autograd/_functions.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 35c2b45de..0abd6b6df 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -517,10 +517,7 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState]
 
         # 1. Dequantize
         # 2. MatmulnN
-        if getattr(quant_state, "ipex", False):
-            output = F.gemv_4bit(A, B, out, state=quant_state)
-        else:
-            output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
+        output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
 
         # 3. Save state
         ctx.state = quant_state
@@ -551,10 +548,7 @@ def backward(ctx, grad_output):
         # not supported by PyTorch. TODO: create work-around
         # if req_gradB: grad_B = torch.matmul(grad_output.t(), A)
         if req_gradA:
-            if getattr(ctx.state, "ipex", False):
-                grad_A = F.gemv_4bit(grad_output, B, None, state=ctx.state)
-            else:
-                grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t())
+            grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t())
 
         return grad_A, grad_B, None, grad_bias, None
 
@@ -581,7 +575,7 @@ def matmul_4bit(
     bias=None,
 ):
     assert quant_state is not None
-    if A.numel() == A.shape[-1] and A.device.type != "cpu" and A.requires_grad == False:
+    if (A.numel() == A.shape[-1] or A.device.type == "cpu") and A.requires_grad == False:
         # CPU backend does not require A to be a vector
         if A.shape[-1] % quant_state.blocksize != 0:
             warn(

From 9cbc081899efea12fcc15699022014700abb47c7 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Mon, 21 Oct 2024 11:09:01 -0400
Subject: [PATCH 07/37] fix finetune

---
 bitsandbytes/nn/modules.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 0635c653d..68050d270 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -469,8 +469,11 @@ def forward(self, x: torch.Tensor):
             and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
             and self.weight.quant_state.quant_type == "nf4"
             and x.requires_grad == False
+            and getattr(self.weight.quant_state, "initialized", False) == False
         ):
             enable_ipex_fusion(self)
+        else:
+            setattr(self.weight.quant_state, "initialized", True)
 
         # weights are cast automatically as Int8Params, but the bias has to be cast manually
         if self.bias is not None and self.bias.dtype != x.dtype:

From 6860a4ab6a02418eafa48c8d2d3d4f2b15b8c0ba Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Mon, 21 Oct 2024 11:25:41 -0400
Subject: [PATCH 08/37] check training

---
 bitsandbytes/nn/modules.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 68050d270..db6cae623 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -468,6 +468,7 @@ def forward(self, x: torch.Tensor):
             and not getattr(self.weight.quant_state, "ipex", False)
             and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
             and self.weight.quant_state.quant_type == "nf4"
+            and not self.training
             and x.requires_grad == False
             and getattr(self.weight.quant_state, "initialized", False) == False
         ):

From b2233b775b7b08e7b613402c7164f2cb583fd690 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Tue, 22 Oct 2024 08:41:58 -0400
Subject: [PATCH 09/37] fix gemv check

---
 bitsandbytes/autograd/_functions.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 0abd6b6df..a4d97f44f 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -575,18 +575,22 @@ def matmul_4bit(
     bias=None,
 ):
     assert quant_state is not None
-    if (A.numel() == A.shape[-1] or A.device.type == "cpu") and A.requires_grad == False:
-        # CPU backend does not require A to be a vector
+    if A.device.type == "cpu" and A.requires_grad == False:
+        if getattr(quant_state, "ipex", False):
+            out = F.gemv_4bit(A, B, out, state=quant_state)
+            if bias is not None:
+                out += bias
+            return out
+        else:
+            return MatMul4Bit.apply(A, B, out, bias, quant_state)
+    elif A.numel() == A.shape[-1] and A.requires_grad == False:
         if A.shape[-1] % quant_state.blocksize != 0:
             warn(
                 f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}",
             )
             return MatMul4Bit.apply(A, B, out, bias, quant_state)
         else:
-            if getattr(quant_state, "ipex", False):
-                out = F.gemv_4bit(A, B, out, state=quant_state)
-            else:
-                out = F.gemv_4bit(A, B.t(), out, state=quant_state)
+            out = F.gemv_4bit(A, B.t(), out, state=quant_state)
             if bias is not None:
                 out += bias
             return out

From dbafcbb77912ab2d085321eb9dffdf7fc302fcd4 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Tue, 22 Oct 2024 08:55:16 -0400
Subject: [PATCH 10/37] reformat

---
 bitsandbytes/nn/modules.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index db6cae623..72d9c985e 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -417,6 +417,7 @@ def __init__(
         # self.persistent_buffers = []  # TODO consider as way to save quant state
         self.compute_dtype = compute_dtype
         self.compute_type_is_set = False
+        self.ipex_linear_is_set = False
         self.quant_state = None
         self.quant_storage = quant_storage
 
@@ -461,8 +462,7 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
             for k, v in self.weight.quant_state.as_dict(packed=True).items():
                 destination[prefix + "weight." + k] = v if keep_vars else v.detach()
 
-    def forward(self, x: torch.Tensor):
-        # Check if ipex fusion can be used
+    def set_ipex_linear(self, x: torch.Tensor):
         if (
             x.device.type == "cpu"
             and not getattr(self.weight.quant_state, "ipex", False)
@@ -470,11 +470,14 @@ def forward(self, x: torch.Tensor):
             and self.weight.quant_state.quant_type == "nf4"
             and not self.training
             and x.requires_grad == False
-            and getattr(self.weight.quant_state, "initialized", False) == False
         ):
             enable_ipex_fusion(self)
-        else:
-            setattr(self.weight.quant_state, "initialized", True)
+
+    def forward(self, x: torch.Tensor):
+        # Check if ipex fusion can be used
+        if not self.ipex_linear_is_set:
+            self.set_ipex_linear(x)
+            self.ipex_linear_is_set = True
 
         # weights are cast automatically as Int8Params, but the bias has to be cast manually
         if self.bias is not None and self.bias.dtype != x.dtype:

From 702b748130ae3024ba28dd32ed600554ef7bdee5 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Fri, 8 Nov 2024 07:55:52 +0000
Subject: [PATCH 11/37] avoid double quant in backward if not needed

---
 bitsandbytes/autograd/_functions.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index a4d97f44f..469aef801 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -463,7 +463,9 @@ def backward(ctx, grad_output):
         if len(grad_output.shape) == 3:
             grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()
 
-        Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
+        Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = None, None, None, None, None
+        if req_gradB or (req_gradA and state.CBt):
+            Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
         if req_gradB:
             CxAt, SAt = F.transform(CAt, formatB, transpose=True)
             C32grad, Sgrad = F.transform(Cgradt, "col32", transpose=True)

From 1bde567d24414fd2450eb99a2425f0ed8a62afa6 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Tue, 12 Nov 2024 18:40:01 +0800
Subject: [PATCH 12/37] Zh/xpu support (#9)

* Add xpu support

* Add xpu support for int8

* Add xpu dequant kernel support

* update code

* remove debug comments

* remove redundant comments

* Add xpu integration for woqlinear

* correct the comments

* Update cpu_xpu_common.py

---------

Co-authored-by: zhuhong61 <hong.zhu@intel.com>
Co-authored-by: zhuhong61 <95205772+zhuhong61@users.noreply.github.com>
---
 bitsandbytes/autograd/_functions.py     |  4 +-
 bitsandbytes/backends/cpu_xpu_common.py | 11 +--
 bitsandbytes/backends/xpu.py            | 92 ++++++++++++++++++++++---
 bitsandbytes/functional.py              |  2 +-
 bitsandbytes/nn/modules.py              | 26 ++++++-
 bitsandbytes/utils.py                   | 22 ++++--
 6 files changed, 133 insertions(+), 24 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 469aef801..5f11933e8 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -221,7 +221,7 @@ def backward(ctx, grad_output):
 
 def supports_igemmlt(device: torch.device) -> bool:
     """check if this device supports the optimized int8 kernel"""
-    if device == torch.device("cpu"):
+    if device == torch.device("cpu") or torch.device("xpu"):
         return True
     if torch.version.hip:
         return False if BNB_HIP_VERSION < 601 else True
@@ -321,7 +321,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
 
         # Cast A to fp16
         A_dtype = torch.float16
-        if A.device == torch.device("cpu"):
+        if A.device == torch.device("cpu") or torch.device("xpu"):
             A_dtype = torch.bfloat16
         if A.dtype != A_dtype:
             warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to {A_dtype} during quantization")
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 78473bdc4..6dd700e02 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -15,6 +15,7 @@
 
     ipex_cpu = ipex if ipex._C._has_cpu() else None
     ipex_xpu = ipex if ipex._C._has_xpu() else None
+    ipex_cpu_only = ipex._C._has_cpu() and (not ipex._C._has_xpu())
 except BaseException:
     ipex_cpu = None
     ipex_xpu = None
@@ -342,7 +343,7 @@ def quantize_4bit_impl(
         scaled_A_rem = torch.clamp(A_reshaped[n - rem :] * (1 / absmax[-1]), -1, 1)
         scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0)
     # map [-1, 1] to nf4/fp4
-    out_uint8 = torch.empty(scaled_A.shape, dtype=torch.uint8)
+    out_uint8 = torch.empty(scaled_A.shape, dtype=torch.uint8, device=A.device)
     if quant_type == "nf4":
         for i in range(len(NF4_QUANT_TABLE)):
             out_uint8[scaled_A > NF4_QUANT_TABLE[i]] = i
@@ -438,7 +439,7 @@ def dequantize_4bit_impl(
     if quant_state.nested:
         raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU")
 
-    if ipex_cpu and _ipex_cpu_version_prereq(2, 5) and getattr(quant_state, "ipex", False):
+    if ipex_cpu_only and _ipex_cpu_version_prereq(2, 5) and getattr(quant_state, "ipex", False):
         A = torch.ops.ipex_prepack.woq_linear_unpack_weight(
                 A, "nf4", quant_state.shape, 2
             )
@@ -452,7 +453,9 @@ def dequantize_4bit_impl(
     out_uint8 = torch.empty(A.size(0) * 2, dtype=torch.uint8, device=A.device)
     out_uint8[::2] = A.bitwise_and(0xF)
     out_uint8[1::2] = A.bitwise_right_shift(4)
-    out_dq = torch.empty(out_uint8.shape).to(quant_state.dtype)
+    out_dq = torch.empty(out_uint8.shape).to(quant_state.dtype).to(A.device)
+    # quant_state.code is fp32, cast to quant_state dtype to avoid the mismatch issue
+    quant_state.code = quant_state.code.to(quant_state.dtype)
     for i in range(len(quant_state.code)):
         out_dq[out_uint8 == i] = quant_state.code[i]
 
@@ -510,7 +513,7 @@ def gemm_4bit_impl(
     torch.Tensor:
         GEMM output tensor.
     """
-    if ipex_cpu and _ipex_cpu_version_prereq(2, 5) and getattr(state, "ipex", False):
+    if (ipex_cpu and _ipex_cpu_version_prereq(2, 5)) or (ipex_xpu and _ipex_xpu_version_prereq(2, 5)) and getattr(state, "ipex", False):
         output = torch.ops.torch_ipex.woq_linear(A, B, "nf4", state.shape,
                     state.new_scales, state.new_zeros, None, None, state.blocksize,
                     ipex_cpu.quantization.WoqLowpMode.BF16, 1, state.compensation)
diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index 3976c4d5a..566d6a39d 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -5,9 +5,34 @@
 from bitsandbytes.utils import QuantState
 
 from .base import Backend
+from .cpu_xpu_common import (
+    dequantize_4bit_impl,
+    double_quant_impl,
+    gemm_4bit_impl,
+    igemmlt_impl,
+    mm_dequant_impl,
+    quantize_4bit_impl,
+)
+
+Tensor = torch.Tensor
+def assert_on_xpu(tensors):
+    on_xpu = True
+    for t in tensors:
+        if t is None:
+            continue  # NULL pointers are fine
+        on_xpu &= t.device.type == "xpu"
+    if not on_xpu:
+        raise TypeError(
+            "All input tensors need to be on XPU, but found some tensors to not be on XPU:\n"
+            f" {[(t.shape, t.device) if isinstance(t, Tensor) else None for t in tensors]}"
+        )
+    return on_xpu
 
 
 class XPUBackend(Backend):
+    mm_dequant_compute_dtype = torch.bfloat16
+    mm_dequant_output_dtype = torch.bfloat16
+
     def double_quant(
         self,
         A: torch.Tensor,
@@ -17,7 +42,8 @@ def double_quant(
         out_row: Optional[torch.Tensor] = None,
         threshold=0.0,
     ):
-        raise NotImplementedError
+        assert_on_xpu([A, col_stats, row_stats, out_col, out_row])
+        return double_quant_impl(A, col_stats, row_stats, out_col, out_row, threshold)
 
     def transform(
         self,
@@ -29,7 +55,23 @@ def transform(
         state: Optional[Tuple[torch.Size, str]] = None,
         ld=None,
     ):
-        raise NotImplementedError
+        """
+        Transform tensor A to to_order. It is originally designed for CUDA.
+        For XPU, it returns the original tensor if transpose=False.
+        Otherwise, it returns the transpose of A
+        """
+        assert_on_xpu([A, out])
+        if transpose:
+            if out is not None:
+                out.copy_(A.T)
+            else:
+                out = A.T
+        else:
+            if out is not None:
+                out.copy_(A)
+            else:
+                out = A
+        return out, state
 
     def igemmlt(
         self,
@@ -41,7 +83,8 @@ def igemmlt(
         Sout: Optional[Tuple[torch.Size, str]] = None,
         dtype=torch.int32,
     ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]:
-        raise NotImplementedError
+        assert_on_xpu([A, B])
+        return igemmlt_impl(A, B, SA, SB, out, Sout, dtype)
 
     def mm_dequant(
         self,
@@ -54,7 +97,19 @@ def mm_dequant(
         new_col_stats: Optional[torch.Tensor] = None,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        raise NotImplementedError
+        assert_on_xpu([A, row_stats, col_stats, out, bias])
+        return mm_dequant_impl(
+            A,
+            quant_state,
+            row_stats,
+            col_stats,
+            out,
+            new_row_stats,
+            new_col_stats,
+            bias,
+            self.mm_dequant_compute_dtype,
+            self.mm_dequant_output_dtype,
+        )
 
     def extract_outliers(
         self,
@@ -62,7 +117,9 @@ def extract_outliers(
         SA: Tuple[torch.Size, str],
         idx: torch.Tensor,
     ) -> torch.Tensor:
-        raise NotImplementedError
+        assert_on_xpu([A])
+        return A[:, idx].contiguous()
+
 
     def quantize_4bit(
         self,
@@ -74,7 +131,11 @@ def quantize_4bit(
         quant_type: Literal["fp4", "nf4"] = "fp4",
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
-        raise NotImplementedError
+        if blocksize is None:
+            blocksize = 64
+        assert_on_xpu([A, absmax, out])
+        assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage"
+        return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
 
     def dequantize_4bit(
         self,
@@ -85,7 +146,19 @@ def dequantize_4bit(
         blocksize: int = 64,
         quant_type: Literal["fp4", "nf4"] = "fp4",
     ) -> torch.Tensor:
-        raise NotImplementedError
+        if blocksize is None:
+            blocksize = 64
+        assert_on_xpu([A, absmax, out])
+        output_dq = torch.ops.torch_ipex.dequantize_4bit(
+            A,
+            "nf4",
+            quant_state.shape,
+            quant_state.absmax,
+            None,
+            blocksize
+        )
+        output_dq = output_dq.t()
+        return output_dq        
 
     def gemv_4bit(
         self,
@@ -96,7 +169,10 @@ def gemv_4bit(
         transposed_B=False,
         state: QuantState = None,
     ) -> torch.Tensor:
-        raise NotImplementedError
+        assert_on_xpu([A, B, out])
+        if state is None:
+            raise ValueError("state cannot be None. gemv_4bit() requires the state from quantize_4bit()")
+        return gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state)
 
     def dequantize_blockwise(
         self,
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 6cf64df28..d486dc474 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1800,7 +1800,7 @@ class COOSparseTensor:
     def __init__(self, rows, cols, nnz, rowidx, colidx, values):
         assert rowidx.dtype == torch.int32
         assert colidx.dtype == torch.int32
-        if values.device == torch.device("cpu"):
+        if values.device == torch.device("cpu") or torch.device("xpu"):
             assert values.dtype in [torch.bfloat16, torch.half, torch.float]
         else:
             assert values.dtype == torch.float16
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 72d9c985e..fca9ecb78 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -314,6 +314,9 @@ def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: b
     def cpu(self, non_blocking: bool = False):
         return self.to(device="cpu", non_blocking=non_blocking)
 
+    def xpu(self, non_blocking: bool = False):
+        return self.to(device="xpu", non_blocking=non_blocking)
+
     @overload
     def to(
         self: T,
@@ -331,7 +334,7 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
 
-        if device is not None and device.type in ["cuda", "cpu"] and not self.bnb_quantized:
+        if device is not None and device.type in ["cuda", "cpu", "xpu"] and not self.bnb_quantized:
             return self._quantize(device)
         else:
             if self.quant_state is not None:
@@ -464,7 +467,7 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
 
     def set_ipex_linear(self, x: torch.Tensor):
         if (
-            x.device.type == "cpu"
+            (x.device.type == "cpu" or x.device.type == "xpu")
             and not getattr(self.weight.quant_state, "ipex", False)
             and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
             and self.weight.quant_state.quant_type == "nf4"
@@ -652,6 +655,19 @@ def cpu(self):
         self.SCB = SCB
         return self
 
+    def xpu(self):
+        # we store the 8-bit rows-major weight
+        B = self.data.contiguous().bfloat16().xpu()
+        CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
+        if CBt is not None:
+            del CBt
+        if SCBt is not None:
+            del SCBt
+        self.data = CB
+        self.CB = CB
+        self.SCB = SCB
+        return self
+
     @overload
     def to(
         self: T,
@@ -677,6 +693,12 @@ def to(self, *args, **kwargs):
                 return self
             else:
                 return self.cpu()
+        elif device.type == "xpu":
+            if self.data.dtype == torch.int8:
+                self.CB = self.data
+                return self
+            else:
+                return self.xpu()
         else:
             new_param = Int8Params(
                 super().to(device=device, dtype=dtype, non_blocking=non_blocking),
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index b89edd828..980288b12 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -201,9 +201,10 @@ def unpack_tensor_to_dict(tensor_data):
 
 
 def enable_ipex_fusion(linear):
-    from bitsandbytes.backends.cpu_xpu_common import _ipex_cpu_version_prereq
+    from bitsandbytes.backends.cpu_xpu_common import _ipex_cpu_version_prereq, _ipex_xpu_version_prereq
+    from bitsandbytes.backends.cpu_xpu_common import ipex_cpu_only, ipex_xpu
 
-    if _ipex_cpu_version_prereq(2, 5):
+    if ipex_cpu_only and _ipex_cpu_version_prereq(2, 5):
         quant_state = linear.weight.quant_state
         new_weight, new_scales, new_zeros, _, compensation = \
                 torch.ops.ipex_prepack.woq_linear_pack_weight(
@@ -217,11 +218,18 @@ def enable_ipex_fusion(linear):
                     quant_state.blocksize,
                     2,
                 )
-        linear.weight.data = new_weight.data
-        setattr(linear.weight.quant_state, "ipex", True)
-        setattr(linear.weight.quant_state, "new_scales", new_scales)
-        setattr(linear.weight.quant_state, "new_zeros", new_zeros)
-        setattr(linear.weight.quant_state, "compensation", compensation)
+    elif ipex_xpu and _ipex_xpu_version_prereq(2, 5):
+        quant_state = linear.weight.quant_state
+        new_weight = linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2])
+        
+        new_scales = quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize)
+        new_zeros = None
+        compensation = None
+    linear.weight.data = new_weight.data
+    setattr(linear.weight.quant_state, "ipex", True)
+    setattr(linear.weight.quant_state, "new_scales", new_scales)
+    setattr(linear.weight.quant_state, "new_zeros", new_zeros)
+    setattr(linear.weight.quant_state, "compensation", compensation)
 
 
 class QuantState:

From a5d92c3af03ed64dedfd5db6fdb5cf1887886232 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Wed, 13 Nov 2024 15:55:07 +0000
Subject: [PATCH 13/37] avoid import triton if CPU and XPU backend

---
 bitsandbytes/__init__.py    |  5 ++++-
 bitsandbytes/nn/__init__.py | 15 +++++++++------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 25ec8a79a..3d30f0100 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -21,7 +21,6 @@
 from .backends.cpu import CPUBackend
 from .backends.npu import NPUBackend
 from .cextension import lib
-from .nn import modules
 
 features = {"multi_backend"}
 supported_torch_devices = {
@@ -64,6 +63,10 @@
 if hasattr(torch, "npu") and torch.npu.is_available():
     register_backend("npu", NPUBackend())
 
+
+# import module after decided backends
+from .nn import modules
+
 # TODO: Other potential backends:
 # XLA - Google TPU / PJRT runtime
 # HPU - Habana / Intel Gaudi
diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py
index 96f4359bf..dc52e9566 100644
--- a/bitsandbytes/nn/__init__.py
+++ b/bitsandbytes/nn/__init__.py
@@ -14,9 +14,12 @@
     StableEmbedding,
     SwitchBackLinearBnb,
 )
-from .triton_based_modules import (
-    StandardLinear,
-    SwitchBackLinear,
-    SwitchBackLinearGlobal,
-    SwitchBackLinearVectorwise,
-)
+from ..backends import backends
+# CPU and XPU backend do not need triton, and XPU so not support triton for now.
+if "xpu" not in backends.keys() or ("cpu" in backends.keys() and len(backends.keys()) == 1):
+    from .triton_based_modules import (
+        StandardLinear,
+        SwitchBackLinear,
+        SwitchBackLinearGlobal,
+        SwitchBackLinearVectorwise,
+    )

From e7b755b7761316c0af3a331d696821c91b55ae84 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Wed, 13 Nov 2024 15:59:36 +0000
Subject: [PATCH 14/37] fix setup in docker without git config

---
 setup.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 2b1c1aff3..ae175fd4a 100644
--- a/setup.py
+++ b/setup.py
@@ -37,8 +37,12 @@ def write_version_file(version, filepath="bitsandbytes/_version.py"):
 
 
 def get_version_and_write_to_file():
-    latest_semver_tag = get_latest_semver_tag()
-    version = latest_semver_tag if is_git_tagged_commit() else f"{latest_semver_tag}.dev+{get_git_commit_hash()}"
+    try:
+        latest_semver_tag = get_latest_semver_tag()
+        version = latest_semver_tag if is_git_tagged_commit() else f"{latest_semver_tag}.dev+{get_git_commit_hash()}"
+    except:
+        print("Cannot get version by git tag, use 1.0 defaultly")
+        version = "1.0.dev+0"
     write_version_file(version)
     return version
 

From 4d4e240aa00f32a83bf64d8422fa959b33b00cd7 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Wed, 13 Nov 2024 16:31:18 +0000
Subject: [PATCH 15/37] xpu do not support compile for now

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/backends/cpu_xpu_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 6dd700e02..44213ef7d 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -56,7 +56,7 @@ def _ipex_xpu_version_prereq(major, minor):
 
 def _maybe_torch_compile(func):
     # torch.compile requires g++ and pytorch >= 2.0
-    if gxx_available and _torch_version_prereq(2, 0):
+    if gxx_available and _torch_version_prereq(2, 0) and not ipex_xpu:
         options = {}
         # fx_graph_cache requires pytorch >= 2.2
         if _torch_version_prereq(2, 2):

From 0ccb0b57845ac535bc7280277c629f1039fe1886 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 14 Nov 2024 10:28:31 +0000
Subject: [PATCH 16/37] update xpu

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/autograd/_functions.py     | 2 +-
 bitsandbytes/backends/cpu_xpu_common.py | 2 +-
 bitsandbytes/nn/modules.py              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 5f11933e8..78ada7d5e 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -577,7 +577,7 @@ def matmul_4bit(
     bias=None,
 ):
     assert quant_state is not None
-    if A.device.type == "cpu" and A.requires_grad == False:
+    if A.device.type in ("cpu", "xpu") and A.requires_grad == False:
         if getattr(quant_state, "ipex", False):
             out = F.gemv_4bit(A, B, out, state=quant_state)
             if bias is not None:
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 44213ef7d..c1a982e2e 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -513,7 +513,7 @@ def gemm_4bit_impl(
     torch.Tensor:
         GEMM output tensor.
     """
-    if (ipex_cpu and _ipex_cpu_version_prereq(2, 5)) or (ipex_xpu and _ipex_xpu_version_prereq(2, 5)) and getattr(state, "ipex", False):
+    if getattr(state, "ipex", False):
         output = torch.ops.torch_ipex.woq_linear(A, B, "nf4", state.shape,
                     state.new_scales, state.new_zeros, None, None, state.blocksize,
                     ipex_cpu.quantization.WoqLowpMode.BF16, 1, state.compensation)
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index fca9ecb78..887ec169e 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -467,7 +467,7 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
 
     def set_ipex_linear(self, x: torch.Tensor):
         if (
-            (x.device.type == "cpu" or x.device.type == "xpu")
+            (x.device.type in ("cpu", "xpu"))
             and not getattr(self.weight.quant_state, "ipex", False)
             and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
             and self.weight.quant_state.quant_type == "nf4"

From 712f584bbcaab6f995d7f6288a80a260ddf5ed66 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 14 Nov 2024 12:01:00 +0000
Subject: [PATCH 17/37] update 4bit compute dtype

---
 bitsandbytes/backends/cpu_xpu_common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index c1a982e2e..b33b57d8c 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -234,8 +234,8 @@ def mm_dequant_impl(
         out_shape = (out_shape[0] * out_shape[1], out_shape[2])
 
     if compute_dtype not in [torch.float32, torch.bfloat16]:
-        warnings.warn(f"mm_dequant_{A.device}: compute_dtype {compute_dtype} is not supported, will use float instead")
-        compute_dtype = torch.float32
+        warnings.warn(f"mm_dequant_{A.device}: compute_dtype {compute_dtype} is not supported, will use bfloat16 instead")
+        compute_dtype = torch.bfloat16
     A_reshaped = A.reshape(out_shape).to(compute_dtype)
     row_stats = row_stats.reshape(-1).unsqueeze(-1).to(compute_dtype)
     col_stats = col_stats.reshape(-1).unsqueeze(0).to(compute_dtype)

From b58db749bfd1c02553e94f89c7175e5aab7a8c4c Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 14 Nov 2024 12:19:06 +0000
Subject: [PATCH 18/37] fix xpu int8 path

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/backends/cpu_xpu_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index b33b57d8c..1ff9b0705 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -182,7 +182,7 @@ def igemmlt_impl(A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32)
     A_reshaped = A.reshape(m, k)
 
     # torch._int_mm is available on CPU since torch 2.4
-    if _torch_version_prereq(2, 4):
+    if _torch_version_prereq(2, 4) and A.device.type == "cpu":
         C = torch._int_mm(A_reshaped, B.T).to(dtype)
     else:
         C = torch.matmul(A_reshaped.float(), B.t().float()).to(dtype)

From 0c9015a388ab434059a9330f6fc9334beb44b4d0 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 14 Nov 2024 13:49:28 +0000
Subject: [PATCH 19/37] optimize 4bit dequant

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/backends/cpu_xpu_common.py | 29 ++++++++++++-------------
 bitsandbytes/backends/xpu.py            |  4 +++-
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 1ff9b0705..99df9a881 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -409,7 +409,6 @@ def dequantize_4bit_impl(
     torch.Tensor:
         Dequantized tensor.
     """
-
     if A.shape[0] == 1:
         transpose = False
         A = A.squeeze(0)
@@ -445,19 +444,14 @@ def dequantize_4bit_impl(
             )
         quant_state.ipex = False
 
-    if out is None:
-        out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
-
-    n = out.numel()
     # Map nf4 to [-1, 1]
-    out_uint8 = torch.empty(A.size(0) * 2, dtype=torch.uint8, device=A.device)
-    out_uint8[::2] = A.bitwise_and(0xF)
-    out_uint8[1::2] = A.bitwise_right_shift(4)
-    out_dq = torch.empty(out_uint8.shape).to(quant_state.dtype).to(A.device)
+    out_dq = torch.empty(A.size(0) * 2, dtype=torch.int32, device=A.device)
+    n = out_dq.numel()
+    out_dq[::2] = A & 0xF
+    out_dq[1::2] = A >> 4
     # quant_state.code is fp32, cast to quant_state dtype to avoid the mismatch issue
     quant_state.code = quant_state.code.to(quant_state.dtype)
-    for i in range(len(quant_state.code)):
-        out_dq[out_uint8 == i] = quant_state.code[i]
+    out_dq = quant_state.code[out_dq]
 
     # Apply scales
     if out_dq.numel() != n:
@@ -467,12 +461,17 @@ def dequantize_4bit_impl(
     blocks += 1 if n % blocksize > 0 else 0
     rem = n % blocksize
     has_rem = rem > 0
-    out_reshaped = out.reshape(-1)
-    out_reshaped[: n - rem] = (out_dq[: n - rem].view(-1, blocksize) * absmax[: blocks - has_rem].view(-1, 1)).reshape(
-        -1
-    )
+
     if has_rem:
+        if out is None:
+            out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
+        out_reshaped = out.reshape(-1)
+        out_reshaped[: n - rem] = (out_dq[: n - rem].view(-1, blocksize) * absmax[: blocks - has_rem].view(-1, 1)).reshape(
+            -1
+        )
         out_reshaped[n - rem :] = out_dq[n - rem :] * absmax[-1]
+    else:
+        out = (out_dq.view(-1, blocksize) * absmax.view(-1, 1)).reshape(quant_state.shape).to(quant_state.dtype)
 
     # take transpose here because weight is transposed (again) for computation
     if transpose:
diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index 566d6a39d..e9b29bff4 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -172,7 +172,9 @@ def gemv_4bit(
         assert_on_xpu([A, B, out])
         if state is None:
             raise ValueError("state cannot be None. gemv_4bit() requires the state from quantize_4bit()")
-        return gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state)
+        dequant_out = gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state)
+        torch.xpu.empty_cache()
+        return dequant_out
 
     def dequantize_blockwise(
         self,

From 0e919dc2880f3fe8a3527153ba27f871a130bb79 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 14 Nov 2024 13:51:34 +0000
Subject: [PATCH 20/37] fix xpu dequant

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/backends/xpu.py | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index e9b29bff4..e9e722aac 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -149,16 +149,9 @@ def dequantize_4bit(
         if blocksize is None:
             blocksize = 64
         assert_on_xpu([A, absmax, out])
-        output_dq = torch.ops.torch_ipex.dequantize_4bit(
-            A,
-            "nf4",
-            quant_state.shape,
-            quant_state.absmax,
-            None,
-            blocksize
-        )
-        output_dq = output_dq.t()
-        return output_dq        
+        dequant_out = dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type)
+        torch.xpu.empty_cache()
+        return dequant_out
 
     def gemv_4bit(
         self,
@@ -172,9 +165,7 @@ def gemv_4bit(
         assert_on_xpu([A, B, out])
         if state is None:
             raise ValueError("state cannot be None. gemv_4bit() requires the state from quantize_4bit()")
-        dequant_out = gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state)
-        torch.xpu.empty_cache()
-        return dequant_out
+        return gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state)
 
     def dequantize_blockwise(
         self,

From ee4fd00e0ab949cbcb50e0ad99882defda5a39f8 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 14 Nov 2024 16:27:02 +0000
Subject: [PATCH 21/37] add empty cache in each xpu op

---
 bitsandbytes/backends/xpu.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index e9e722aac..04de32292 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -43,7 +43,9 @@ def double_quant(
         threshold=0.0,
     ):
         assert_on_xpu([A, col_stats, row_stats, out_col, out_row])
-        return double_quant_impl(A, col_stats, row_stats, out_col, out_row, threshold)
+        output = double_quant_impl(A, col_stats, row_stats, out_col, out_row, threshold)
+        torch.xpu.empty_cache()
+        return output
 
     def transform(
         self,
@@ -84,7 +86,9 @@ def igemmlt(
         dtype=torch.int32,
     ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]:
         assert_on_xpu([A, B])
-        return igemmlt_impl(A, B, SA, SB, out, Sout, dtype)
+        torch.xpu.empty_cache()
+        output = igemmlt_impl(A, B, SA, SB, out, Sout, dtype)
+        return output
 
     def mm_dequant(
         self,
@@ -98,7 +102,7 @@ def mm_dequant(
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         assert_on_xpu([A, row_stats, col_stats, out, bias])
-        return mm_dequant_impl(
+        output = mm_dequant_impl(
             A,
             quant_state,
             row_stats,
@@ -110,6 +114,8 @@ def mm_dequant(
             self.mm_dequant_compute_dtype,
             self.mm_dequant_output_dtype,
         )
+        torch.xpu.empty_cache()
+        return output
 
     def extract_outliers(
         self,
@@ -118,7 +124,9 @@ def extract_outliers(
         idx: torch.Tensor,
     ) -> torch.Tensor:
         assert_on_xpu([A])
-        return A[:, idx].contiguous()
+        output = A[:, idx].contiguous()
+        torch.xpu.empty_cache()
+        return output
 
 
     def quantize_4bit(
@@ -135,7 +143,9 @@ def quantize_4bit(
             blocksize = 64
         assert_on_xpu([A, absmax, out])
         assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage"
-        return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
+        output = quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
+        torch.xpu.empty_cache()
+        return output
 
     def dequantize_4bit(
         self,
@@ -149,9 +159,9 @@ def dequantize_4bit(
         if blocksize is None:
             blocksize = 64
         assert_on_xpu([A, absmax, out])
-        dequant_out = dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type)
+        output = dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type)
         torch.xpu.empty_cache()
-        return dequant_out
+        return output
 
     def gemv_4bit(
         self,
@@ -165,7 +175,9 @@ def gemv_4bit(
         assert_on_xpu([A, B, out])
         if state is None:
             raise ValueError("state cannot be None. gemv_4bit() requires the state from quantize_4bit()")
-        return gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state)
+        output = gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state)
+        torch.xpu.empty_cache()
+        return output
 
     def dequantize_blockwise(
         self,

From 35b8c91287e1d16e3b197dc0a0bad6b55d03fc84 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Fri, 15 Nov 2024 13:30:44 +0000
Subject: [PATCH 22/37] add nf4 dequant ipex kernel

---
 bitsandbytes/backends/xpu.py |  5 ++++-
 bitsandbytes/functional.py   | 16 ++++++++--------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index 04de32292..75d43acdc 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -159,7 +159,10 @@ def dequantize_4bit(
         if blocksize is None:
             blocksize = 64
         assert_on_xpu([A, absmax, out])
-        output = dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type)
+        if quant_type == "nf4":
+            output = torch.ops.torch_ipex.dequantize_4bit(A, "nf4", quant_state.shape, absmax, None,blocksize).t()
+        else:
+            output = dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type)
         torch.xpu.empty_cache()
         return output
 
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index d486dc474..1386b3e39 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1006,10 +1006,6 @@ def dequantize_fp4(
     out: Optional[torch.Tensor] = None,
     blocksize: Optional[int] = None,
 ) -> Tensor:
-    if blocksize is None:
-        # Some AMD GPUs have warpsize 64
-        # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
-        blocksize = 64 if not HIP_ENVIRONMENT else 128
 
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")
 
@@ -1021,10 +1017,6 @@ def dequantize_nf4(
     out: Optional[torch.Tensor] = None,
     blocksize: Optional[int] = None,
 ) -> Tensor:
-    if blocksize is None:
-        # Some AMD GPUs have warpsize 64
-        # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
-        blocksize = 64 if not HIP_ENVIRONMENT else 128
 
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")
 
@@ -1064,6 +1056,14 @@ def dequantize_4bit(
         Dequantized tensor.
     """
     ensure_backend_is_available(A.device.type)
+    if quant_state is not None:
+        absmax = absmax or quant_state.absmax
+        quant_type = quant_type or quant_state.quant_type
+        blocksize = blocksize or quant_state.blocksize
+    if blocksize is None:
+        # Some AMD GPUs have warpsize 64
+        # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
+        blocksize = 64 if not HIP_ENVIRONMENT else 128
     return backends[A.device.type].dequantize_4bit(
         A, quant_state=quant_state, absmax=absmax, out=out, blocksize=blocksize, quant_type=quant_type
     )

From 347524df28f78fc0eead0ff98f62d720a673dc8d Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Fri, 15 Nov 2024 13:35:04 +0000
Subject: [PATCH 23/37] fix dequant 4bit op

---
 bitsandbytes/functional.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 1386b3e39..c46ac31c6 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1027,7 +1027,7 @@ def dequantize_4bit(
     absmax: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
     blocksize: Optional[int] = None,
-    quant_type="fp4",
+    quant_type=None,
 ) -> Tensor:
     """
     Dequantizes FP4 blockwise quantized values.

From ed0e37030c89eb448b4d7ba3d7b9110b03d560cb Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Fri, 15 Nov 2024 13:39:23 +0000
Subject: [PATCH 24/37] empty cache has negative effect on 4bit gemv

---
 bitsandbytes/backends/xpu.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index 75d43acdc..374283127 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -179,7 +179,6 @@ def gemv_4bit(
         if state is None:
             raise ValueError("state cannot be None. gemv_4bit() requires the state from quantize_4bit()")
         output = gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state)
-        torch.xpu.empty_cache()
         return output
 
     def dequantize_blockwise(

From 11db860c834a6b494269a83708a957b78adffeac Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Fri, 15 Nov 2024 16:20:59 +0000
Subject: [PATCH 25/37] fix xpu save

---
 bitsandbytes/nn/modules.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 887ec169e..0694c654a 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -453,11 +453,14 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
             getattr(self.weight, "quant_state", None) is not None
             and getattr(self.weight.quant_state, "ipex", False)
         ):
-            original_weight = torch.ops.ipex_prepack.woq_linear_unpack_weight(
-                self.weight, "nf4", self.weight.quant_state.shape, 2
-            )
-            self.weight.data = original_weight.data
-            self.weight.quant_state.ipex = False
+            if self.device.type == "cpu":
+                original_weight = torch.ops.ipex_prepack.woq_linear_unpack_weight(
+                    self.weight, "nf4", self.weight.quant_state.shape, 2
+                )
+                self.weight.data = original_weight.data
+                self.weight.quant_state.ipex = False
+            elif self.device.type == "xpu":
+                self.weight.data = self.weight.data.reshape(1, -1)
 
         super()._save_to_state_dict(destination, prefix, keep_vars)  # saving weight and bias
 

From 92e8c87669c3362af67ed6d3d24633b3a08545b7 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Fri, 15 Nov 2024 16:30:02 +0000
Subject: [PATCH 26/37] fix save

---
 bitsandbytes/nn/modules.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 0694c654a..e35c44f59 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -453,13 +453,13 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
             getattr(self.weight, "quant_state", None) is not None
             and getattr(self.weight.quant_state, "ipex", False)
         ):
-            if self.device.type == "cpu":
+            if self.weight.device.type == "cpu":
                 original_weight = torch.ops.ipex_prepack.woq_linear_unpack_weight(
                     self.weight, "nf4", self.weight.quant_state.shape, 2
                 )
                 self.weight.data = original_weight.data
                 self.weight.quant_state.ipex = False
-            elif self.device.type == "xpu":
+            elif self.weight.device.type == "xpu":
                 self.weight.data = self.weight.data.reshape(1, -1)
 
         super()._save_to_state_dict(destination, prefix, keep_vars)  # saving weight and bias

From cf0a8074c6c92cd2d7de1584f9a92ddb74eb03be Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Fri, 15 Nov 2024 16:51:35 +0000
Subject: [PATCH 27/37] xpu use float16 default

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/autograd/_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index f9816775d..9765def05 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -321,7 +321,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
 
         # Cast A to fp16
         A_dtype = torch.float16
-        if A.device == torch.device("cpu") or torch.device("xpu"):
+        if A.device == torch.device("cpu"):
             A_dtype = torch.bfloat16
         if A.dtype != A_dtype:
             warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to {A_dtype} during quantization")

From 7a32842b92a60f9c7ae067b441eda2b60f9bb196 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Fri, 15 Nov 2024 17:07:55 +0000
Subject: [PATCH 28/37] rm empty cache as it cause slower perf

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/backends/xpu.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index 374283127..c9a4ebc12 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -44,7 +44,6 @@ def double_quant(
     ):
         assert_on_xpu([A, col_stats, row_stats, out_col, out_row])
         output = double_quant_impl(A, col_stats, row_stats, out_col, out_row, threshold)
-        torch.xpu.empty_cache()
         return output
 
     def transform(
@@ -86,7 +85,6 @@ def igemmlt(
         dtype=torch.int32,
     ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]:
         assert_on_xpu([A, B])
-        torch.xpu.empty_cache()
         output = igemmlt_impl(A, B, SA, SB, out, Sout, dtype)
         return output
 
@@ -114,7 +112,6 @@ def mm_dequant(
             self.mm_dequant_compute_dtype,
             self.mm_dequant_output_dtype,
         )
-        torch.xpu.empty_cache()
         return output
 
     def extract_outliers(
@@ -125,7 +122,6 @@ def extract_outliers(
     ) -> torch.Tensor:
         assert_on_xpu([A])
         output = A[:, idx].contiguous()
-        torch.xpu.empty_cache()
         return output
 
 
@@ -144,7 +140,6 @@ def quantize_4bit(
         assert_on_xpu([A, absmax, out])
         assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage"
         output = quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
-        torch.xpu.empty_cache()
         return output
 
     def dequantize_4bit(
@@ -163,7 +158,7 @@ def dequantize_4bit(
             output = torch.ops.torch_ipex.dequantize_4bit(A, "nf4", quant_state.shape, absmax, None,blocksize).t()
         else:
             output = dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type)
-        torch.xpu.empty_cache()
+
         return output
 
     def gemv_4bit(

From e636f75d1fff32cf6b9c2c96132d7a3b3f3fc0f5 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Mon, 18 Nov 2024 13:09:21 +0000
Subject: [PATCH 29/37] fix xpu save

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/backends/xpu.py | 2 +-
 bitsandbytes/nn/modules.py   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index c9a4ebc12..c332bc40d 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -138,7 +138,7 @@ def quantize_4bit(
         if blocksize is None:
             blocksize = 64
         assert_on_xpu([A, absmax, out])
-        assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage"
+        assert quant_storage == torch.uint8, "XPU backend only supports uint8 quant_storage"
         output = quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
         return output
 
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 68c58050c..b29bb0de2 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -458,10 +458,11 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
                     self.weight, "nf4", self.weight.quant_state.shape, 2
                 )
                 self.weight.data = original_weight.data
-                self.weight.quant_state.ipex = False
             elif self.weight.device.type == "xpu":
                 self.weight.data = self.weight.data.reshape(1, -1)
 
+            self.weight.quant_state.ipex = False
+
         super()._save_to_state_dict(destination, prefix, keep_vars)  # saving weight and bias
 
         if getattr(self.weight, "quant_state", None) is not None:

From 987423a6d61805127a5cafdfb018b78977b5b778 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Mon, 18 Nov 2024 15:09:14 +0000
Subject: [PATCH 30/37] fix 8bit int8 param device

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/nn/modules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 mode change 100644 => 100755 bitsandbytes/nn/modules.py

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
old mode 100644
new mode 100755
index b29bb0de2..0d7ea4034
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -696,7 +696,7 @@ def to(self, *args, **kwargs):
         elif device.type == "xpu":
             if self.data.dtype == torch.int8:
                 self.CB = self.data
-                return self
+                return self.to(device="xpu", non_blocking=non_blocking)
             else:
                 return self.xpu()
         else:

From 9da03f1a07de2a8b658b643c25b4bd0aa266211c Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Mon, 18 Nov 2024 15:13:02 +0000
Subject: [PATCH 31/37] fix 8bit int8 param device

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/nn/modules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 0d7ea4034..5b3fc823f 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -696,7 +696,7 @@ def to(self, *args, **kwargs):
         elif device.type == "xpu":
             if self.data.dtype == torch.int8:
                 self.CB = self.data
-                return self.to(device="xpu", non_blocking=non_blocking)
+                return super().xpu(device)
             else:
                 return self.xpu()
         else:

From aa3b245fd025124d67164c8ea5c0870df64acedd Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Mon, 18 Nov 2024 15:24:30 +0000
Subject: [PATCH 32/37] fix 8bit int8 param device

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/nn/modules.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 5b3fc823f..22c258cfd 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -657,7 +657,7 @@ def cpu(self):
 
     def xpu(self):
         # we store the 8-bit rows-major weight
-        B = self.data.contiguous().bfloat16().xpu()
+        B = self.data.contiguous().float16().xpu()
         CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
         if CBt is not None:
             del CBt
@@ -695,8 +695,9 @@ def to(self, *args, **kwargs):
                 return self.cpu()
         elif device.type == "xpu":
             if self.data.dtype == torch.int8:
+                self.data = self.data.contiguous().xpu()
                 self.CB = self.data
-                return super().xpu(device)
+                return self
             else:
                 return self.xpu()
         else:

From 1e27a22839467edf50bd7411132c80d891107458 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Mon, 18 Nov 2024 15:28:15 +0000
Subject: [PATCH 33/37] fix 8bit int8 param device

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/nn/modules.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 22c258cfd..3b09817e8 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -644,7 +644,7 @@ def __deepcopy__(self, memo):
 
     def cpu(self):
         # we store the 8-bit rows-major weight
-        B = self.data.contiguous().bfloat16().cpu()
+        B = self.data.contiguous().to(torch.bfloat16).cpu()
         CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
         if CBt is not None:
             del CBt
@@ -657,7 +657,7 @@ def cpu(self):
 
     def xpu(self):
         # we store the 8-bit rows-major weight
-        B = self.data.contiguous().float16().xpu()
+        B = self.data.contiguous().to(torch.float16).xpu()
         CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
         if CBt is not None:
             del CBt

From 314f724d68693cf3cfcdb3329fddae8355da4892 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Tue, 19 Nov 2024 13:28:30 +0000
Subject: [PATCH 34/37] fix format

---
 bitsandbytes/backends/cpu_xpu_common.py | 31 +++++++++++++------
 bitsandbytes/backends/xpu.py            |  5 +--
 bitsandbytes/functional.py              |  2 --
 bitsandbytes/nn/__init__.py             |  3 +-
 bitsandbytes/nn/modules.py              |  5 +--
 bitsandbytes/utils.py                   | 41 +++++++++++++------------
 6 files changed, 49 insertions(+), 38 deletions(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 99df9a881..d2e0c2593 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -234,7 +234,9 @@ def mm_dequant_impl(
         out_shape = (out_shape[0] * out_shape[1], out_shape[2])
 
     if compute_dtype not in [torch.float32, torch.bfloat16]:
-        warnings.warn(f"mm_dequant_{A.device}: compute_dtype {compute_dtype} is not supported, will use bfloat16 instead")
+        warnings.warn(
+            f"mm_dequant_{A.device}: compute_dtype {compute_dtype} is not supported, will use bfloat16 instead"
+        )
         compute_dtype = torch.bfloat16
     A_reshaped = A.reshape(out_shape).to(compute_dtype)
     row_stats = row_stats.reshape(-1).unsqueeze(-1).to(compute_dtype)
@@ -439,9 +441,7 @@ def dequantize_4bit_impl(
         raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU")
 
     if ipex_cpu_only and _ipex_cpu_version_prereq(2, 5) and getattr(quant_state, "ipex", False):
-        A = torch.ops.ipex_prepack.woq_linear_unpack_weight(
-                A, "nf4", quant_state.shape, 2
-            )
+        A = torch.ops.ipex_prepack.woq_linear_unpack_weight(A, "nf4", quant_state.shape, 2)
         quant_state.ipex = False
 
     # Map nf4 to [-1, 1]
@@ -466,9 +466,9 @@ def dequantize_4bit_impl(
         if out is None:
             out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
         out_reshaped = out.reshape(-1)
-        out_reshaped[: n - rem] = (out_dq[: n - rem].view(-1, blocksize) * absmax[: blocks - has_rem].view(-1, 1)).reshape(
-            -1
-        )
+        out_reshaped[: n - rem] = (
+            out_dq[: n - rem].view(-1, blocksize) * absmax[: blocks - has_rem].view(-1, 1)
+        ).reshape(-1)
         out_reshaped[n - rem :] = out_dq[n - rem :] * absmax[-1]
     else:
         out = (out_dq.view(-1, blocksize) * absmax.view(-1, 1)).reshape(quant_state.shape).to(quant_state.dtype)
@@ -513,9 +513,20 @@ def gemm_4bit_impl(
         GEMM output tensor.
     """
     if getattr(state, "ipex", False):
-        output = torch.ops.torch_ipex.woq_linear(A, B, "nf4", state.shape,
-                    state.new_scales, state.new_zeros, None, None, state.blocksize,
-                    ipex_cpu.quantization.WoqLowpMode.BF16, 1, state.compensation)
+        output = torch.ops.torch_ipex.woq_linear(
+            A,
+            B,
+            "nf4",
+            state.shape,
+            state.new_scales,
+            state.new_zeros,
+            None,
+            None,
+            state.blocksize,
+            ipex_cpu.quantization.WoqLowpMode.BF16,
+            1,
+            state.compensation,
+        )
     else:
         dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize).t()
         output = torch.matmul(A, dqB.to(A.dtype))
diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index c332bc40d..bc13963e6 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -15,6 +15,8 @@
 )
 
 Tensor = torch.Tensor
+
+
 def assert_on_xpu(tensors):
     on_xpu = True
     for t in tensors:
@@ -124,7 +126,6 @@ def extract_outliers(
         output = A[:, idx].contiguous()
         return output
 
-
     def quantize_4bit(
         self,
         A: torch.Tensor,
@@ -155,7 +156,7 @@ def dequantize_4bit(
             blocksize = 64
         assert_on_xpu([A, absmax, out])
         if quant_type == "nf4":
-            output = torch.ops.torch_ipex.dequantize_4bit(A, "nf4", quant_state.shape, absmax, None,blocksize).t()
+            output = torch.ops.torch_ipex.dequantize_4bit(A, "nf4", quant_state.shape, absmax, None, blocksize).t()
         else:
             output = dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type)
 
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index c46ac31c6..3c730cb16 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1006,7 +1006,6 @@ def dequantize_fp4(
     out: Optional[torch.Tensor] = None,
     blocksize: Optional[int] = None,
 ) -> Tensor:
-
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")
 
 
@@ -1017,7 +1016,6 @@ def dequantize_nf4(
     out: Optional[torch.Tensor] = None,
     blocksize: Optional[int] = None,
 ) -> Tensor:
-
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")
 
 
diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py
index dc52e9566..fe3e83596 100644
--- a/bitsandbytes/nn/__init__.py
+++ b/bitsandbytes/nn/__init__.py
@@ -2,6 +2,7 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
+from ..backends import backends
 from .modules import (
     Embedding,
     Int8Params,
@@ -14,7 +15,7 @@
     StableEmbedding,
     SwitchBackLinearBnb,
 )
-from ..backends import backends
+
 # CPU and XPU backend do not need triton, and XPU so not support triton for now.
 if "xpu" not in backends.keys() or ("cpu" in backends.keys() and len(backends.keys()) == 1):
     from .triton_based_modules import (
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 3b09817e8..2159c21e4 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -449,10 +449,7 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
         save weight and bias,
         then fill state_dict with components of quant_state
         """
-        if (
-            getattr(self.weight, "quant_state", None) is not None
-            and getattr(self.weight.quant_state, "ipex", False)
-        ):
+        if getattr(self.weight, "quant_state", None) is not None and getattr(self.weight.quant_state, "ipex", False):
             if self.weight.device.type == "cpu":
                 original_weight = torch.ops.ipex_prepack.woq_linear_unpack_weight(
                     self.weight, "nf4", self.weight.quant_state.shape, 2
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index 980288b12..adb36279c 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -201,35 +201,38 @@ def unpack_tensor_to_dict(tensor_data):
 
 
 def enable_ipex_fusion(linear):
-    from bitsandbytes.backends.cpu_xpu_common import _ipex_cpu_version_prereq, _ipex_xpu_version_prereq
-    from bitsandbytes.backends.cpu_xpu_common import ipex_cpu_only, ipex_xpu
+    from bitsandbytes.backends.cpu_xpu_common import (
+        _ipex_cpu_version_prereq,
+        _ipex_xpu_version_prereq,
+        ipex_cpu_only,
+        ipex_xpu,
+    )
 
     if ipex_cpu_only and _ipex_cpu_version_prereq(2, 5):
         quant_state = linear.weight.quant_state
-        new_weight, new_scales, new_zeros, _, compensation = \
-                torch.ops.ipex_prepack.woq_linear_pack_weight(
-                    linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
-                    "nf4",
-                    quant_state.shape,  # weight shape
-                    quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize),  # scales
-                    None,  # zero_points
-                    None,  # bias
-                    None,  # batch_size
-                    quant_state.blocksize,
-                    2,
-                )
+        new_weight, new_scales, new_zeros, _, compensation = torch.ops.ipex_prepack.woq_linear_pack_weight(
+            linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
+            "nf4",
+            quant_state.shape,  # weight shape
+            quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize),  # scales
+            None,  # zero_points
+            None,  # bias
+            None,  # batch_size
+            quant_state.blocksize,
+            2,
+        )
     elif ipex_xpu and _ipex_xpu_version_prereq(2, 5):
         quant_state = linear.weight.quant_state
         new_weight = linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2])
-        
+
         new_scales = quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize)
         new_zeros = None
         compensation = None
     linear.weight.data = new_weight.data
-    setattr(linear.weight.quant_state, "ipex", True)
-    setattr(linear.weight.quant_state, "new_scales", new_scales)
-    setattr(linear.weight.quant_state, "new_zeros", new_zeros)
-    setattr(linear.weight.quant_state, "compensation", compensation)
+    linear.weight.quant_state.ipex = True
+    linear.weight.quant_state.new_scales = new_scales
+    linear.weight.quant_state.new_zeros = new_zeros
+    linear.weight.quant_state.compensation = compensation
 
 
 class QuantState:

From 95387c8ce4964031f8045f11529f74132b2312f4 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Wed, 20 Nov 2024 15:58:17 +0000
Subject: [PATCH 35/37] update readme for Intel CPU and XPU do not need make
 csrc codes

---
 docs/source/installation.mdx      | 2 --
 docs/source/non_cuda_backends.mdx | 6 +++---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index b38bdb920..615dfd95e 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -336,8 +336,6 @@ The below commands are for Linux. For installing on Windows, please adapt the be
 git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
 pip install intel_extension_for_pytorch
 pip install -r requirements-dev.txt
-cmake -DCOMPUTE_BACKEND=cpu -S .
-make
 pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise leave that out)
 ```
 
diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
index 728606b7b..4c429fb2d 100644
--- a/docs/source/non_cuda_backends.mdx
+++ b/docs/source/non_cuda_backends.mdx
@@ -33,12 +33,12 @@ The following performance data is collected from Intel 4th Gen Xeon (SPR) platfo
 
 | Data Type | BF16 | INT8 | NF4 | FP4 |
 |---|---|---|---|---|
-| Speed-Up (vs BF16) | 1.0x | 0.6x | 2.3x | 0.03x |
+| Speed-Up (vs BF16) | 1.0x | 0.44x | 1.8x | 0.1x |
 | Memory (GB) | 13.1 | 7.6 | 5.0 | 4.6 |
 
 #### Fine-Tuning (CPU)
 
-| Data Type | AMP BF16 | INT8 | NF4 | FP4 |
+| Data Type | BF16 | INT8 | NF4 | FP4 |
 |---|---|---|---|---|
-| Speed-Up (vs AMP BF16) | 1.0x | 0.38x | 0.07x | 0.07x |
+| Speed-Up (vs BF16) | 1.0x | 0.38x | 0.1x | 0.1x |
 | Memory (GB) | 40 | 9 | 6.6 | 6.6 |

From f039cfe0c1fb3a5e5fc9a47593a91b4f15712b1c Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 21 Nov 2024 10:27:20 +0000
Subject: [PATCH 36/37] fix format

---
 bitsandbytes/__init__.py | 5 +++--
 setup.py                 | 8 ++------
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 3d30f0100..c705137c0 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -17,7 +17,7 @@
     matmul_cublas,
     mm_cublas,
 )
-from .backends import register_backend
+from .backends import backends, register_backend
 from .backends.cpu import CPUBackend
 from .backends.npu import NPUBackend
 from .cextension import lib
@@ -65,7 +65,8 @@
 
 
 # import module after decided backends
-from .nn import modules
+if backends:
+    from .nn import modules
 
 # TODO: Other potential backends:
 # XLA - Google TPU / PJRT runtime
diff --git a/setup.py b/setup.py
index ae175fd4a..2b1c1aff3 100644
--- a/setup.py
+++ b/setup.py
@@ -37,12 +37,8 @@ def write_version_file(version, filepath="bitsandbytes/_version.py"):
 
 
 def get_version_and_write_to_file():
-    try:
-        latest_semver_tag = get_latest_semver_tag()
-        version = latest_semver_tag if is_git_tagged_commit() else f"{latest_semver_tag}.dev+{get_git_commit_hash()}"
-    except:
-        print("Cannot get version by git tag, use 1.0 defaultly")
-        version = "1.0.dev+0"
+    latest_semver_tag = get_latest_semver_tag()
+    version = latest_semver_tag if is_git_tagged_commit() else f"{latest_semver_tag}.dev+{get_git_commit_hash()}"
     write_version_file(version)
     return version
 

From bde878a0c85533724c1edfe30d33c82ebbc3b5e3 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 21 Nov 2024 12:42:51 +0000
Subject: [PATCH 37/37] fix import

---
 bitsandbytes/nn/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py
index fe3e83596..35bee393e 100644
--- a/bitsandbytes/nn/__init__.py
+++ b/bitsandbytes/nn/__init__.py
@@ -17,7 +17,7 @@
 )
 
 # CPU and XPU backend do not need triton, and XPU so not support triton for now.
-if "xpu" not in backends.keys() or ("cpu" in backends.keys() and len(backends.keys()) == 1):
+if "xpu" not in backends.keys() and len(backends.keys()) > 1:
     from .triton_based_modules import (
         StandardLinear,
         SwitchBackLinear,