Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .github/scripts/build-rocm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,33 @@ if [ "${build_os:0:6}" == ubuntu ]; then
&& pip install cmake==3.31.6 \
&& cmake -DCOMPUTE_BACKEND=hip -DCMAKE_BUILD_TYPE=MinSizeRel -DCMAKE_HIP_FLAGS=\"--offload-compress\" -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
&& cmake --build ."
else
bnb_rocm_arch="gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"

pip install ninja cmake==3.31.6

# Install ROCm SDK wheels from repo.radeon.com.
rocm_base_url="https://repo.radeon.com/rocm/windows/rocm-rel-${rocm_version}"
pip install \
"${rocm_base_url}/rocm_sdk_core-${rocm_version}-py3-none-win_amd64.whl" \
"${rocm_base_url}/rocm_sdk_devel-${rocm_version}-py3-none-win_amd64.whl" \
"${rocm_base_url}/rocm_sdk_libraries_custom-${rocm_version}-py3-none-win_amd64.whl" \
"${rocm_base_url}/rocm-${rocm_version}.tar.gz"

# Expand the devel tarball
rocm-sdk init

ROCM_PATH="$(rocm-sdk path --root)"
export ROCM_PATH
export PATH="${ROCM_PATH}/bin:${PATH}"

cmake -G Ninja \
-DCOMPUTE_BACKEND=hip \
-DBNB_ROCM_ARCH="${bnb_rocm_arch}" \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DCMAKE_HIP_FLAGS="--offload-compress" \
-S .
cmake --build .
fi

output_dir="output/${build_os}/${build_arch}"
Expand Down
8 changes: 8 additions & 0 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,15 @@ jobs:
os: [ubuntu-22.04]
arch: [x86_64]
rocm_version: ["6.2.4", "6.3.4", "6.4.4", "7.0.2", "7.1", "7.2.1"]
include:
- os: windows-2025
arch: x86_64
rocm_version: "7.2.1"
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- name: Clean up disk space
if: startsWith(matrix.os, 'ubuntu')
run: |
echo "Disk space before cleanup:"
df -h
Expand All @@ -156,6 +161,9 @@ jobs:

echo "Disk space after cleanup:"
df -h
- name: Setup MSVC
if: startsWith(matrix.os, 'windows')
uses: ilammy/msvc-dev-cmd@v1.13.0
- name: Build C++
run: bash .github/scripts/build-rocm.sh
env:
Expand Down
25 changes: 16 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,17 @@ cmake_minimum_required(VERSION 3.22.1)
# On Windows with HIP backend, auto-detect compilers from ROCM_PATH before project()
if(WIN32 AND COMPUTE_BACKEND STREQUAL "hip")
if(DEFINED ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH})
file(TO_CMAKE_PATH "$ENV{ROCM_PATH}" ROCM_PATH)
endif()
if(ROCM_PATH AND NOT DEFINED CMAKE_CXX_COMPILER)
set(CMAKE_CXX_COMPILER "${ROCM_PATH}/lib/llvm/bin/clang++.exe")
endif()
if(ROCM_PATH AND NOT DEFINED CMAKE_HIP_COMPILER)
set(CMAKE_HIP_COMPILER "${ROCM_PATH}/lib/llvm/bin/clang++.exe")
endif()
if(NOT DEFINED HIP_PLATFORM)
set(HIP_PLATFORM "amd" CACHE STRING "HIP Platform")
endif()
# On Windows, the HIP compiler needs explicit paths to find device libraries.
if(ROCM_PATH)
find_path(ROCM_DEVICE_LIB_PATH
Expand All @@ -35,9 +38,9 @@ if(WIN32 AND COMPUTE_BACKEND STREQUAL "hip")
"${ROCM_PATH}/lib/llvm/amdgcn/bitcode"
NO_DEFAULT_PATH
)
set(CMAKE_HIP_FLAGS "--rocm-path=${ROCM_PATH}")
string(APPEND CMAKE_HIP_FLAGS " --rocm-path=${ROCM_PATH}")
if(ROCM_DEVICE_LIB_PATH)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} --rocm-device-lib-path=${ROCM_DEVICE_LIB_PATH}")
string(APPEND CMAKE_HIP_FLAGS " --rocm-device-lib-path=${ROCM_DEVICE_LIB_PATH}")
endif()
endif()
endif()
Expand Down Expand Up @@ -357,7 +360,7 @@ endif()
if(BUILD_HIP)
# Determine ROCM_PATH from environment variable, fallback to /opt/rocm on Linux
if(DEFINED ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH})
file(TO_CMAKE_PATH "$ENV{ROCM_PATH}" ROCM_PATH)
else()
set(ROCM_PATH /opt/rocm)
endif()
Expand Down Expand Up @@ -416,11 +419,15 @@ if(WIN32)
set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
endif()
set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME ${BNB_OUTPUT_NAME})
if(MSVC)
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
if(WIN32)
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bitsandbytes")
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bitsandbytes")
if(MSVC)
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
endif()
endif()

set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bitsandbytes")
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,18 @@ bitsandbytes has the following minimum requirements for all platforms:
<td>✅</td>
<td>✅</td>
</tr>
<tr>
<td></td>
<td>🟥 AMD GPU <br><code>cuda</code></td>
<td>
RDNA: gfx1100, gfx1101, gfx1102,<br>
gfx1150, gfx1151,<br>
gfx1200, gfx1201
</td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
</tr>
<tr>
<td></td>
<td>🟦 Intel GPU <br><code>xpu</code></td>
Expand Down
35 changes: 0 additions & 35 deletions bitsandbytes/cuda_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,38 +109,3 @@ def get_rocm_gpu_arch() -> str:
""",
)
return "unknown"


def get_rocm_warpsize() -> int:
    """Get ROCm warp size."""
    logger = logging.getLogger(__name__)
    try:
        if not torch.version.hip:
            # nvidia cards always use 32 warp size
            return 32

        # On Windows, use hipinfo.exe; on Linux, use rocminfo
        if platform.system() == "Windows":
            # hipinfo.exe output format: "warpSize: 32" or "warpSize: 64"
            command, pattern = ["hipinfo.exe"], r"warpSize:\s+(\d+)"
        else:
            command, pattern = ["rocminfo"], r"Wavefront Size:\s+([0-9]{2})\(0x[0-9]{2}\)"

        probe = subprocess.run(command, capture_output=True, text=True)
        found = re.search(pattern, probe.stdout)
        if found:
            return int(found.group(1))
        # default to 64 to be safe
        return 64
    except Exception as e:
        logger.error(f"Could not detect ROCm warp size: {e}. Defaulting to 64. (some 4-bit functions may not work!)")
        if torch.cuda.is_available():
            logger.warning(
                """
ROCm warp size detection failed despite ROCm being available.
""",
            )
        return 64
5 changes: 4 additions & 1 deletion bitsandbytes/diagnostics/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,10 @@
}

CUDA_RUNTIME_LIB_PATTERNS = (
("libamdhip64.so*",)
(
"libamdhip64.so*", # Linux
"amdhip64*.dll", # Windows
)
if HIP_ENVIRONMENT
else (
"cudart64*.dll", # Windows
Expand Down
33 changes: 33 additions & 0 deletions docs/source/errors.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,36 @@ Make sure this path is appended to the `LD_LIBRARY_PATH` so bnb can find the CUD
If this does not fix the issue, please try compilation from source next.

If this does not work, please open an issue and paste the printed environment if you call `make` and the associated error when running bnb.

## Library not found: version mismatch

If you see an error like `Library not found: libbitsandbytes_cuda128.dll` or `libbitsandbytes_rocm72.so`, it means the pre-compiled library version doesn't match the CUDA/ROCm version reported by your PyTorch installation.

The library filename encodes the version: `libbitsandbytes_cuda{major}{minor}` for CUDA, `libbitsandbytes_rocm{major}{minor}` for ROCm. bitsandbytes picks which one to load based on what PyTorch reports:

```python
import torch
print(torch.version.cuda) # e.g. "12.8" -> looks for libbitsandbytes_cuda128
print(torch.version.hip) # e.g. "7.2" -> looks for libbitsandbytes_rocm72
```

This commonly happens when your PyTorch was compiled against a different CUDA/ROCm version than what you have installed on your system. For example, PyTorch built with ROCm 7.2 reports `torch.version.hip = "7.2"` and bitsandbytes looks for `libbitsandbytes_rocm72`, even if your system has a different ROCm version installed.

To resolve this:

1. **Install a matching PyTorch version** that aligns with the pre-compiled libraries shipped in the bitsandbytes wheel.
2. **Override the version at runtime** with an environment variable so bitsandbytes loads a different library:
```bash
# Linux / macOS
export BNB_CUDA_VERSION=128 # or BNB_ROCM_VERSION=72

# Windows (cmd)
set BNB_CUDA_VERSION=128
```
3. **Compile from source** to produce a library matching your exact toolkit version. For ROCm, you can override the library name with `-DROCM_VERSION`:
```bash
cmake -DCOMPUTE_BACKEND=hip -DROCM_VERSION=72 -S . # produces libbitsandbytes_rocm72
```
For CUDA, the version is detected automatically from the CUDA compiler on your PATH and cannot be overridden -- make sure the correct CUDA Toolkit is first on your PATH.

See the [installation guide](installation) for full compile-from-source instructions.
48 changes: 43 additions & 5 deletions docs/source/installation.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,10 @@ pip install -e .

* Support for AMD GPUs is currently in a preview state.
* All features are supported for both consumer RDNA devices and Data Center CDNA products.
* A compatible PyTorch version with AMD ROCm support is required. It is recommended to use the latest stable release. See [PyTorch on ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html) for guidance.
* A compatible PyTorch version with AMD ROCm support is required. It is recommended to use the latest stable release. On Linux, see [PyTorch on ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html) for guidance. On Windows, ROCm-enabled PyTorch wheels are available from:
- [repo.radeon.com/rocm/windows/](https://repo.radeon.com/rocm/windows/) — official AMD releases
- [repo.amd.com/rocm/whl/](https://repo.amd.com/rocm/whl/) — [TheRock](https://github.com/ROCm/TheRock) release builds
- [rocm.nightlies.amd.com/v2](https://rocm.nightlies.amd.com/v2) — TheRock nightly builds

### Installation from PyPI[[rocm-pip]]

Expand All @@ -203,8 +206,7 @@ The currently distributed `bitsandbytes` are built with the following configurat
| **Linux x86-64** | 7.0.2 | CDNA: gfx90a, gfx942, gfx950 / RDNA: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx1153, gfx1200, gfx1201
| **Linux x86-64** | 7.1.0 | CDNA: gfx90a, gfx942, gfx950 / RDNA: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx1153, gfx1200, gfx1201
| **Linux x86-64** | 7.2.1 | CDNA: gfx90a, gfx942, gfx950 / RDNA: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx1153, gfx1200, gfx1201

**Windows is not currently supported.**
| **Windows x86-64** | 7.2.1 | RDNA: gfx1100, gfx1101, gfx1102, gfx1150, gfx1151, gfx1200, gfx1201

Use `pip` or `uv` to install the latest release:

Expand All @@ -214,12 +216,18 @@ pip install bitsandbytes

### Compile from Source[[rocm-compile]]

bitsandbytes can be compiled from ROCm 6.2 - ROCm 7.2.1.
bitsandbytes can be compiled from ROCm 6.2 - ROCm 7.2.1. See the `CMakeLists.txt` for additional options.

<hfoptions id="rocm-source">
<hfoption id="Linux">

To compile from source, you need CMake >= **3.31.6**.
To compile from source, you need CMake >= **3.31.6** and Python >= **3.10** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.).

You should also have a ROCm installation (system-wide or via Docker). The current minimum supported version is **6.2**.

```bash
# Install bitsandbytes from source

# Clone bitsandbytes repo
git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/

Expand All @@ -230,6 +238,36 @@ make
pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)
```

</hfoption>
<hfoption id="Windows">

Compilation on Windows requires Visual Studio with C++ support, CMake, Ninja, and Python >= **3.10**.

Instead of a system-wide ROCm installation, you can use the pip-installable ROCm SDK wheels from [repo.radeon.com](https://repo.radeon.com/rocm/windows/):

```bash
# Install ROCm SDK wheels (adjust version as needed)
pip install ninja cmake
pip install \
https://repo.radeon.com/rocm/windows/rocm-rel-7.2.1/rocm_sdk_core-7.2.1-py3-none-win_amd64.whl \
https://repo.radeon.com/rocm/windows/rocm-rel-7.2.1/rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl \
https://repo.radeon.com/rocm/windows/rocm-rel-7.2.1/rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl \
https://repo.radeon.com/rocm/windows/rocm-rel-7.2.1/rocm-7.2.1.tar.gz

# Expand the devel tarball
rocm-sdk init

# Set ROCM_PATH, then build (run these commands from a Visual Studio developer shell
# so the MSVC toolchain is on PATH)
export ROCM_PATH="$(rocm-sdk path --root)"
export PATH="${ROCM_PATH}/bin:${PATH}"
git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
cmake -G Ninja -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH="gfx1100" -DCMAKE_BUILD_TYPE=Release -S .
cmake --build . --config Release
pip install .
```
</hfoption>
</hfoptions>

## Preview Wheels[[preview-wheels]]

If you would like to use new features even before they are officially released and help us test them, feel free to install the wheel directly from our CI (*the wheel links will remain stable!*):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_linear4bit.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,7 @@ def test_params4bit_quant_state_attr_access(device, quant_type, compress_statist

@pytest.mark.skipif(not torch.cuda.is_available(), reason="FSDP requires CUDA")
@pytest.mark.skipif(
not torch.distributed.is_nccl_available(),
not getattr(torch.distributed, "is_nccl_available", lambda: False)(),
reason="FSDP test requires NCCL backend",
)
def test_fsdp_state_dict_save_4bit():
Expand Down
Loading