diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh
index dd44beb5b..77bd2eaf5 100644
--- a/.github/scripts/build-rocm.sh
+++ b/.github/scripts/build-rocm.sh
@@ -21,6 +21,33 @@ if [ "${build_os:0:6}" == ubuntu ]; then
       && pip install cmake==3.31.6 \
       && cmake -DCOMPUTE_BACKEND=hip -DCMAKE_BUILD_TYPE=MinSizeRel -DCMAKE_HIP_FLAGS=\"--offload-compress\" -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
       && cmake --build ."
+else
+    bnb_rocm_arch="gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
+
+    pip install ninja cmake==3.31.6
+
+    # Install ROCm SDK wheels from repo.radeon.com.
+    rocm_base_url="https://repo.radeon.com/rocm/windows/rocm-rel-${rocm_version}"
+    pip install \
+        "${rocm_base_url}/rocm_sdk_core-${rocm_version}-py3-none-win_amd64.whl" \
+        "${rocm_base_url}/rocm_sdk_devel-${rocm_version}-py3-none-win_amd64.whl" \
+        "${rocm_base_url}/rocm_sdk_libraries_custom-${rocm_version}-py3-none-win_amd64.whl" \
+        "${rocm_base_url}/rocm-${rocm_version}.tar.gz"
+
+    # Expand the devel tarball
+    rocm-sdk init
+
+    ROCM_PATH="$(rocm-sdk path --root)"
+    export ROCM_PATH
+    export PATH="${ROCM_PATH}/bin:${PATH}"
+
+    cmake -G Ninja \
+        -DCOMPUTE_BACKEND=hip \
+        -DBNB_ROCM_ARCH="${bnb_rocm_arch}" \
+        -DCMAKE_BUILD_TYPE=MinSizeRel \
+        -DCMAKE_HIP_FLAGS="--offload-compress" \
+        -S .
+    cmake --build .
 fi
 
 output_dir="output/${build_os}/${build_arch}"
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index a6b338d65..20fb21336 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -137,10 +137,15 @@ jobs:
         os: [ubuntu-22.04]
         arch: [x86_64]
         rocm_version: ["6.2.4", "6.3.4", "6.4.4", "7.0.2", "7.1", "7.2.1"]
+        include:
+          - os: windows-2025
+            arch: x86_64
+            rocm_version: "7.2.1"
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
       - name: Clean up disk space
+        if: startsWith(matrix.os, 'ubuntu')
         run: |
           echo "Disk space before cleanup:"
           df -h
@@ -156,6 +161,9 @@ jobs:
 
           echo "Disk space after cleanup:"
           df -h
+      - name: Setup MSVC
+        if: startsWith(matrix.os, 'windows')
+        uses: ilammy/msvc-dev-cmd@v1.13.0
       - name: Build C++
         run: bash .github/scripts/build-rocm.sh
         env:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 952d5d4ea..39473eff1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,7 +19,7 @@ cmake_minimum_required(VERSION 3.22.1)
 # On Windows with HIP backend, auto-detect compilers from ROCM_PATH before project()
 if(WIN32 AND COMPUTE_BACKEND STREQUAL "hip")
     if(DEFINED ENV{ROCM_PATH})
-        set(ROCM_PATH $ENV{ROCM_PATH})
+        file(TO_CMAKE_PATH "$ENV{ROCM_PATH}" ROCM_PATH)
     endif()
     if(ROCM_PATH AND NOT DEFINED CMAKE_CXX_COMPILER)
         set(CMAKE_CXX_COMPILER "${ROCM_PATH}/lib/llvm/bin/clang++.exe")
@@ -27,6 +27,9 @@ if(WIN32 AND COMPUTE_BACKEND STREQUAL "hip")
     if(ROCM_PATH AND NOT DEFINED CMAKE_HIP_COMPILER)
         set(CMAKE_HIP_COMPILER "${ROCM_PATH}/lib/llvm/bin/clang++.exe")
     endif()
+    if(NOT DEFINED HIP_PLATFORM)
+        set(HIP_PLATFORM "amd" CACHE STRING "HIP Platform")
+    endif()
     # On Windows, the HIP compiler needs explicit paths to find device libraries.
     if(ROCM_PATH)
         find_path(ROCM_DEVICE_LIB_PATH
@@ -35,9 +38,9 @@ if(WIN32 AND COMPUTE_BACKEND STREQUAL "hip")
                   "${ROCM_PATH}/lib/llvm/amdgcn/bitcode"
             NO_DEFAULT_PATH
         )
-        set(CMAKE_HIP_FLAGS "--rocm-path=${ROCM_PATH}")
+        string(APPEND CMAKE_HIP_FLAGS " --rocm-path=${ROCM_PATH}")
         if(ROCM_DEVICE_LIB_PATH)
-            set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} --rocm-device-lib-path=${ROCM_DEVICE_LIB_PATH}")
+            string(APPEND CMAKE_HIP_FLAGS " --rocm-device-lib-path=${ROCM_DEVICE_LIB_PATH}")
         endif()
     endif()
 endif()
@@ -357,7 +360,7 @@ endif()
 if(BUILD_HIP)
     # Determine ROCM_PATH from environment variable, fallback to /opt/rocm on Linux
     if(DEFINED ENV{ROCM_PATH})
-      set(ROCM_PATH $ENV{ROCM_PATH})
+      file(TO_CMAKE_PATH "$ENV{ROCM_PATH}" ROCM_PATH)
     else()
       set(ROCM_PATH /opt/rocm)
     endif()
@@ -416,11 +419,15 @@ if(WIN32)
     set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
 endif()
 set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME ${BNB_OUTPUT_NAME})
-if(MSVC)
-    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
-    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
-    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
-    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
+if(WIN32)
+    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bitsandbytes")
+    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bitsandbytes")
+    if(MSVC)
+        set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
+        set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
+        set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
+        set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
+    endif()
 endif()
 
 set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bitsandbytes")
diff --git a/README.md b/README.md
index 1315f164f..b4fd29b3a 100644
--- a/README.md
+++ b/README.md
@@ -133,6 +133,18 @@ bitsandbytes has the following minimum requirements for all platforms:
       <td>✅</td>
       <td>✅</td>
     </tr>
+    <tr>
+      <td></td>
+      <td>🟥 AMD GPU <br><code>cuda</code></td>
+      <td>
+        RDNA: gfx1100, gfx1101, gfx1102,<br>
+        gfx1150, gfx1151,<br>
+        gfx1200, gfx1201
+      </td>
+      <td>✅</td>
+      <td>✅</td>
+      <td>✅</td>
+    </tr>
     <tr>
       <td></td>
       <td>🟦 Intel GPU <br><code>xpu</code></td>
diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py
index f9fdd295d..25ce3cd1e 100644
--- a/bitsandbytes/cuda_specs.py
+++ b/bitsandbytes/cuda_specs.py
@@ -109,38 +109,3 @@ def get_rocm_gpu_arch() -> str:
                 """,
             )
         return "unknown"
-
-
-def get_rocm_warpsize() -> int:
-    """Get ROCm warp size."""
-    logger = logging.getLogger(__name__)
-    try:
-        if torch.version.hip:
-            # On Windows, use hipinfo.exe; on Linux, use rocminfo
-            if platform.system() == "Windows":
-                cmd = ["hipinfo.exe"]
-                # hipinfo.exe output format: "warpSize: 32" or "warpSize: 64"
-                warp_pattern = r"warpSize:\s+(\d+)"
-            else:
-                cmd = ["rocminfo"]
-                warp_pattern = r"Wavefront Size:\s+([0-9]{2})\(0x[0-9]{2}\)"
-
-            result = subprocess.run(cmd, capture_output=True, text=True)
-            match = re.search(warp_pattern, result.stdout)
-            if match:
-                return int(match.group(1))
-            else:
-                # default to 64 to be safe
-                return 64
-        else:
-            # nvidia cards always use 32 warp size
-            return 32
-    except Exception as e:
-        logger.error(f"Could not detect ROCm warp size: {e}. Defaulting to 64. (some 4-bit functions may not work!)")
-        if torch.cuda.is_available():
-            logger.warning(
-                """
-ROCm warp size detection failed despite ROCm being available.
-                """,
-            )
-        return 64
diff --git a/bitsandbytes/diagnostics/cuda.py b/bitsandbytes/diagnostics/cuda.py
index de4d036cb..0b38e2f72 100644
--- a/bitsandbytes/diagnostics/cuda.py
+++ b/bitsandbytes/diagnostics/cuda.py
@@ -32,7 +32,10 @@
 }
 
 CUDA_RUNTIME_LIB_PATTERNS = (
-    ("libamdhip64.so*",)
+    (
+        "libamdhip64.so*",  # Linux
+        "amdhip64*.dll",  # Windows
+    )
     if HIP_ENVIRONMENT
     else (
         "cudart64*.dll",  # Windows
diff --git a/docs/source/errors.mdx b/docs/source/errors.mdx
index 95594ea11..d232a8c5b 100644
--- a/docs/source/errors.mdx
+++ b/docs/source/errors.mdx
@@ -20,3 +20,36 @@ Make sure this path is appended to the `LD_LIBRARY_PATH` so bnb can find the CUD
 If this does not fix the issue, please try compilation from source next.
 
 If this does not work, please open an issue and paste the printed environment if you call `make` and the associated error when running bnb.
+
+## Library not found: version mismatch
+
+If you see an error like `Library not found: libbitsandbytes_cuda128.dll` or `libbitsandbytes_rocm72.so`, it means bitsandbytes could not find a pre-compiled library matching the CUDA/ROCm version reported by your PyTorch installation.
+
+The library filename encodes the version: `libbitsandbytes_cuda{major}{minor}` for CUDA, `libbitsandbytes_rocm{major}{minor}` for ROCm. bitsandbytes picks which one to load based on what PyTorch reports:
+
+```python
+import torch
+print(torch.version.cuda)  # e.g. "12.8" -> looks for libbitsandbytes_cuda128
+print(torch.version.hip)   # e.g. "7.2"  -> looks for libbitsandbytes_rocm72
+```
+
+This commonly happens when your PyTorch was compiled against a different CUDA/ROCm version than what you have installed on your system. For example, PyTorch built with ROCm 7.2 reports `torch.version.hip = "7.2"` and bitsandbytes looks for `libbitsandbytes_rocm72`, even if your system has a different ROCm version installed.
+
+To resolve this:
+
+1. **Install a matching PyTorch version** that aligns with the pre-compiled libraries shipped in the bitsandbytes wheel.
+2. **Override the version at runtime** with an environment variable so bitsandbytes loads a different library:
+   ```bash
+   # Linux / macOS
+   export BNB_CUDA_VERSION=128   # or BNB_ROCM_VERSION=72
+
+   # Windows (cmd)
+   set BNB_CUDA_VERSION=128
+   ```
+3. **Compile from source** to produce a library matching your exact toolkit version. For ROCm, you can override the library name with `-DROCM_VERSION`:
+   ```bash
+   cmake -DCOMPUTE_BACKEND=hip -DROCM_VERSION=72 -S .    # produces libbitsandbytes_rocm72
+   ```
+   For CUDA, the version is detected automatically from the CUDA compiler on your PATH and cannot be overridden -- make sure the correct CUDA Toolkit is first on your PATH.
+
+   See the [installation guide](installation) for full compile-from-source instructions.
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 23f239309..6ff11e6c3 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -187,7 +187,10 @@ pip install -e .
 
 * Support for AMD GPUs is currently in a preview state.
 * All features are supported for both consumer RDNA devices and Data Center CDNA products.
-* A compatible PyTorch version with AMD ROCm support is required. It is recommended to use the latest stable release. See [PyTorch on ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html) for guidance.
+* A compatible PyTorch version with AMD ROCm support is required. It is recommended to use the latest stable release. On Linux, see [PyTorch on ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html) for guidance. On Windows, ROCm-enabled PyTorch wheels are available from:
+  - [repo.radeon.com/rocm/windows/](https://repo.radeon.com/rocm/windows/) — official AMD releases
+  - [repo.amd.com/rocm/whl/](https://repo.amd.com/rocm/whl/) — [TheRock](https://github.com/ROCm/TheRock) release builds
+  - [rocm.nightlies.amd.com/v2](https://rocm.nightlies.amd.com/v2) — TheRock nightly builds
 
 ### Installation from PyPI[[rocm-pip]]
 
@@ -203,8 +206,7 @@ The currently distributed `bitsandbytes` are built with the following configurat
 | **Linux x86-64**   | 7.0.2    | CDNA: gfx90a, gfx942, gfx950 / RDNA: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx1153, gfx1200, gfx1201
 | **Linux x86-64**   | 7.1.0    | CDNA: gfx90a, gfx942, gfx950 / RDNA: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx1153, gfx1200, gfx1201
 | **Linux x86-64**   | 7.2.1    | CDNA: gfx90a, gfx942, gfx950 / RDNA: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx1153, gfx1200, gfx1201
-
-**Windows is not currently supported.**
+| **Windows x86-64** | 7.2.1    | RDNA: gfx1100, gfx1101, gfx1102, gfx1150, gfx1151, gfx1200, gfx1201
 
 Use `pip` or `uv` to install the latest release:
 
@@ -214,12 +216,18 @@ pip install bitsandbytes
 
 ### Compile from Source[[rocm-compile]]
 
-bitsandbytes can be compiled from ROCm 6.2 - ROCm 7.2.1.
+bitsandbytes can be compiled with ROCm versions 6.2 through 7.2.1. See `CMakeLists.txt` for additional build options.
+
+<hfoptions id="rocm-source">
+<hfoption id="Linux">
 
-To compile from source, you need CMake >= **3.31.6**.
+To compile from source, you need CMake >= **3.31.6** and Python >= **3.10**. Make sure a C++ build toolchain (`gcc`, `make`, development headers, etc.) is installed.
+
+You should also have a ROCm installation (system-wide or via Docker). The current minimum supported version is **6.2**.
 
 ```bash
 # Install bitsandbytes from source
+
 # Clone bitsandbytes repo
 git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
 
@@ -230,6 +238,36 @@ make
 pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise leave that out)
 ```
 
+</hfoption>
+<hfoption id="Windows">
+
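+Compilation on Windows requires Visual Studio with C++ support, CMake, Ninja, and Python >= **3.10**. The commands below use Bash syntax (for example, Git Bash) and must be run in a shell where the Visual Studio developer environment is already active.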
+
+Instead of a system-wide ROCm installation, you can use the pip-installable ROCm SDK wheels from [repo.radeon.com](https://repo.radeon.com/rocm/windows/):
+
+```bash
+# Install ROCm SDK wheels (adjust version as needed)
+pip install ninja cmake
+pip install \
+    https://repo.radeon.com/rocm/windows/rocm-rel-7.2.1/rocm_sdk_core-7.2.1-py3-none-win_amd64.whl \
+    https://repo.radeon.com/rocm/windows/rocm-rel-7.2.1/rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl \
+    https://repo.radeon.com/rocm/windows/rocm-rel-7.2.1/rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl \
+    https://repo.radeon.com/rocm/windows/rocm-rel-7.2.1/rocm-7.2.1.tar.gz
+
+# Expand the devel tarball
+rocm-sdk init
+
+# Set ROCM_PATH, then build (run this in a shell where the Visual Studio environment is already active)
+export ROCM_PATH="$(rocm-sdk path --root)"
+export PATH="${ROCM_PATH}/bin:${PATH}"
+git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
+cmake -G Ninja -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH="gfx1100" -DCMAKE_BUILD_TYPE=Release -S .
+cmake --build . --config Release
+pip install .
+```
+</hfoption>
+</hfoptions>
+
 ## Preview Wheels[[preview-wheels]]
 
 If you would like to use new features even before they are officially released and help us test them, feel free to install the wheel directly from our CI (*the wheel links will remain stable!*):
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index d43656b63..5e9e4e49c 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -502,7 +502,7 @@ def test_params4bit_quant_state_attr_access(device, quant_type, compress_statist
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="FSDP requires CUDA")
 @pytest.mark.skipif(
-    not torch.distributed.is_nccl_available(),
+    not getattr(torch.distributed, "is_nccl_available", lambda: False)(),
     reason="FSDP test requires NCCL backend",
 )
 def test_fsdp_state_dict_save_4bit():