From bbafcc3de04a6de4c1950ca098d2fa2a432ecba5 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 14 Nov 2025 10:53:23 -0500 Subject: [PATCH 1/2] CUDA: Drop Maxwell compatibility --- csrc/common.cuh | 4 ---- csrc/kernels.cu | 20 ++++++-------------- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/csrc/common.cuh b/csrc/common.cuh index d454caa0e..9e245fcd6 100644 --- a/csrc/common.cuh +++ b/csrc/common.cuh @@ -2,9 +2,6 @@ // TODO: Let's make some of these constexpr and put in a namespace. -#define BNB_CC_MAXWELL 500 -#define BNB_CC_MAXWELL2 520 -#define BNB_CC_MAXWELL2_X1 530 #define BNB_CC_PASCAL 600 #define BNB_CC_PASCAL_X2 620 #define BNB_CC_VOLTA 700 @@ -17,7 +14,6 @@ #define BNB_CC_HOPPER 900 #define BNB_CC_BLACKWELL 1000 -#define BNB_FP16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_MAXWELL2_X1) #define BNB_FP16_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA) #define BNB_INT8_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA_XAVIER) #define BNB_BF16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_AMPERE) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index f36f4a7e7..804e9db40 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -1767,15 +1767,7 @@ template __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__ void kInt8VectorQuant(T* __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols) { - // For sm50/sm52 and CUDA < 12.2 we need to do the reduction in fp32. - // Otherwise `T` is `fp16`. This can be removed when Maxwell is dropped. -#if (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR >= 2) || BNB_FP16_AVAILABLE - using TReduction = T; -#else - using TReduction = float; -#endif - - using BlockReduceT = cub::BlockReduce; + using BlockReduceT = cub::BlockReduce; // One block per row. // Threads load column values in a striped arrangement. 
@@ -1785,27 +1777,27 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__ // We then do a blockwise reduction to determine the row's absmax. __shared__ typename BlockReduceT::TempStorage temp_storage; - __shared__ TReduction smem_row_absmax; + __shared__ T smem_row_absmax; const int row_id = blockIdx.x; const T* row_data = A + (row_id * cols); // Threads will read the row values in a striped access pattern and find a local absmax. - TReduction row_local_absmax = -FLT_MIN; + T row_local_absmax = -FLT_MIN; for (int i = threadIdx.x; i < cols; i += THREADS) { - const TReduction absval = fabsf(__ldcs(&(row_data[i]))); + const T absval = fabsf(__ldcs(&(row_data[i]))); // For sparse decomposition, values outside of the threshold are not to be // included when calculating the row's absmax. if constexpr (SPARSE_DECOMP) { - row_local_absmax = fmaxf(row_local_absmax, absval < TReduction(threshold) ? absval : row_local_absmax); + row_local_absmax = fmaxf(row_local_absmax, absval < T(threshold) ? absval : row_local_absmax); } else { row_local_absmax = fmaxf(row_local_absmax, absval); } } // Reduce thread-local absmax across the block. - const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, CUB_REDUCTIONOP_MAX, cols); + const T row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, CUB_REDUCTIONOP_MAX, cols); if (threadIdx.x == 0) { // Save our block's absmax to shared memory for the quantization step. 
rowStats[row_id] = smem_row_absmax = row_absmax; From a93f49e8dddfbcdb018f2441a05be98f7d5349db Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 14 Nov 2025 10:54:53 -0500 Subject: [PATCH 2/2] Update docs --- docs/source/installation.mdx | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 5ca3145d5..c125eff48 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -40,12 +40,6 @@ The library can be built using CUDA Toolkit versions as old as **11.8**. | NF4/FP4 quantization | 6.0+ | Pascal (GTX 10X0 series, P100) or newer GPUs| -> [!WARNING] -> Support for Maxwell GPUs is deprecated and will be removed in a future release. -> Maxwell support is not included in PyPI distributions from `v0.48.0` on and must be built from source. -> For the best results, a Turing generation device or newer is recommended. - - ### Installation via PyPI[[cuda-pip]] This is the most straightforward and recommended installation option. @@ -81,7 +75,7 @@ For Linux and Windows systems, compiling from source allows you to customize the -To compile from source, you need CMake >= **3.22.1** and Python >= **3.9** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.). It is recommended to use GCC 9 or newer. +To compile from source, you need CMake >= **3.22.1** and Python >= **3.10** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.). It is recommended to use GCC 11 or newer. 
For example, to install a compiler and CMake on Ubuntu: @@ -133,7 +127,7 @@ The currently distributed `bitsandbytes` packages are built with the following c | **OS** | **oneAPI Toolkit** | **Kernel Implementation** | |--------------------|------------------|----------------------| | **Linux x86-64** | 2025.1.3 | SYCL + Triton | -| **Windows x86-64** | N/A | SYCL | +| **Windows x86-64** | 2025.1.3 | SYCL + Triton | The Linux build has a minimum glibc version of 2.34. @@ -197,12 +191,11 @@ pip install -e . The currently distributed preview `bitsandbytes` are built with the following configurations: | **OS** | **ROCm** | **Targets** -|--------------------|----------|---------------------------| -| **Linux x86-64** | 6.1.2 | gfx90a / gfx942 / gfx1100 -| **Linux x86-64** | 6.2.4 | gfx90a / gfx942 / gfx1100 -| **Linux x86-64** | 6.3.4 | gfx90a / gfx942 / gfx1100 -| **Linux x86-64** | 6.4.4 | gfx90a / gfx942 / gfx1100 -| **Linux x86-64** | 7.0.0 | gfx90a / gfx942 / gfx1100 +|--------------------|----------|---------------------------------------------------------------------| +| **Linux x86-64** | 6.2.4 | CDNA: gfx90a, gfx942 / RDNA: gfx1100, gfx1101 +| **Linux x86-64** | 6.3.4 | CDNA: gfx90a, gfx942 / RDNA: gfx1100, gfx1101 +| **Linux x86-64** | 6.4.4 | CDNA: gfx90a, gfx942 / RDNA: gfx1100, gfx1101, gfx1200, gfx1201 +| **Linux x86-64** | 7.0.2 | CDNA: gfx90a, gfx942, gfx950 / RDNA: gfx1100, gfx1101, gfx1200, gfx1201 **Windows is not currently supported.**