Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion qdp/qdp-core/src/gpu/encodings/amplitude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,28 @@ impl AmplitudeEncoder {
device: &Arc<CudaDevice>,
input_ptr: *const f32,
len: usize,
) -> Result<f32> {
unsafe {
Self::calculate_inv_norm_gpu_f32_with_stream(
device,
input_ptr,
len,
std::ptr::null_mut(),
)
}
}

/// Compute inverse L2 norm on GPU for float32 input on a given stream.
///
/// # Safety
/// The caller must ensure `input_ptr` points to valid GPU memory containing
/// at least `len` f32 elements on the same device as `device`.
#[cfg(target_os = "linux")]
pub unsafe fn calculate_inv_norm_gpu_f32_with_stream(
device: &Arc<CudaDevice>,
input_ptr: *const f32,
len: usize,
stream: *mut c_void,
) -> Result<f32> {
crate::profile_scope!("GPU::NormSingleF32");

Expand All @@ -522,7 +544,7 @@ impl AmplitudeEncoder {
input_ptr,
len,
*norm_buffer.device_ptr_mut() as *mut f32,
std::ptr::null_mut(), // default stream
stream,
)
};

Expand All @@ -534,6 +556,8 @@ impl AmplitudeEncoder {
)));
}

sync_cuda_stream(stream, "Norm stream synchronize failed (f32)")?;

let inv_norm_host = device
.dtoh_sync_copy(&norm_buffer)
.map_err(|e| MahoutError::Cuda(format!("Failed to copy f32 norm to host: {:?}", e)))?;
Expand Down
191 changes: 191 additions & 0 deletions qdp/qdp-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,63 @@ pub use pipeline_runner::{
run_throughput_pipeline,
};

use std::ffi::c_void;
use std::sync::Arc;

use crate::dlpack::DLManagedTensor;
use crate::gpu::get_encoder;
use cudarc::driver::CudaDevice;

#[cfg(target_os = "linux")]
fn validate_cuda_input_ptr(device: &CudaDevice, ptr: *const c_void) -> Result<()> {
    use crate::gpu::cuda_ffi::{
        CUDA_MEMORY_TYPE_DEVICE, CUDA_MEMORY_TYPE_MANAGED, CudaPointerAttributes,
        cudaPointerGetAttributes,
    };

    // A null pointer can never be valid device memory; reject before asking CUDA.
    if ptr.is_null() {
        return Err(MahoutError::InvalidInput(
            "Input GPU pointer is null".to_string(),
        ));
    }

    // Zero-initialized attribute record for the runtime to fill in.
    let mut attributes = CudaPointerAttributes {
        memory_type: 0,
        device: 0,
        device_pointer: std::ptr::null_mut(),
        host_pointer: std::ptr::null_mut(),
        is_managed: 0,
        allocation_flags: 0,
    };

    // SAFETY: `attributes` is a live, writable struct matching the FFI layout,
    // and `ptr` is non-null (checked above); the runtime only inspects it.
    let status = unsafe { cudaPointerGetAttributes(&mut attributes as *mut _, ptr) };
    if status != 0 {
        return Err(MahoutError::InvalidInput(format!(
            "cudaPointerGetAttributes failed for input pointer: {} ({})",
            status,
            cuda_error_to_string(status)
        )));
    }

    // Only plain device allocations and managed (unified) memory are acceptable;
    // host or unregistered pointers would fault in the kernels.
    let device_mem = attributes.memory_type == CUDA_MEMORY_TYPE_DEVICE;
    let managed_mem = attributes.memory_type == CUDA_MEMORY_TYPE_MANAGED;
    if !device_mem && !managed_mem {
        return Err(MahoutError::InvalidInput(format!(
            "Input pointer is not CUDA device memory (memory_type={})",
            attributes.memory_type
        )));
    }

    // When the runtime reports a concrete device ordinal, it must match the
    // engine's device; a negative ordinal carries no placement information.
    let engine_ordinal = device.ordinal() as i32;
    if attributes.device >= 0 && attributes.device != engine_ordinal {
        return Err(MahoutError::InvalidInput(format!(
            "Input pointer device mismatch: pointer on cuda:{}, engine on cuda:{}",
            attributes.device, engine_ordinal
        )));
    }

    Ok(())
}

/// Main entry point for Mahout QDP
///
/// Manages GPU context and dispatches encoding tasks.
Expand Down Expand Up @@ -418,6 +469,14 @@ impl QdpEngine {
) -> Result<*mut DLManagedTensor> {
crate::profile_scope!("Mahout::EncodeFromGpuPtr");

if input_len == 0 {
return Err(MahoutError::InvalidInput(
"Input data cannot be empty".into(),
));
}

validate_cuda_input_ptr(&self.device, input_d)?;

let state_len = 1usize << num_qubits;
let method = encoding_method.to_ascii_lowercase();

Expand Down Expand Up @@ -600,6 +659,130 @@ impl QdpEngine {
}
}

/// Encode from existing GPU pointer (float32 input, amplitude encoding only)
///
/// Zero-copy encoding from PyTorch CUDA float32 tensors. Uses the default CUDA stream.
/// For stream interop use `encode_from_gpu_ptr_f32_with_stream`.
///
/// # Arguments
/// * `input_d` - Device pointer to input data (f32 array on GPU)
/// * `input_len` - Number of f32 elements in the input
/// * `num_qubits` - Number of qubits for encoding
///
/// # Returns
/// DLPack pointer (state vector in engine precision) for zero-copy PyTorch integration.
/// Internal computation is f32; output is converted to [`Precision`] of the engine.
///
/// # Safety
/// The input pointer must:
/// - Point to valid GPU memory on the same device as the engine
/// - Contain at least `input_len` f32 elements
/// - Remain valid for the duration of this call
#[cfg(target_os = "linux")]
pub unsafe fn encode_from_gpu_ptr_f32(
&self,
input_d: *const f32,
input_len: usize,
num_qubits: usize,
) -> Result<*mut DLManagedTensor> {
unsafe {
self.encode_from_gpu_ptr_f32_with_stream(
input_d,
input_len,
num_qubits,
std::ptr::null_mut(),
)
}
}

/// Encode from existing GPU pointer (float32) on a specified CUDA stream.
///
/// # Arguments
/// * `input_d` - Device pointer to input data (f32 array on GPU)
/// * `input_len` - Number of f32 elements in the input
/// * `num_qubits` - Number of qubits for encoding
/// * `stream` - CUDA stream to launch on; pass null to use the default stream
///
/// # Returns
/// DLPack pointer (state vector in engine precision). Pass null for `stream` to use the default stream.
///
/// # Errors
/// Returns `MahoutError::InvalidInput` for an empty input, an invalid device
/// pointer, a `num_qubits` too large to represent `2^num_qubits` in `usize`,
/// or an input longer than the state vector; `MahoutError::KernelLaunch` on
/// CUDA kernel failure.
///
/// # Safety
/// In addition to the `encode_from_gpu_ptr_f32` requirements, the stream pointer
/// must remain valid for the duration of this call.
#[cfg(target_os = "linux")]
pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
    &self,
    input_d: *const f32,
    input_len: usize,
    num_qubits: usize,
    stream: *mut c_void,
) -> Result<*mut DLManagedTensor> {
    crate::profile_scope!("Mahout::EncodeFromGpuPtrF32");

    if input_len == 0 {
        return Err(MahoutError::InvalidInput(
            "Input data cannot be empty".into(),
        ));
    }

    validate_cuda_input_ptr(&self.device, input_d as *const c_void)?;

    // Guard the shift: `1usize << num_qubits` panics in debug builds and
    // silently wraps in release builds when num_qubits >= usize::BITS.
    let state_len = 1usize.checked_shl(num_qubits as u32).ok_or_else(|| {
        MahoutError::InvalidInput(format!(
            "num_qubits {} is too large: state vector length 2^{} overflows usize",
            num_qubits, num_qubits
        ))
    })?;
    if input_len > state_len {
        return Err(MahoutError::InvalidInput(format!(
            "Input size {} exceeds state vector size {} (2^{} qubits)",
            input_len, state_len, num_qubits
        )));
    }

    // Allocate the output state vector in f32; precision conversion happens last.
    let state_vector = {
        crate::profile_scope!("GPU::Alloc");
        gpu::GpuStateVector::new(&self.device, num_qubits, Precision::Float32)?
    };

    // Compute 1/||x|| on the requested stream so the kernel can normalize in-place.
    let inv_norm = {
        crate::profile_scope!("GPU::NormFromPtr");
        unsafe {
            gpu::AmplitudeEncoder::calculate_inv_norm_gpu_f32_with_stream(
                &self.device,
                input_d,
                input_len,
                stream,
            )?
        }
    };

    let state_ptr = state_vector.ptr_f32().ok_or_else(|| {
        MahoutError::InvalidInput(
            "State vector precision mismatch (expected float32 buffer)".to_string(),
        )
    })?;

    {
        crate::profile_scope!("GPU::KernelLaunch");
        // SAFETY: input_d was validated above; state_ptr covers state_len f32
        // complex amplitudes allocated on the engine's device.
        let ret = unsafe {
            qdp_kernels::launch_amplitude_encode_f32(
                input_d,
                state_ptr as *mut std::ffi::c_void,
                input_len,
                state_len,
                inv_norm,
                stream,
            )
        };

        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Amplitude encode (f32) kernel failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
    }

    // Ensure the kernel has finished before handing the buffer to the caller.
    {
        crate::profile_scope!("GPU::Synchronize");
        gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
    }

    // Convert to the engine's configured precision and export via DLPack.
    let state_vector = state_vector.to_precision(&self.device, self.precision)?;
    Ok(state_vector.to_dlpack())
}

/// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
///
/// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
Expand Down Expand Up @@ -671,6 +854,14 @@ impl QdpEngine {
));
}

if sample_size == 0 {
return Err(MahoutError::InvalidInput(
"Sample size cannot be zero".into(),
));
}

validate_cuda_input_ptr(&self.device, input_batch_d)?;

match method.as_str() {
"amplitude" => {
if sample_size == 0 {
Expand Down
8 changes: 7 additions & 1 deletion qdp/qdp-core/tests/common/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.

/// Creates normalized test data (f64)
///
/// Produces `size` evenly spaced values `i / size` in `[0, 1)`.
#[allow(dead_code)] // Used by multiple test modules
pub fn create_test_data(size: usize) -> Vec<f64> {
    let denom = size as f64;
    let mut data = Vec::with_capacity(size);
    for i in 0..size {
        data.push(i as f64 / denom);
    }
    data
}

/// Creates normalized test data (f32)
///
/// Produces `size` evenly spaced values `i / size` in `[0, 1)`.
#[allow(dead_code)]
pub fn create_test_data_f32(size: usize) -> Vec<f32> {
    let denom = size as f32;
    let mut out = vec![0.0f32; size];
    for (i, slot) in out.iter_mut().enumerate() {
        *slot = i as f32 / denom;
    }
    out
}
Loading