Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion qdp/qdp-core/src/gpu/encodings/amplitude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,28 @@ impl AmplitudeEncoder {
device: &Arc<CudaDevice>,
input_ptr: *const f32,
len: usize,
) -> Result<f32> {
unsafe {
Self::calculate_inv_norm_gpu_f32_with_stream(
device,
input_ptr,
len,
std::ptr::null_mut(),
)
}
}

/// Compute inverse L2 norm on GPU for float32 input on a given stream.
///
/// # Safety
/// The caller must ensure `input_ptr` points to valid GPU memory containing
/// at least `len` f32 elements on the same device as `device`.
#[cfg(target_os = "linux")]
pub unsafe fn calculate_inv_norm_gpu_f32_with_stream(
device: &Arc<CudaDevice>,
input_ptr: *const f32,
len: usize,
stream: *mut c_void,
) -> Result<f32> {
crate::profile_scope!("GPU::NormSingleF32");

Expand All @@ -522,7 +544,7 @@ impl AmplitudeEncoder {
input_ptr,
len,
*norm_buffer.device_ptr_mut() as *mut f32,
std::ptr::null_mut(), // default stream
stream,
)
};

Expand All @@ -534,6 +556,8 @@ impl AmplitudeEncoder {
)));
}

sync_cuda_stream(stream, "Norm stream synchronize failed (f32)")?;

let inv_norm_host = device
.dtoh_sync_copy(&norm_buffer)
.map_err(|e| MahoutError::Cuda(format!("Failed to copy f32 norm to host: {:?}", e)))?;
Expand Down
191 changes: 191 additions & 0 deletions qdp/qdp-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,63 @@ pub use pipeline_runner::{
run_throughput_pipeline,
};

use std::ffi::c_void;
use std::sync::Arc;

use crate::dlpack::DLManagedTensor;
use crate::gpu::get_encoder;
use cudarc::driver::CudaDevice;

#[cfg(target_os = "linux")]
fn validate_cuda_input_ptr(device: &CudaDevice, ptr: *const c_void) -> Result<()> {
    use crate::gpu::cuda_ffi::{
        CUDA_MEMORY_TYPE_DEVICE, CUDA_MEMORY_TYPE_MANAGED, CudaPointerAttributes,
        cudaPointerGetAttributes,
    };

    // A null pointer can never be valid device memory; reject before asking CUDA.
    if ptr.is_null() {
        return Err(MahoutError::InvalidInput(
            "Input GPU pointer is null".to_string(),
        ));
    }

    // Zero-initialized attribute record for the runtime to fill in.
    let mut attributes = CudaPointerAttributes {
        memory_type: 0,
        device: 0,
        device_pointer: std::ptr::null_mut(),
        host_pointer: std::ptr::null_mut(),
        is_managed: 0,
        allocation_flags: 0,
    };

    // SAFETY: `attributes` is a live, writable struct matching the FFI layout,
    // and `ptr` is non-null (checked above); the runtime only inspects it.
    let status = unsafe { cudaPointerGetAttributes(&mut attributes as *mut _, ptr) };
    if status != 0 {
        return Err(MahoutError::InvalidInput(format!(
            "cudaPointerGetAttributes failed for input pointer: {} ({})",
            status,
            cuda_error_to_string(status)
        )));
    }

    // Only plain device allocations and managed (unified) memory are acceptable;
    // host or unregistered pointers would fault in the kernels.
    let device_mem = attributes.memory_type == CUDA_MEMORY_TYPE_DEVICE;
    let managed_mem = attributes.memory_type == CUDA_MEMORY_TYPE_MANAGED;
    if !device_mem && !managed_mem {
        return Err(MahoutError::InvalidInput(format!(
            "Input pointer is not CUDA device memory (memory_type={})",
            attributes.memory_type
        )));
    }

    // When the runtime reports a concrete device ordinal, it must match the
    // engine's device; a negative ordinal carries no placement information.
    let engine_ordinal = device.ordinal() as i32;
    if attributes.device >= 0 && attributes.device != engine_ordinal {
        return Err(MahoutError::InvalidInput(format!(
            "Input pointer device mismatch: pointer on cuda:{}, engine on cuda:{}",
            attributes.device, engine_ordinal
        )));
    }

    Ok(())
}

/// Main entry point for Mahout QDP
///
/// Manages GPU context and dispatches encoding tasks.
Expand Down Expand Up @@ -418,6 +469,14 @@ impl QdpEngine {
) -> Result<*mut DLManagedTensor> {
crate::profile_scope!("Mahout::EncodeFromGpuPtr");

if input_len == 0 {
return Err(MahoutError::InvalidInput(
"Input data cannot be empty".into(),
));
}

validate_cuda_input_ptr(&self.device, input_d)?;

let state_len = 1usize << num_qubits;
let method = encoding_method.to_ascii_lowercase();

Expand Down Expand Up @@ -600,6 +659,130 @@ impl QdpEngine {
}
}

/// Encode from existing GPU pointer (float32 input, amplitude encoding only)
///
/// Zero-copy encoding from PyTorch CUDA float32 tensors. Uses the default CUDA stream.
/// For stream interop use `encode_from_gpu_ptr_f32_with_stream`.
///
/// # Arguments
/// * `input_d` - Device pointer to input data (f32 array on GPU)
/// * `input_len` - Number of f32 elements in the input
/// * `num_qubits` - Number of qubits for encoding
///
/// # Returns
/// DLPack pointer (state vector in engine precision) for zero-copy PyTorch integration.
/// Internal computation is f32; output is converted to [`Precision`] of the engine.
///
/// # Safety
/// The input pointer must:
/// - Point to valid GPU memory on the same device as the engine
/// - Contain at least `input_len` f32 elements
/// - Remain valid for the duration of this call
#[cfg(target_os = "linux")]
pub unsafe fn encode_from_gpu_ptr_f32(
&self,
input_d: *const f32,
input_len: usize,
num_qubits: usize,
) -> Result<*mut DLManagedTensor> {
unsafe {
self.encode_from_gpu_ptr_f32_with_stream(
input_d,
input_len,
num_qubits,
std::ptr::null_mut(),
)
}
}

/// Encode from existing GPU pointer (float32) on a specified CUDA stream.
///
/// # Arguments
/// * `input_d` - Device pointer to input data (f32 array on GPU)
/// * `input_len` - Number of f32 elements in the input
/// * `num_qubits` - Number of qubits for encoding
/// * `stream` - CUDA stream to launch on; pass null to use the default stream
///
/// # Returns
/// DLPack pointer (state vector in engine precision). Pass null for `stream` to use the default stream.
///
/// # Errors
/// Returns `MahoutError::InvalidInput` for an empty input, an invalid device
/// pointer, a `num_qubits` too large to represent `2^num_qubits` in `usize`,
/// or an input longer than the state vector; `MahoutError::KernelLaunch` on
/// CUDA kernel failure.
///
/// # Safety
/// In addition to the `encode_from_gpu_ptr_f32` requirements, the stream pointer
/// must remain valid for the duration of this call.
#[cfg(target_os = "linux")]
pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
    &self,
    input_d: *const f32,
    input_len: usize,
    num_qubits: usize,
    stream: *mut c_void,
) -> Result<*mut DLManagedTensor> {
    crate::profile_scope!("Mahout::EncodeFromGpuPtrF32");

    if input_len == 0 {
        return Err(MahoutError::InvalidInput(
            "Input data cannot be empty".into(),
        ));
    }

    validate_cuda_input_ptr(&self.device, input_d as *const c_void)?;

    // Guard the shift: `1usize << num_qubits` panics in debug builds and
    // silently wraps in release builds when num_qubits >= usize::BITS.
    let state_len = 1usize.checked_shl(num_qubits as u32).ok_or_else(|| {
        MahoutError::InvalidInput(format!(
            "num_qubits {} is too large: state vector length 2^{} overflows usize",
            num_qubits, num_qubits
        ))
    })?;
    if input_len > state_len {
        return Err(MahoutError::InvalidInput(format!(
            "Input size {} exceeds state vector size {} (2^{} qubits)",
            input_len, state_len, num_qubits
        )));
    }

    // Allocate the output state vector in f32; precision conversion happens last.
    let state_vector = {
        crate::profile_scope!("GPU::Alloc");
        gpu::GpuStateVector::new(&self.device, num_qubits, Precision::Float32)?
    };

    // Compute 1/||x|| on the requested stream so the kernel can normalize in-place.
    let inv_norm = {
        crate::profile_scope!("GPU::NormFromPtr");
        unsafe {
            gpu::AmplitudeEncoder::calculate_inv_norm_gpu_f32_with_stream(
                &self.device,
                input_d,
                input_len,
                stream,
            )?
        }
    };

    let state_ptr = state_vector.ptr_f32().ok_or_else(|| {
        MahoutError::InvalidInput(
            "State vector precision mismatch (expected float32 buffer)".to_string(),
        )
    })?;

    {
        crate::profile_scope!("GPU::KernelLaunch");
        // SAFETY: input_d was validated above; state_ptr covers state_len f32
        // complex amplitudes allocated on the engine's device.
        let ret = unsafe {
            qdp_kernels::launch_amplitude_encode_f32(
                input_d,
                state_ptr as *mut std::ffi::c_void,
                input_len,
                state_len,
                inv_norm,
                stream,
            )
        };

        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Amplitude encode (f32) kernel failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
    }

    // Ensure the kernel has finished before handing the buffer to the caller.
    {
        crate::profile_scope!("GPU::Synchronize");
        gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
    }

    // Convert to the engine's configured precision and export via DLPack.
    let state_vector = state_vector.to_precision(&self.device, self.precision)?;
    Ok(state_vector.to_dlpack())
}

/// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
///
/// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
Expand Down Expand Up @@ -671,6 +854,14 @@ impl QdpEngine {
));
}

if sample_size == 0 {
return Err(MahoutError::InvalidInput(
"Sample size cannot be zero".into(),
));
}

validate_cuda_input_ptr(&self.device, input_batch_d)?;

match method.as_str() {
"amplitude" => {
if sample_size == 0 {
Expand Down
8 changes: 7 additions & 1 deletion qdp/qdp-core/tests/common/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.

/// Creates normalized test data (f64)
///
/// Produces `size` evenly spaced values `i / size` in `[0, 1)`.
#[allow(dead_code)] // Used by multiple test modules
pub fn create_test_data(size: usize) -> Vec<f64> {
    let denom = size as f64;
    let mut data = Vec::with_capacity(size);
    for i in 0..size {
        data.push(i as f64 / denom);
    }
    data
}

/// Creates normalized test data (f32)
///
/// Produces `size` evenly spaced values `i / size` in `[0, 1)`.
#[allow(dead_code)]
pub fn create_test_data_f32(size: usize) -> Vec<f32> {
    let denom = size as f32;
    let mut out = vec![0.0f32; size];
    for (i, slot) in out.iter_mut().enumerate() {
        *slot = i as f32 / denom;
    }
    out
}
Loading