apache · corepointer · Apr 15, 2021
diff --git a/src/main/cuda/headers/Matrix.h b/src/main/cuda/headers/Matrix.h
@@ -26,19 +26,19 @@ using int32_t = int;
 
 template <typename T>
 struct Matrix {
-	int32_t nnz;
+	uint64_t nnz;
 	uint32_t rows;
 	uint32_t cols;
-
 	uint32_t* row_ptr;
 	uint32_t* col_idx;
 	T* data;
 
 	typedef T value_type;
 
-	explicit Matrix(size_t* jvals) : nnz(jvals[0]), rows(jvals[1]), cols(jvals[2]),
-			row_ptr(reinterpret_cast<uint32_t*>(jvals[3])),
-			col_idx(reinterpret_cast<uint32_t*>((jvals[4]))), data(reinterpret_cast<T*>(jvals[5])) {}
+	explicit Matrix(uint8_t* jvals) : nnz(*reinterpret_cast<uint32_t*>(&jvals[0])),
+		rows(*reinterpret_cast<uint32_t*>(&jvals[8])), cols(*reinterpret_cast<uint32_t*>(&jvals[12])),
+			row_ptr(reinterpret_cast<uint32_t*>(jvals[16])), col_idx(reinterpret_cast<uint32_t*>((jvals[24]))),
+				data(static_cast<T*>(jvals[32])) {}
 };
 
 #ifdef __CUDACC__
@@ -72,7 +72,7 @@ class MatrixAccessor {
 
 	__device__ void init(Matrix<T>* mat) { _mat = mat; }
 
-	__device__ uint32_t& nnz() { return _mat->nnz; }
+	__device__ uint32_t& nnz() { return return _mat->row_ptr == nullptr ? _mat->rows * _mat->cols : _mat->nnz; }
 	__device__ uint32_t cols() { return _mat->cols; }
 	__device__ uint32_t rows() { return _mat->rows; }
 
@@ -133,7 +133,7 @@ class MatrixAccessor {
 
 	//ToDo sparse accessors
 	__device__ uint32_t len_sparse() {
-		return _mat->nnz;
+		return _mat->row_ptr[_mat->rows];
 	}
 
 	__device__ uint32_t pos_sparse(uint32_t rix) {
@@ -227,34 +227,6 @@ class RingBuffer {
 	}
 };
 
-template <typename T, int NUM_B>
-struct SpoofOp {
-	MatrixAccessor<T> a;
-	MatrixAccessor<T> b[NUM_B];
-	MatrixAccessor<T> c;
-	T* scalars;
-	uint32_t grix;
-	T* avals;
-	uint32_t* aix;
-	uint32_t alen;
-
-	SpoofOp(Matrix<T>* A, Matrix<T>* B, Matrix<T>* C, T* scalars, T* tmp_stor, uint32_t grix) :
-			scalars(scalars), grix(grix), avals(A->data), aix(A->col_idx) {
-		a.init(A);
-		c.init(C);
-		alen = a.row_len(grix);
-
-		if(B)
-			for(auto i = 0; i < NUM_B; ++i)
-				b[i].init(&(B[i]));
-	}
-
-//	__device__ Vector<T>& getTempStorage(uint32_t len) {
-//		Vector<T>& vec = temp_rb.next();
-//		tvec.length = len;
-//		return vec;
-//	}
-};
 #endif // __CUDACC_RTC__
 
 #endif //SYSTEMDS_MATRIX_H
diff --git a/src/main/cuda/headers/reduction.cuh b/src/main/cuda/headers/reduction.cuh
@@ -22,6 +22,7 @@
 #define REDUCTION_CUH
 
 using uint = unsigned int;
+
 #include <cuda_runtime.h>
 
 #include "utils.cuh"
@@ -51,7 +52,9 @@ using uint = unsigned int;
  * @param SpoofCellwiseOp		initial value for the reduction variable
  */
 template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
-__device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op, SpoofCellwiseOp spoof_op) {
+__device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op,
+	SpoofCellwiseOp spoof_op)
+{
 	auto sdata = shared_memory_proxy<T>();
 
 	// perform first level of reduction,
@@ -66,12 +69,9 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
 	// number of active thread blocks (via gridDim).  More blocks will result
 	// in a larger gridSize and therefore fewer elements per thread
 	while (i < N) {
-//		printf("tid=%d i=%d N=%d, in->cols()=%d rix=%d\n", threadIdx.x, i, N, in->cols(), i/in->cols());
 		v = reduction_op(v, spoof_op(*(in->vals(i)), i, i / in->cols(), i % in->cols()));
 
 		if (i + blockDim.x < N)	{
-			//__syncthreads();
-			//printf("loop fetch i(%d)+blockDim.x(%d)=%d, in=%f\n",i, blockDim.x, i + blockDim.x, g_idata[i + blockDim.x]);
 			v = reduction_op(v, spoof_op(*(in->vals(i+blockDim.x)), blockDim.x + i, (i+blockDim.x) / in->cols(), (i+blockDim.x) % in->cols()));
 		}
 
@@ -116,40 +116,25 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
 		if (blockDim.x >= 64) {
 			smem[tid] = v = reduction_op(v, smem[tid + 32]);
 		}
-//		if(tid<12)
-//			printf("bid=%d tid=%d reduction result: %3.1f\n", blockIdx.x, tid, sdata[tid]);
-
 		if (blockDim.x >= 32) {
 			smem[tid] = v = reduction_op(v, smem[tid + 16]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 		if (blockDim.x >= 16) {
 			smem[tid] = v = reduction_op(v, smem[tid + 8]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 		if (blockDim.x >= 8) {
 			smem[tid] = v = reduction_op(v, smem[tid + 4]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 		if (blockDim.x >= 4) {
 			smem[tid] = v = reduction_op(v, smem[tid + 2]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 		if (blockDim.x >= 2) {
 			smem[tid] = v = reduction_op(v, smem[tid + 1]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 	}
 
 	 // write result for this block to global mem
 	 if (tid == 0) {
-//	 	if(gridDim.x < 10)
-//	 		printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 	 	out->val(0, blockIdx.x) = sdata[0];
 	 }
 }
@@ -174,19 +159,10 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
  * the value before writing it to its final location in global memory for each
  * row
  */
-//template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
-//__device__ void ROW_AGG(
-//		T *g_idata, ///< input data stored in device memory (of size rows*cols)
-//		T *g_odata,  ///< output/temporary array store in device memory (of size
-//		/// rows*cols)
-//		uint rows,  ///< rows in input and temporary/output arrays
-//		uint cols,  ///< columns in input and temporary/output arrays
-//		T initialValue,  ///< initial value for the reduction variable
-//		ReductionOp reduction_op, ///< Reduction operation to perform (functor object)
-//		SpoofCellwiseOp spoof_op) ///< Operation to perform before assigning this
 template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
 __device__ void ROW_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT,  ReductionOp reduction_op,
-					   SpoofCellwiseOp spoof_op) {
+	SpoofCellwiseOp spoof_op)
+{
 	auto sdata = shared_memory_proxy<T>();
 
 	// one block per row
@@ -199,7 +175,6 @@ __device__ void ROW_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
 	uint32_t i = tid;
 	uint block_offset = block * in->cols();
 
-//	T v = initialValue;
 	T v = reduction_op.init();
 	while (i < in->cols()) {
 		v = reduction_op(v, spoof_op(in->val(block_offset + i), i, i / in->cols(), i % in->cols()));
@@ -283,16 +258,8 @@ __device__ void ROW_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
  */
 template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
 __device__ void COL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT,  ReductionOp reduction_op,
-						SpoofCellwiseOp spoof_op) {
-//__device__ void COL_AGG(T *g_idata, ///< input data stored in device memory (of size rows*cols)
-//		T *g_odata,  ///< output/temporary array store in device memory (of size rows*cols)
-//		uint rows,  ///< rows in input and temporary/output arrays
-//		uint cols,  ///< columns in input and temporary/output arrays
-//		T initialValue,  ///< initial value for the reduction variable
-//		ReductionOp reduction_op, ///< Reduction operation to perform (functor object)
-//		SpoofCellwiseOp spoof_op) ///< Operation to perform before aggregation
-//
-//{
+	SpoofCellwiseOp spoof_op)
+{
 	uint global_tid = blockIdx.x * blockDim.x + threadIdx.x;
 	if (global_tid >= in->cols()) {
 		return;
@@ -315,13 +282,12 @@ __device__ void NO_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N
 	uint32_t gtid = blockIdx.x * blockDim.x + threadIdx.x;
 	uint32_t first_idx = gtid * static_cast<uint32_t>(VT);
 	uint32_t last_idx = min(first_idx + static_cast<uint32_t>(VT), N);
+
 	#pragma unroll
 	for(auto i = first_idx; i < last_idx; i++) {
 		T a = in->hasData() ? in->vals(0)[i] : 0;
 		T result = spoof_op(a, i, i / in->cols(), i % in->cols());
 		out->vals(0)[i] = result;
-		//if(i < 4)
-		//	printf("tid=%d in=%4.3f res=%4.3f out=%4.3f r=%d\n", i, in->vals(0)[i], result, out->vals(0)[i], i/in->cols());
 	}
 }
 

diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
@@ -77,10 +77,15 @@ size_t SpoofCUDAContext::initialize_cuda(uint32_t device_id, const char* resourc
 	CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_max_d"));
 	ctx->reduction_kernels_d.insert(std::make_pair(std::make_pair(SpoofOperator::AggType::FULL_AGG, SpoofOperator::AggOp::MAX), func));
 
+	CHECK_CUDART(cudaMallocHost(reinterpret_cast<void**>(&(ctx->staging_buffer)), ctx->default_mem_size));
+	CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&(ctx->device_buffer)), ctx->default_mem_size));
+	ctx->current_mem_size = ctx->default_mem_size;
 	return reinterpret_cast<size_t>(ctx);
 }
 
 void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, [[maybe_unused]] uint32_t device_id) {
+	cudaFreeHost(ctx->staging_buffer);
+	cudaFree(ctx->device_buffer);
 	delete ctx;
 	// cuda device is handled by jCuda atm
 	//cudaDeviceReset();
@@ -116,15 +121,25 @@ size_t SpoofCUDAContext::compile(std::unique_ptr<SpoofOperator> op, const std::s
 
 template<typename T>
 CUfunction SpoofCUDAContext::getReductionKernel(const std::pair<SpoofOperator::AggType, SpoofOperator::AggOp> &key) {
-	return nullptr;
+	return nullptr; // generic case never used
 }
+
 template<>
 CUfunction SpoofCUDAContext::getReductionKernel<float>(const std::pair<SpoofOperator::AggType,
 		SpoofOperator::AggOp> &key) {
 	return reduction_kernels_f[key];
 }
+
 template<>
 CUfunction SpoofCUDAContext::getReductionKernel<double>(const std::pair<SpoofOperator::AggType,
 		SpoofOperator::AggOp> &key) {
 	return reduction_kernels_d[key];
-}
+}
+
+void SpoofCUDAContext::resize_staging_buffer(size_t size) {
+	cudaFreeHost(staging_buffer);
+	cudaFree(device_buffer);
+	CHECK_CUDART(cudaMallocHost(reinterpret_cast<void**>(&(staging_buffer)), size));
+	CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&(device_buffer)), size));
+	current_mem_size = size;
+}
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
@@ -46,69 +46,43 @@ class SpoofCUDAContext {
 	std::map<std::pair<SpoofOperator::AggType, SpoofOperator::AggOp>, CUfunction> reduction_kernels_f;
 	std::map<std::pair<SpoofOperator::AggType, SpoofOperator::AggOp>, CUfunction> reduction_kernels_d;
 
-//	double handling_total, compile_total;
-
 	const std::string resource_path;
 	const std::vector<std::string> include_paths;
 
 public:
+	size_t default_mem_size = 1024; // 1kb for hosting data pointers, scalars and some meta info. This default should
+									// not require resizing these buffers in most cases.
+	size_t current_mem_size = 0; // the actual staging buffer size (should be default unless there was a resize)
+	std::byte* staging_buffer{}; // pinned host mem for async transfers
+	std::byte* device_buffer{};  // this buffer holds the pointers to the data buffers
 
 	explicit SpoofCUDAContext(const char* resource_path_, std::vector<std::string>  include_paths_) : reductions(nullptr),
-			resource_path(resource_path_), include_paths(std::move(include_paths_))
-			//,handling_total(0.0), compile_total(0.0)
-			{}
+			resource_path(resource_path_), include_paths(std::move(include_paths_)) { }
 
 	static size_t initialize_cuda(uint32_t device_id, const char* resource_path_);
 
 	static void destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id);
-	
+
 	size_t compile(std::unique_ptr<SpoofOperator> op, const std::string &src);
-	
+
 	template <typename T, typename CALL>
-	int launch(uint32_t opID, std::vector<Matrix<T>>& input, std::vector<Matrix<T>>& sides, Matrix<T>& output,
-			T* scalars, uint32_t grix) {
-		// dp holds in/side/out/scalar pointers for GPU
-		DevMatPtrs<T> dp;
-
-		SpoofOperator* op = compiled_ops[opID].get();
-
-		CHECK_CUDART(cudaMalloc((void **)&dp.in, sizeof(Matrix<T>) * input.size()));
-		CHECK_CUDART(cudaMemcpy(dp.in, reinterpret_cast<void*>(&input[0]), sizeof(Matrix<T>) * input.size(),
-				cudaMemcpyHostToDevice));
-
-		if (!sides.empty()) {
-			CHECK_CUDART(cudaMalloc(reinterpret_cast<void **>(&dp.sides), sizeof(Matrix<T>) * sides.size()));
-			CHECK_CUDART(cudaMemcpy(dp.sides, &sides[0], sizeof(Matrix<T>)  * sides.size(), cudaMemcpyHostToDevice));
-		}
-
-		if (op->isSparseSafe() && input.front().row_ptr != nullptr) {
-			CHECK_CUDART(cudaMemcpy(output.row_ptr, input.front().row_ptr, (input.front().rows+1)*sizeof(uint32_t),
-					cudaMemcpyDeviceToDevice));
-		}
-#ifndef NDEBUG
-		std::cout << "output rows: " << output.rows << " cols: " << output.cols << " nnz: " << output.nnz << " format: " <<
-				(output.row_ptr == nullptr ? "dense" : "sparse") << std::endl;
-#endif
-		size_t out_num_elements = output.rows * output.cols;
-		if(output.row_ptr)
-			if(op->isSparseSafe() && output.nnz > 0)
-				out_num_elements = output.nnz;
-		CHECK_CUDART(cudaMalloc((void **) &dp.out, sizeof(Matrix<T>)));
-		CHECK_CUDART(cudaMemset(output.data, 0, out_num_elements * sizeof(T)));
-		CHECK_CUDART(cudaMemcpy(dp.out, reinterpret_cast<void *>(&output), sizeof(Matrix<T>),
-				cudaMemcpyHostToDevice));
-
-		dp.scalars = scalars;
-
-		CALL::exec(this, op, input, sides, output, grix, dp);
-
+	int launch() {
+
+		DataBufferWrapper dbw(staging_buffer, device_buffer);
+		SpoofOperator* op = compiled_ops[dbw.op_id()].get();
+		dbw.toDevice(op->stream);
+
+		CALL::exec(this, op, &dbw);
+
 		return 0;
 	}
 
 	std::string getOperatorName(uint32_t opID) { return compiled_ops.at(opID)->name; }
 
 	template<typename T>
 	CUfunction getReductionKernel(const std::pair<SpoofOperator::AggType, SpoofOperator::AggOp>& key);
+
+	void resize_staging_buffer(size_t size);
 };
 
 #endif // SPOOFCUDACONTEXT_H