From 16bdb3ffcb0324e4ac83f7a6f4cfd2ba38755fd6 Mon Sep 17 00:00:00 2001
From: Mark Dokter <mark@dokter.cc>
Date: Thu, 15 Apr 2021 15:50:02 +0200
Subject: [PATCH] [SYSTEMDS-3024] Improve performance by batching data
 descriptor transfers

The SPOOF CUDA operators issue several small cudaMemcpy() calls per operator execution. Transferring all descriptor data in a single copy reduces this overhead. In addition, using asynchronous copies improves performance further and is a first step towards more asynchronous execution of the GPU operations.
---
 src/main/cuda/headers/Matrix.h                |  42 +---
 src/main/cuda/headers/reduction.cuh           |  52 +----
 .../cuda/spoof-launcher/SpoofCUDAContext.cpp  |  19 +-
 .../cuda/spoof-launcher/SpoofCUDAContext.h    |  62 ++----
 src/main/cuda/spoof-launcher/SpoofCellwise.h  |  94 ++++----
 src/main/cuda/spoof-launcher/SpoofOperator.h  | 128 +++++++----
 src/main/cuda/spoof-launcher/SpoofRowwise.h   |  38 ++--
 src/main/cuda/spoof-launcher/jni_bridge.cpp   | 177 +++++++--------
 src/main/cuda/spoof-launcher/jni_bridge.h     |  57 ++---
 src/main/cuda/spoof/cellwise.cu               |   3 +-
 src/main/cuda/spoof/rowwise.cu                |   2 +
 .../sysds/hops/codegen/cplan/CNodeCell.java   |  15 +-
 .../sysds/hops/codegen/cplan/cuda/Unary.java  |   4 +-
 .../runtime/codegen/SpoofCUDACellwise.java    | 116 ++++------
 .../runtime/codegen/SpoofCUDAOperator.java    | 205 ++++++++----------
 .../runtime/codegen/SpoofCUDARowwise.java     |  80 +++----
 .../sysds/runtime/codegen/SpoofOperator.java  |   3 +
 .../gpu/SpoofCUDAInstruction.java             |  21 +-
 .../instructions/gpu/context/GPUObject.java   |   3 +-
 19 files changed, 505 insertions(+), 616 deletions(-)
diff --git a/src/main/cuda/headers/Matrix.h b/src/main/cuda/headers/Matrix.h
index 0446983d6bd..61ef939b837 100644
--- a/src/main/cuda/headers/Matrix.h
+++ b/src/main/cuda/headers/Matrix.h
@@ -26,19 +26,19 @@ using int32_t = int;
 
 template <typename T>
 struct Matrix {
-	int32_t nnz;
+	uint64_t nnz;
 	uint32_t rows;
 	uint32_t cols;
-	
 	uint32_t* row_ptr;
 	uint32_t* col_idx;
 	T* data;
 	
 	typedef T value_type;
 	
-	explicit Matrix(size_t* jvals) : nnz(jvals[0]), rows(jvals[1]), cols(jvals[2]),
-			row_ptr(reinterpret_cast<uint32_t*>(jvals[3])),
-			col_idx(reinterpret_cast<uint32_t*>((jvals[4]))), data(reinterpret_cast<T*>(jvals[5])) {}
+	explicit Matrix(uint8_t* jvals) : nnz(*reinterpret_cast<uint64_t*>(&jvals[0])), // fields at byte offsets 0/8/12/16/24/32
+		rows(*reinterpret_cast<uint32_t*>(&jvals[8])), cols(*reinterpret_cast<uint32_t*>(&jvals[12])),
+			row_ptr(*reinterpret_cast<uint32_t**>(&jvals[16])), col_idx(*reinterpret_cast<uint32_t**>(&jvals[24])),
+				data(*reinterpret_cast<T**>(&jvals[32])) {} // pointers are loaded from the buffer, not built from a single byte value
 };
 
 #ifdef __CUDACC__
@@ -72,7 +72,7 @@ class MatrixAccessor {
 	
 	__device__ void init(Matrix<T>* mat) { _mat = mat; }
 	
-	__device__ uint32_t& nnz() { return _mat->nnz; }
+	__device__ uint32_t nnz() { return _mat->row_ptr == nullptr ? _mat->rows * _mat->cols : static_cast<uint32_t>(_mat->nnz); } // no row_ptr: rows*cols, returned by value
 	__device__ uint32_t cols() { return _mat->cols; }
 	__device__ uint32_t rows() { return _mat->rows; }
 	
@@ -133,7 +133,7 @@ class MatrixAccessor {
 	
 	//ToDo sparse accessors
 	__device__ uint32_t len_sparse() {
-		return _mat->nnz;
+		return _mat->row_ptr[_mat->rows];
 	}
 	
 	__device__ uint32_t pos_sparse(uint32_t rix) {
@@ -227,34 +227,6 @@ class RingBuffer {
 	}
 };
 
-template <typename T, int NUM_B>
-struct SpoofOp {
-	MatrixAccessor<T> a;
-	MatrixAccessor<T> b[NUM_B];
-	MatrixAccessor<T> c;
-	T* scalars;
-	uint32_t grix;
-	T* avals;
-	uint32_t* aix;
-	uint32_t alen;
-	
-	SpoofOp(Matrix<T>* A, Matrix<T>* B, Matrix<T>* C, T* scalars, T* tmp_stor, uint32_t grix) :
-			scalars(scalars), grix(grix), avals(A->data), aix(A->col_idx) {
-		a.init(A);
-		c.init(C);
-		alen = a.row_len(grix);
-
-		if(B)
-			for(auto i = 0; i < NUM_B; ++i)
-				b[i].init(&(B[i]));
-	}
-	
-//	__device__ Vector<T>& getTempStorage(uint32_t len) {
-//		Vector<T>& vec = temp_rb.next();
-//		tvec.length = len;
-//		return vec;
-//	}
-};
 #endif // __CUDACC_RTC__
 
 #endif //SYSTEMDS_MATRIX_H
diff --git a/src/main/cuda/headers/reduction.cuh b/src/main/cuda/headers/reduction.cuh
index 3cd0a0b3946..7707fe2a40c 100644
--- a/src/main/cuda/headers/reduction.cuh
+++ b/src/main/cuda/headers/reduction.cuh
@@ -22,6 +22,7 @@
 #define REDUCTION_CUH
 
 using uint = unsigned int;
+
 #include <cuda_runtime.h>
 
 #include "utils.cuh"
@@ -51,7 +52,9 @@ using uint = unsigned int;
  * @param SpoofCellwiseOp		initial value for the reduction variable
  */
 template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
-__device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op, SpoofCellwiseOp spoof_op) {
+__device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op,
+	SpoofCellwiseOp spoof_op)
+{
 	auto sdata = shared_memory_proxy<T>();
 
 	// perform first level of reduction,
@@ -66,12 +69,9 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
 	// number of active thread blocks (via gridDim).  More blocks will result
 	// in a larger gridSize and therefore fewer elements per thread
 	while (i < N) {
-//		printf("tid=%d i=%d N=%d, in->cols()=%d rix=%d\n", threadIdx.x, i, N, in->cols(), i/in->cols());
 		v = reduction_op(v, spoof_op(*(in->vals(i)), i, i / in->cols(), i % in->cols()));
 
 		if (i + blockDim.x < N)	{
-			//__syncthreads();
-			//printf("loop fetch i(%d)+blockDim.x(%d)=%d, in=%f\n",i, blockDim.x, i + blockDim.x, g_idata[i + blockDim.x]);
 			v = reduction_op(v, spoof_op(*(in->vals(i+blockDim.x)), blockDim.x + i, (i+blockDim.x) / in->cols(), (i+blockDim.x) % in->cols()));
 		}
 
@@ -116,40 +116,25 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
 		if (blockDim.x >= 64) {
 			smem[tid] = v = reduction_op(v, smem[tid + 32]);
 		}
-//		if(tid<12)
-//			printf("bid=%d tid=%d reduction result: %3.1f\n", blockIdx.x, tid, sdata[tid]);
-		
 		if (blockDim.x >= 32) {
 			smem[tid] = v = reduction_op(v, smem[tid + 16]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 		if (blockDim.x >= 16) {
 			smem[tid] = v = reduction_op(v, smem[tid + 8]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 		if (blockDim.x >= 8) {
 			smem[tid] = v = reduction_op(v, smem[tid + 4]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 		if (blockDim.x >= 4) {
 			smem[tid] = v = reduction_op(v, smem[tid + 2]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 		if (blockDim.x >= 2) {
 			smem[tid] = v = reduction_op(v, smem[tid + 1]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 	}
 
 	 // write result for this block to global mem
 	 if (tid == 0) {
-//	 	if(gridDim.x < 10)
-//	 		printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 	 	out->val(0, blockIdx.x) = sdata[0];
 	 }
 }
@@ -174,19 +159,10 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
  * the value before writing it to its final location in global memory for each
  * row
  */
-//template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
-//__device__ void ROW_AGG(
-//		T *g_idata, ///< input data stored in device memory (of size rows*cols)
-//		T *g_odata,  ///< output/temporary array store in device memory (of size
-//		/// rows*cols)
-//		uint rows,  ///< rows in input and temporary/output arrays
-//		uint cols,  ///< columns in input and temporary/output arrays
-//		T initialValue,  ///< initial value for the reduction variable
-//		ReductionOp reduction_op, ///< Reduction operation to perform (functor object)
-//		SpoofCellwiseOp spoof_op) ///< Operation to perform before assigning this
 template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
 __device__ void ROW_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT,  ReductionOp reduction_op,
-					   SpoofCellwiseOp spoof_op) {
+	SpoofCellwiseOp spoof_op)
+{
 	auto sdata = shared_memory_proxy<T>();
 
 	// one block per row
@@ -199,7 +175,6 @@ __device__ void ROW_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
 	uint32_t i = tid;
 	uint block_offset = block * in->cols();
 
-//	T v = initialValue;
 	T v = reduction_op.init();
 	while (i < in->cols()) {
 		v = reduction_op(v, spoof_op(in->val(block_offset + i), i, i / in->cols(), i % in->cols()));
@@ -283,16 +258,8 @@ __device__ void ROW_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
  */
 template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
 __device__ void COL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT,  ReductionOp reduction_op,
-						SpoofCellwiseOp spoof_op) {
-//__device__ void COL_AGG(T *g_idata, ///< input data stored in device memory (of size rows*cols)
-//		T *g_odata,  ///< output/temporary array store in device memory (of size rows*cols)
-//		uint rows,  ///< rows in input and temporary/output arrays
-//		uint cols,  ///< columns in input and temporary/output arrays
-//		T initialValue,  ///< initial value for the reduction variable
-//		ReductionOp reduction_op, ///< Reduction operation to perform (functor object)
-//		SpoofCellwiseOp spoof_op) ///< Operation to perform before aggregation
-//
-//{
+	SpoofCellwiseOp spoof_op)
+{
 	uint global_tid = blockIdx.x * blockDim.x + threadIdx.x;
 	if (global_tid >= in->cols()) {
 		return;
@@ -315,13 +282,12 @@ __device__ void NO_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N
 	uint32_t gtid = blockIdx.x * blockDim.x + threadIdx.x;
 	uint32_t first_idx = gtid * static_cast<uint32_t>(VT);
 	uint32_t last_idx = min(first_idx + static_cast<uint32_t>(VT), N);
+
 	#pragma unroll
 	for(auto i = first_idx; i < last_idx; i++) {
 		T a = in->hasData() ? in->vals(0)[i] : 0;
 		T result = spoof_op(a, i, i / in->cols(), i % in->cols());
 		out->vals(0)[i] = result;
-		//if(i < 4)
-		//	printf("tid=%d in=%4.3f res=%4.3f out=%4.3f r=%d\n", i, in->vals(0)[i], result, out->vals(0)[i], i/in->cols());
 	}
 }
 
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
index 233a4a179c6..2ef482d18dd 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
@@ -77,10 +77,15 @@ size_t SpoofCUDAContext::initialize_cuda(uint32_t device_id, const char* resourc
 	CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_max_d"));
 	ctx->reduction_kernels_d.insert(std::make_pair(std::make_pair(SpoofOperator::AggType::FULL_AGG, SpoofOperator::AggOp::MAX), func));
 	
+	CHECK_CUDART(cudaMallocHost(reinterpret_cast<void**>(&(ctx->staging_buffer)), ctx->default_mem_size));
+	CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&(ctx->device_buffer)), ctx->default_mem_size));
+	ctx->current_mem_size = ctx->default_mem_size;
 	return reinterpret_cast<size_t>(ctx);
 }
 
 void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, [[maybe_unused]] uint32_t device_id) {
+	cudaFreeHost(ctx->staging_buffer);
+	cudaFree(ctx->device_buffer);
 	delete ctx;
 	// cuda device is handled by jCuda atm
 	//cudaDeviceReset();
@@ -116,15 +121,25 @@ size_t SpoofCUDAContext::compile(std::unique_ptr<SpoofOperator> op, const std::s
 
 template<typename T>
 CUfunction SpoofCUDAContext::getReductionKernel(const std::pair<SpoofOperator::AggType, SpoofOperator::AggOp> &key) {
-	return nullptr;
+	return nullptr; // generic case never used
 }
+
 template<>
 CUfunction SpoofCUDAContext::getReductionKernel<float>(const std::pair<SpoofOperator::AggType,
 		SpoofOperator::AggOp> &key) {
 	return reduction_kernels_f[key];
 }
+
 template<>
 CUfunction SpoofCUDAContext::getReductionKernel<double>(const std::pair<SpoofOperator::AggType,
 		SpoofOperator::AggOp> &key) {
 	return reduction_kernels_d[key];
-}
\ No newline at end of file
+}
+
+void SpoofCUDAContext::resize_staging_buffer(size_t size) { // frees both buffers and reallocates them with `size` bytes each
+	CHECK_CUDART(cudaFreeHost(staging_buffer)); // old contents are not carried over
+	CHECK_CUDART(cudaFree(device_buffer));
+	CHECK_CUDART(cudaMallocHost(reinterpret_cast<void**>(&(staging_buffer)), size));
+	CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&(device_buffer)), size));
+	current_mem_size = size; // only updated once both allocations have been issued
+}
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
index 696682fc228..e4b80c5e401 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
@@ -46,62 +46,34 @@ class SpoofCUDAContext {
 	std::map<std::pair<SpoofOperator::AggType, SpoofOperator::AggOp>, CUfunction> reduction_kernels_f;
 	std::map<std::pair<SpoofOperator::AggType, SpoofOperator::AggOp>, CUfunction> reduction_kernels_d;
 
-//	double handling_total, compile_total;
-	
 	const std::string resource_path;
 	const std::vector<std::string> include_paths;
 	
 public:
+	size_t default_mem_size = 1024; // 1kb for hosting data pointers, scalars and some meta info. This default should
+									// not require resizing these buffers in most cases.
+	size_t current_mem_size = 0; // the actual staging buffer size (should be default unless there was a resize)
+	std::byte* staging_buffer{}; // pinned host mem for async transfers
+	std::byte* device_buffer{};  // this buffer holds the pointers to the data buffers
 
 	explicit SpoofCUDAContext(const char* resource_path_, std::vector<std::string>  include_paths_) : reductions(nullptr),
-			resource_path(resource_path_), include_paths(std::move(include_paths_))
-			//,handling_total(0.0), compile_total(0.0)
-			{}
+			resource_path(resource_path_), include_paths(std::move(include_paths_)) { }
 
 	static size_t initialize_cuda(uint32_t device_id, const char* resource_path_);
 
 	static void destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id);
-	
+
 	size_t compile(std::unique_ptr<SpoofOperator> op, const std::string &src);
-	
+
 	template <typename T, typename CALL>
-	int launch(uint32_t opID, std::vector<Matrix<T>>& input, std::vector<Matrix<T>>& sides, Matrix<T>& output,
-			T* scalars, uint32_t grix) {
-		// dp holds in/side/out/scalar pointers for GPU
-		DevMatPtrs<T> dp;
-
-		SpoofOperator* op = compiled_ops[opID].get();
-		
-		CHECK_CUDART(cudaMalloc((void **)&dp.in, sizeof(Matrix<T>) * input.size()));
-		CHECK_CUDART(cudaMemcpy(dp.in, reinterpret_cast<void*>(&input[0]), sizeof(Matrix<T>) * input.size(),
-				cudaMemcpyHostToDevice));
-
-		if (!sides.empty()) {
-			CHECK_CUDART(cudaMalloc(reinterpret_cast<void **>(&dp.sides), sizeof(Matrix<T>) * sides.size()));
-			CHECK_CUDART(cudaMemcpy(dp.sides, &sides[0], sizeof(Matrix<T>)  * sides.size(), cudaMemcpyHostToDevice));
-		}
-		
-		if (op->isSparseSafe() && input.front().row_ptr != nullptr) {
-			CHECK_CUDART(cudaMemcpy(output.row_ptr, input.front().row_ptr, (input.front().rows+1)*sizeof(uint32_t),
-					cudaMemcpyDeviceToDevice));
-		}
-#ifndef NDEBUG
-		std::cout << "output rows: " << output.rows << " cols: " << output.cols << " nnz: " << output.nnz << " format: " <<
-				(output.row_ptr == nullptr ? "dense" : "sparse") << std::endl;
-#endif
-		size_t out_num_elements = output.rows * output.cols;
-		if(output.row_ptr)
-			if(op->isSparseSafe() && output.nnz > 0)
-				out_num_elements = output.nnz;
-		CHECK_CUDART(cudaMalloc((void **) &dp.out, sizeof(Matrix<T>)));
-		CHECK_CUDART(cudaMemset(output.data, 0, out_num_elements * sizeof(T)));
-		CHECK_CUDART(cudaMemcpy(dp.out, reinterpret_cast<void *>(&output), sizeof(Matrix<T>),
-				cudaMemcpyHostToDevice));
-		
-		dp.scalars = scalars;
-
-		CALL::exec(this, op, input, sides, output, grix, dp);
-		
+	int launch() {
+
+		DataBufferWrapper dbw(staging_buffer, device_buffer);
+		SpoofOperator* op = compiled_ops[dbw.op_id()].get();
+		dbw.toDevice(op->stream);
+
+		CALL::exec(this, op, &dbw);
+
 		return 0;
 	}
 	
@@ -109,6 +81,8 @@ class SpoofCUDAContext {
 
 	template<typename T>
 	CUfunction getReductionKernel(const std::pair<SpoofOperator::AggType, SpoofOperator::AggOp>& key);
+
+	void resize_staging_buffer(size_t size);
 };
 
 #endif // SPOOFCUDACONTEXT_H
diff --git a/src/main/cuda/spoof-launcher/SpoofCellwise.h b/src/main/cuda/spoof-launcher/SpoofCellwise.h
index fe7e9d66688..9077840020d 100644
--- a/src/main/cuda/spoof-launcher/SpoofCellwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofCellwise.h
@@ -27,8 +27,7 @@
 template<typename T>
 struct SpoofCellwiseFullAgg {
 	
-	static void exec(SpoofCellwiseOp* op, uint32_t NT, uint32_t N, const std::string& op_name,
-			std::vector<Matrix<T>>& sides, uint32_t grix,  DevMatPtrs<T>& dp) {
+	static void exec(SpoofCellwiseOp* op, uint32_t NT, uint32_t N, const std::string& op_name, DataBufferWrapper* dbw) {
 		T value_type;
 		
 		// num ctas
@@ -46,14 +45,15 @@ struct SpoofCellwiseFullAgg {
 						  << std::endl;
 #endif
 		CHECK_CUDA(op->program.get()->kernel(op_name)
-						   .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
-						   .configure(grid, block, shared_mem_size)
-						   .launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
+						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
+						   .configure(grid, block, shared_mem_size, op->stream)
+						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
 		
 		if(NB > 1) {
 			N = NB;
 			while (NB > 1) {
-				void* args[3] = { &dp.out, &dp.out, &N};
+				Matrix<T>* out = dbw->d_out<T>();
+				void* args[3] = { &out, &out, &N};
 				
 				NB = std::ceil((N + NT * 2 - 1) / (NT * 2));
 #ifndef NDEBUG
@@ -64,7 +64,7 @@ struct SpoofCellwiseFullAgg {
                     << N << " elements"
                     << std::endl;
 #endif
-				CHECK_CUDA(cuLaunchKernel(op->agg_kernel,NB, 1, 1, NT, 1, 1, shared_mem_size, nullptr, args, nullptr));
+				CHECK_CUDA(cuLaunchKernel(op->agg_kernel,NB, 1, 1, NT, 1, 1, shared_mem_size, op->stream, args, nullptr));
 				N = NB;
 			}
 		}
@@ -74,12 +74,11 @@ struct SpoofCellwiseFullAgg {
 
 template<typename T>
 struct SpoofCellwiseRowAgg {
-	static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name,
-			  std::vector<Matrix<T>> &input, std::vector<Matrix<T>> &sides, uint32_t grix, DevMatPtrs<T>& dp) {
+	static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name, DataBufferWrapper* dbw) {
 		T value_type;
 		
 		// num ctas
-		uint32_t NB = input.front().rows;
+		uint32_t NB = dbw->h_in<T>(0)->rows;
 		dim3 grid(NB, 1, 1);
 		dim3 block(NT, 1, 1);
 		uint32_t shared_mem_size = NT * sizeof(T);
@@ -90,9 +89,9 @@ struct SpoofCellwiseRowAgg {
 					<< N << " elements" << std::endl;
 #endif
 		CHECK_CUDA(op->program->kernel(op_name)
-						   .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
-						   .configure(grid, block, shared_mem_size)
-						   .launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
+						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
+						   .configure(grid, block, shared_mem_size, op->stream)
+						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
 		
 	}
 };
@@ -100,8 +99,7 @@ struct SpoofCellwiseRowAgg {
 
 template<typename T>
 struct SpoofCellwiseColAgg {
-	static void exec(SpoofOperator* op, uint32_t NT, uint32_t N, const std::string& op_name,
-					 std::vector<Matrix<T>>& sides, uint32_t grix, DevMatPtrs<T>& dp) {
+	static void exec(SpoofOperator* op, uint32_t NT, uint32_t N, const std::string& op_name, DataBufferWrapper* dbw) {
 		T value_type;
 		
 		// num ctas
@@ -116,9 +114,9 @@ struct SpoofCellwiseColAgg {
 						<< N << " elements" << std::endl;
 #endif
 		CHECK_CUDA(op->program->kernel(op_name)
-						   .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
-						   .configure(grid, block, shared_mem_size)
-						   .launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
+						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
+						   .configure(grid, block, shared_mem_size, op->stream)
+						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
 		
 	}
 };
@@ -126,72 +124,78 @@ struct SpoofCellwiseColAgg {
 
 template<typename T>
 struct SpoofCellwiseNoAgg {
-	static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name,
-			  std::vector<Matrix<T>> &input, std::vector<Matrix<T>> &sides, uint32_t grix, DevMatPtrs<T>& dp) {
+	static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name, DataBufferWrapper* dbw) {
 		T value_type;
-		bool sparse_input = input.front().row_ptr != nullptr;
+		bool sparse_input = dbw->h_in<T>(0)->row_ptr != nullptr;
 		
 		// num ctas
-		// ToDo: adaptive VT
+		// ToDo? adaptive VT
 		const uint32_t VT = 1;
 		uint32_t NB = std::ceil((N + NT * VT - 1) / (NT * VT));
 		if(sparse_input)
-			NB = input.front().rows;
+			NB = dbw->h_in<T>(0)->rows;
 		dim3 grid(NB, 1, 1);
 		dim3 block(NT, 1, 1);
 		uint32_t shared_mem_size = 0;
 
 #ifndef NDEBUG
+		std::cout << "output rows: " << dbw->h_out<T>()->rows << " cols: " << dbw->h_out<T>()->cols << " nnz: " <<
+			(dbw->h_out<T>()->row_ptr == nullptr ? dbw->h_out<T>()->rows * dbw->h_out<T>()->cols :
+				dbw->h_out<T>()->nnz) << " format: " << (dbw->h_out<T>()->row_ptr == nullptr
+					? "dense" : "sparse") << std::endl;
+
 		if(sparse_input) {
 				std::cout << "launching sparse spoof cellwise kernel " << op_name << " with " << NT * NB
-						  << " threads in " << NB << " blocks without aggregation for " << N << " elements"
-						  << std::endl;
+			  		<< " threads in " << NB << " blocks without aggregation for " << N << " elements" << std::endl;
 		}
 		else {
-			std::cout << "launching spoof cellwise kernel " << op_name << " with " << NT * NB
-					  << " threads in " << NB << " blocks without aggregation for " << N << " elements"
-					  << std::endl;
+			std::cout << "launching spoof cellwise kernel " << op_name << " with " << NT * NB << " threads in " << NB <<
+				" blocks without aggregation for " << N << " elements" << std::endl;
 		}
 #endif
-		
 		CHECK_CUDA(op->program->kernel(op_name)
-						   .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
-						   .configure(grid, block, shared_mem_size)
-						   .launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
+						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
+						   .configure(grid, block, shared_mem_size, op->stream)
+						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
+
+		// copy over row indices from input to output if appropriate
+		if (op->isSparseSafe() && dbw->h_in<T>(0)->row_ptr != nullptr) {
+			// src/dst information (pointer address) is stored in *host* buffer!
+			CHECK_CUDART(cudaMemcpyAsync(dbw->h_out<T>()->row_ptr, dbw->h_in<T>(0)->row_ptr,
+				(dbw->h_in<T>(0)->rows+1) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, op->stream));
+			CHECK_CUDART(cudaMemcpyAsync(dbw->h_out<T>()->col_idx, dbw->h_in<T>(0)->col_idx,
+										 (dbw->h_in<T>(0)->nnz) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, op->stream));
+		}
 	}
 };
 
 template<typename T>
 struct SpoofCellwise {
-	static void exec(SpoofCUDAContext* ctx, SpoofOperator* _op, std::vector<Matrix<T>>& input,
-			std::vector<Matrix<T>>& sides, Matrix<T>& output, uint32_t grix,
-			DevMatPtrs<T>& dp)  {
-		
-		T value_type;
+	static void exec(SpoofCUDAContext* ctx, SpoofOperator* _op, DataBufferWrapper* dbw) {
 		auto* op = dynamic_cast<SpoofCellwiseOp*>(_op);
-		bool sparse_input = input.front().row_ptr != nullptr;
+		bool sparse_input = dbw->h_in<T>(0)->row_ptr != nullptr;
 		uint32_t NT = 256; // ToDo: num threads
-		uint32_t N = input.front().rows * input.front().cols;
+		uint32_t N = dbw->h_in<T>(0)->rows * dbw->h_in<T>(0)->cols;
 		std::string op_name(op->name + "_DENSE");
 		if(sparse_input) {
 			op_name = std::string(op->name + "_SPARSE");
-			if(op->isSparseSafe() && input.front().nnz > 0)
-				N = input.front().nnz;
+			if(op->isSparseSafe() && dbw->h_in<T>(0)->nnz > 0)
+				N = dbw->h_in<T>(0)->nnz;
 		}
 		
 		switch(op->agg_type) {
 			case SpoofOperator::AggType::FULL_AGG:
 				op->agg_kernel = ctx->template getReductionKernel<T>(std::make_pair(op->agg_type, op->agg_op));
-				SpoofCellwiseFullAgg<T>::exec(op, NT, N, op_name, sides, grix, dp);
+				SpoofCellwiseFullAgg<T>::exec(op, NT, N, op_name, dbw);
 				break;
 			case SpoofOperator::AggType::ROW_AGG:
-				SpoofCellwiseRowAgg<T>::exec(op, NT, N, op_name, input, sides, grix, dp);
+				SpoofCellwiseRowAgg<T>::exec(op, NT, N, op_name, dbw);
 				break;
 			case SpoofOperator::AggType::COL_AGG:
-				SpoofCellwiseColAgg<T>::exec(op, NT, N, op_name, sides, grix, dp);
+				SpoofCellwiseColAgg<T>::exec(op, NT, N, op_name, dbw);
 				break;
 			case SpoofOperator::AggType::NO_AGG:
-				SpoofCellwiseNoAgg<T>::exec(op, NT, N, op_name, input, sides, grix, dp);
+				SpoofCellwiseNoAgg<T>::exec(op, NT, N, op_name, dbw);
 				break;
 			default:
 				throw std::runtime_error("unknown cellwise agg type" + std::to_string(static_cast<int>(op->agg_type)));
diff --git a/src/main/cuda/spoof-launcher/SpoofOperator.h b/src/main/cuda/spoof-launcher/SpoofOperator.h
index 0ccc633a47f..f256e817db2 100644
--- a/src/main/cuda/spoof-launcher/SpoofOperator.h
+++ b/src/main/cuda/spoof-launcher/SpoofOperator.h
@@ -28,18 +28,24 @@
 #include "host_utils.h"
 #include "Matrix.h"
 
+// these two constants have equivalents in Java code:
+const uint32_t JNI_MAT_ENTRY_SIZE = 40;
+const uint32_t TRANSFERRED_DATA_HEADER_SIZE = 32;
+
 struct SpoofOperator {
-//	enum class OpType : int { CW, RA, MA, OP, NONE };
 	enum class AggType : int { NO_AGG, FULL_AGG, ROW_AGG, COL_AGG };
 	enum class AggOp : int { SUM, SUM_SQ, MIN, MAX };
 	enum class RowType : int { FULL_AGG = 4 };
-	
-//	OpType op_type;
+
 	std::string name;
-//	jitify::Program program;
 	std::unique_ptr<jitify::Program> program;
 	
 	[[nodiscard]] virtual bool isSparseSafe() const = 0;
+
+	cudaStream_t stream{};
+	
+	SpoofOperator() { CHECK_CUDART(cudaStreamCreate(&stream));}
+	virtual ~SpoofOperator() {CHECK_CUDART(cudaStreamDestroy(stream));}
 };
 
 struct SpoofCellwiseOp : public SpoofOperator {
@@ -58,46 +64,90 @@ struct SpoofRowwiseOp : public SpoofOperator {
 	int32_t const_dim2;
 	RowType row_type;
 	
-	SpoofRowwiseOp(RowType rt, bool tb1, uint32_t ntv, int32_t cd2)  : row_type(rt), TB1(tb1), num_temp_vectors(ntv),
+	SpoofRowwiseOp(RowType rt, bool tb1, uint32_t ntv, int32_t cd2) : row_type(rt), TB1(tb1), num_temp_vectors(ntv),
 			const_dim2(cd2) {}
 			
 	[[nodiscard]] bool isSparseSafe() const override { return false; }
 };
 
-template<typename T>
-struct DevMatPtrs {
-	Matrix<T>* ptrs[3] = {0,0,0};
-	
-	Matrix<T>*& in = ptrs[0];
-	Matrix<T>*& sides = ptrs[1];
-	Matrix<T>*& out = ptrs[2];
-	T* scalars{};
-
-	~DevMatPtrs() {
-#ifndef NDEBUG
-		std::cout << "~DevMatPtrs() before cudaFree:\n";
-		int i = 0;
-		for (auto& p : ptrs) {
-			std::cout << " p[" << i << "]=" << p;
-			i++;
-		}
-		std::cout << std::endl;
-#endif
-		for (auto& p : ptrs) {
-			if (p) {
-				CHECK_CUDART(cudaFree(p));
-				p = nullptr;
-			}
-		}
-#ifndef NDEBUG
-		std::cout << "~DevMatPtrs() after cudaFree:\n";
-		i = 0;
-		for (auto& p : ptrs) {
-			std::cout << " p[" << i << "]=" << p;
-			i++;
-		}
-		std::cout << std::endl;
-#endif
+struct DataBufferWrapper {
+	std::byte* staging_buffer;
+	std::byte* device_buffer;
+
+
+	template<typename T>
+	Matrix<T>* in(std::byte* buffer, uint32_t idx) {
+		return reinterpret_cast<Matrix<T>*>(&buffer[TRANSFERRED_DATA_HEADER_SIZE + idx * JNI_MAT_ENTRY_SIZE]);
+	}
+
+	template<typename T>
+	Matrix<T>* sides(std::byte* buffer) {
+		return reinterpret_cast<Matrix<T>*>(&buffer[TRANSFERRED_DATA_HEADER_SIZE + num_inputs() * JNI_MAT_ENTRY_SIZE]);
+	}
+
+	template<typename T>
+	Matrix<T>* out(std::byte* buffer) {
+		return reinterpret_cast<Matrix<T>*>(&buffer[TRANSFERRED_DATA_HEADER_SIZE + (num_inputs() + num_sides())
+			* JNI_MAT_ENTRY_SIZE]);
+	}
+
+	template<typename T>
+	T* scalars(std::byte* buffer, uint32_t idx) {
+		return reinterpret_cast<T*>(&(buffer[TRANSFERRED_DATA_HEADER_SIZE + (num_inputs() + num_sides() + 1)
+			* JNI_MAT_ENTRY_SIZE + idx * sizeof(T)]));
+	}
+
+public:
+	explicit DataBufferWrapper(std::byte* staging, std::byte* dev_buf) : staging_buffer(staging),
+		device_buffer(dev_buf) { }
+
+	void toDevice(cudaStream_t &stream) const {
+		CHECK_CUDART(cudaMemcpyAsync(device_buffer, staging_buffer, *reinterpret_cast<uint32_t*>(&staging_buffer[0]),
+			cudaMemcpyHostToDevice, stream));
+	}
+
+	template<typename T>
+	Matrix<T>* d_in(uint32_t num) { return in<T>(device_buffer, num); }
+
+	template<typename T>
+	Matrix<T>* h_in(uint32_t num) { return in<T>(staging_buffer, num); }
+
+	template<typename T>
+	Matrix<T>* d_sides() { return sides<T>(device_buffer); }
+
+	template<typename T>
+	Matrix<T>* h_sides() { return sides<T>(staging_buffer); }
+
+	template<typename T>
+	Matrix<T>* d_out() { return out<T>(device_buffer); }
+
+	template<typename T>
+	Matrix<T>* h_out() { return out<T>(staging_buffer); }
+
+	template<typename T>
+	T* d_scalars(uint32_t idx = 0) { return scalars<T>(device_buffer, idx); }
+
+	template<typename T>
+	T* h_scalars(uint32_t idx = 0) { return scalars<T>(staging_buffer, idx); }
+
+	uint32_t op_id() const {
+		return *reinterpret_cast<uint32_t*>(&staging_buffer[sizeof(int)]);
+	}
+
+	uint64_t grix() const { // NOTE(review): reads 8 bytes at 2*sizeof(int); num_inputs() reads from 3*sizeof(int), inside that range for 4-byte int - confirm against the Java-side packing
+		return *reinterpret_cast<uint64_t*>(&staging_buffer[2 * sizeof(int)]);
+	}
+
+	[[nodiscard]] uint32_t num_inputs() const {
+		return *reinterpret_cast<uint32_t*>(&staging_buffer[3 * sizeof(int)]);
+	}
+
+	uint32_t num_sides() const {
+		return *reinterpret_cast<uint32_t*>(&staging_buffer[4 * sizeof(int)]);
+	}
+
+	uint32_t num_scalars() const {
+		return *reinterpret_cast<uint32_t*>(&staging_buffer[6 * sizeof(int)]);
 	}
 };
 
diff --git a/src/main/cuda/spoof-launcher/SpoofRowwise.h b/src/main/cuda/spoof-launcher/SpoofRowwise.h
index 12953141098..4465ac99fa4 100644
--- a/src/main/cuda/spoof-launcher/SpoofRowwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofRowwise.h
@@ -27,27 +27,34 @@
 template <typename T>
 struct SpoofRowwise {
 	
-	static void exec([[maybe_unused]] SpoofCUDAContext* ctx, SpoofOperator* _op, std::vector<Matrix<T>>& input,
-			std::vector<Matrix<T>>& sides, Matrix<T>& output, uint32_t grix, DevMatPtrs<T>& dp)  {
+	static void exec([[maybe_unused]] SpoofCUDAContext* ctx, SpoofOperator* _op, DataBufferWrapper* dbw)  {
 		uint32_t NT=256;
 		T value_type;
-		bool sparse_input = input.front().row_ptr != nullptr;
+		bool sparse_input = dbw->h_in<T>(0)->row_ptr != nullptr;
 		auto* op = dynamic_cast<SpoofRowwiseOp*>(_op);
-		dim3 grid(input.front().rows, 1, 1);
+		dim3 grid(dbw->h_in<T>(0)->rows, 1, 1);
 		dim3 block(NT, 1, 1);
 		unsigned int shared_mem_size = NT * sizeof(T);
-		
+
+		size_t out_num_elements = static_cast<size_t>(dbw->h_out<T>()->rows) * dbw->h_out<T>()->cols; // widen before multiplying to avoid 32-bit wrap-around
+		if(dbw->h_out<T>()->row_ptr)
+			if(op->isSparseSafe() && dbw->h_out<T>()->nnz > 0)
+				out_num_elements = dbw->h_out<T>()->nnz;
+		//ToDo: only memset output when there is an output operation that *adds* to the buffer
+		CHECK_CUDART(cudaMemsetAsync(dbw->h_out<T>()->data, 0, out_num_elements * sizeof(T), op->stream)); // data pointer is taken from the host-side copy of the descriptor
+
+		//ToDo: handle this in JVM
 		uint32_t tmp_len = 0;
 		uint32_t temp_buf_size = 0;
 		T* d_temp = nullptr;
 		if(op->num_temp_vectors > 0) {
-			tmp_len = std::max(input.front().cols, op->const_dim2 < 0 ? 0 : static_cast<uint32_t>(op->const_dim2));
-			temp_buf_size = op->num_temp_vectors * tmp_len * input.front().rows * sizeof(T);
+			tmp_len = std::max(dbw->h_in<T>(0)->cols, op->const_dim2 < 0 ? 0u : static_cast<uint32_t>(op->const_dim2));
+			temp_buf_size = op->num_temp_vectors * tmp_len * dbw->h_in<T>(0)->rows * sizeof(T);
 #ifndef NDEBUG
 			std::cout << "num_temp_vect: " << op->num_temp_vectors << " temp_buf_size: " << temp_buf_size << " tmp_len: " << tmp_len << std::endl;
 #endif
 			CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&d_temp), temp_buf_size));
-			CHECK_CUDART(cudaMemset(d_temp, 0, temp_buf_size));
+			CHECK_CUDART(cudaMemsetAsync(d_temp, 0, temp_buf_size, op->stream));
 		}
 		
 		std::string op_name(op->name + "_DENSE");
@@ -56,21 +63,18 @@ struct SpoofRowwise {
 
 #ifndef NDEBUG
 		// ToDo: connect output to SystemDS logging facilities
-		std::cout << "launching spoof rowwise kernel " << op_name << " with " << NT * input.front().rows << " threads in "
-				<< input.front().rows << " blocks and " << shared_mem_size << " bytes of shared memory for "
-				<< input.front().rows << " cols processed by " << NT << " threads per row, adding "
+		std::cout << "launching spoof rowwise kernel " << op_name << " with " << NT * dbw->h_in<T>(0)->rows << " threads in "
+				<< dbw->h_in<T>(0)->rows << " blocks and " << shared_mem_size << " bytes of shared memory for "
+				<< dbw->h_in<T>(0)->rows << " cols processed by " << NT << " threads per row, adding "
 				<< temp_buf_size / 1024 << " kb of temp buffer in global memory." <<  std::endl;
 #endif
 		CHECK_CUDA(op->program->kernel(op_name)
-						   .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()), op->num_temp_vectors, tmp_len)
-						   .configure(grid, block, shared_mem_size)
-						   .launch(dp.in, dp.sides, dp.out, dp.scalars, d_temp, grix));
+						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1), dbw->num_sides()), op->num_temp_vectors, tmp_len)
+						   .configure(grid, block, shared_mem_size, op->stream)
+						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), d_temp, dbw->grix()));
 		
 		if(op->num_temp_vectors > 0)
 			CHECK_CUDART(cudaFree(d_temp));
-		
-//		if (op->TB1)
-//			CHECK_CUDART(cudaFree(b1_transposed));
 	}
 };
 
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.cpp b/src/main/cuda/spoof-launcher/jni_bridge.cpp
index 9a808a62731..5134d5e292c 100644
--- a/src/main/cuda/spoof-launcher/jni_bridge.cpp
+++ b/src/main/cuda/spoof-launcher/jni_bridge.cpp
@@ -23,69 +23,86 @@
 #include "SpoofRowwise.h"
 
 // JNI Methods to get/release arrays
-#define GET_ARRAY(env, input)((void *)env->GetPrimitiveArrayCritical(input, nullptr))
+//#define GET_ARRAY(env, input)((void *)env->GetPrimitiveArrayCritical(input, nullptr))
+//#define RELEASE_ARRAY(env, java, cpp)(env->ReleasePrimitiveArrayCritical(java, cpp, 0))
 
-#define RELEASE_ARRAY(env, java, cpp)(env->ReleasePrimitiveArrayCritical(java, cpp, 0))
+jclass jcuda_pointer_class;
+jclass jcuda_native_pointer_class;
+jfieldID pointer_buffer_field;
+jfieldID native_pointer_field;
 
 // error output helper
-void printException(const std::string& name, const std::exception& e, bool compile = false) {
+void printException(const std::string &name, const std::exception &e, bool compile = false) {
 	std::string type = compile ? "compiling" : "executing";
-	std::cout << "std::exception while " << type << "  SPOOF CUDA operator " << name << ":\n" << e.what() << std::endl;
+	std::cerr << "std::exception while " << type << "  SPOOF CUDA operator " << name << ":\n" << e.what() << std::endl;
 }
 
 
-// a pod struct to have names for the passed pointers
-template<typename T>
-struct LaunchMetadata {
-	const T& opID;
-	const T& grix;
-	const size_t& num_inputs;
-	const size_t& num_sides;
-	
-	// num entries describing one matrix (6 entries):
-	// {nnz,rows,cols,row_ptr,col_idxs,data}
-	const size_t& entry_size;
-	const T& num_scalars;
-	
-	explicit LaunchMetadata(const size_t* jvals) : opID(jvals[0]), grix(jvals[1]), num_inputs(jvals[2]),
-			num_sides(jvals[3]), entry_size(jvals[4]), num_scalars(jvals[5]) {}
-};
-
-
-[[maybe_unused]] JNIEXPORT jlong JNICALL
-Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context(
-    	JNIEnv *jenv, [[maybe_unused]] jobject jobj, jint device_id, jstring resource_path) {
+[[maybe_unused]] JNIEXPORT jlong JNICALL Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context
+	(JNIEnv *jenv, [[maybe_unused]] jobject jobj, jint device_id, jstring resource_path)
+{
 	const char *cstr_rp = jenv->GetStringUTFChars(resource_path, nullptr);
 	size_t ctx = SpoofCUDAContext::initialize_cuda(device_id, cstr_rp);
 	jenv->ReleaseStringUTFChars(resource_path, cstr_rp);
+	// cache the jcuda class handles once as global refs: FindClass yields local refs that expire when this call returns
+	if (jcuda_pointer_class == nullptr)
+		jcuda_pointer_class = static_cast<jclass>(jenv->NewGlobalRef(jenv->FindClass("jcuda/Pointer")));
+	if (jcuda_native_pointer_class == nullptr)
+		jcuda_native_pointer_class = static_cast<jclass>(jenv->NewGlobalRef(jenv->FindClass("jcuda/NativePointerObject")));
+	pointer_buffer_field = jenv->GetFieldID(jcuda_pointer_class, "buffer", "Ljava/nio/Buffer;");
+	native_pointer_field = jenv->GetFieldID(jcuda_native_pointer_class, "nativePointer", "J");
+	// explicit cast to make compiler and linter happy
 	return static_cast<jlong>(ctx);
 }
 
 
-[[maybe_unused]] JNIEXPORT void JNICALL
-Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
-		[[maybe_unused]] JNIEnv *jenv, [[maybe_unused]] jobject jobj, jlong ctx, jint device_id) {
+[[maybe_unused]] JNIEXPORT void JNICALL Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context
+	([[maybe_unused]] JNIEnv *jenv, [[maybe_unused]] jobject jobj, jlong ctx, jint device_id) {
 	SpoofCUDAContext::destroy_cuda(reinterpret_cast<SpoofCUDAContext *>(ctx), device_id);
 }
 
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofOperator_getNativeStagingBuffer
+	(JNIEnv *jenv, [[maybe_unused]] jclass jobj, jobject ptr, jlong _ctx, jint size) {
+	std::string operator_name("SpoofOperator_getNativeStagingBuffer");
+	try {
+		// retrieve data handles from JVM; a null context/target or a negative size can never yield a usable buffer
+		auto *ctx = reinterpret_cast<SpoofCUDAContext *>(_ctx);
+		if (ctx == nullptr || ptr == nullptr || size < 0) return -1;
+		if (size > ctx->current_mem_size)
+			ctx->resize_staging_buffer(size);
+		jobject object = jenv->NewDirectByteBuffer(ctx->staging_buffer, size);
+		if (object == nullptr) return -1; // wrapping failed (allocation error or direct buffers unsupported)
+		jenv->SetObjectField(ptr, pointer_buffer_field, object);
+		jenv->SetLongField(ptr,native_pointer_field, reinterpret_cast<jlong>(ctx->staging_buffer));
+		return 0;
+	}
+	catch (std::exception & e) {
+		printException(operator_name, e);
+	}
+	catch (...) {
+		printException(operator_name, std::runtime_error("unknown exception"));
+	}
+	return -1;
+}
 
 template<typename TEMPLATE>
-int compile_spoof_operator(JNIEnv *jenv, [[maybe_unused]] jobject jobj, jlong _ctx, jstring name, jstring src, TEMPLATE op) {
+int compile_spoof_operator
+	(JNIEnv *jenv, [[maybe_unused]] jobject jobj, jlong _ctx, jstring name, jstring src, TEMPLATE op) {
 	std::string operator_name;
 	try {
 		auto *ctx = reinterpret_cast<SpoofCUDAContext *>(_ctx);
 		const char *cstr_name = jenv->GetStringUTFChars(name, nullptr);
 		const char *cstr_src = jenv->GetStringUTFChars(src, nullptr);
 		operator_name = cstr_name;
-
 		op->name = operator_name;
+
 		int status = ctx->compile(std::move(op), cstr_src);
-		
+
 		jenv->ReleaseStringUTFChars(src, cstr_src);
 		jenv->ReleaseStringUTFChars(name, cstr_name);
 		return status;
 	}
-	catch (std::exception& e) {
+	catch (std::exception &e) {
 		printException(operator_name, e, true);
 	}
 	catch (...) {
@@ -96,73 +113,41 @@ int compile_spoof_operator(JNIEnv *jenv, [[maybe_unused]] jobject jobj, jlong _c
 
 
 [[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_hops_codegen_cplan_CNodeCell_compile_1nvrtc
-		(JNIEnv *jenv, jobject jobj, jlong ctx, jstring name, jstring src, jint type, jint agg_op,
-				jboolean sparseSafe) {
-	
+	(JNIEnv *jenv, jobject jobj, jlong ctx, jstring name, jstring src, jint type, jint agg_op, jboolean sparseSafe) {
 	std::unique_ptr<SpoofCellwiseOp> op = std::make_unique<SpoofCellwiseOp>(SpoofOperator::AggType(type),
-			SpoofOperator::AggOp(agg_op), sparseSafe);
-	
+																			SpoofOperator::AggOp(agg_op), sparseSafe);
+
 	return compile_spoof_operator<std::unique_ptr<SpoofCellwiseOp>>(jenv, jobj, ctx, name, src, std::move(op));
 }
 
-
 [[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_hops_codegen_cplan_CNodeRow_compile_1nvrtc
-		(JNIEnv *jenv, jobject jobj, jlong ctx, jstring name, jstring src, jint type, jint const_dim2,
-				jint num_vectors, jboolean TB1) {
-	
+	(JNIEnv *jenv, jobject jobj, jlong ctx, jstring name, jstring src, jint type, jint const_dim2,
+	 jint num_vectors, jboolean TB1) {
 	std::unique_ptr<SpoofRowwiseOp> op = std::make_unique<SpoofRowwiseOp>(SpoofOperator::RowType(type), TB1,
-			num_vectors, const_dim2);
+																		  num_vectors, const_dim2);
 	return compile_spoof_operator<std::unique_ptr<SpoofRowwiseOp>>(jenv, jobj, ctx, name, src, std::move(op));
 }
 
 
 template<typename T, typename TEMPLATE>
-int launch_spoof_operator(JNIEnv *jenv, [[maybe_unused]] jclass jobj, jlong _ctx, jlongArray _meta, jlongArray in,
-		jlongArray _sides, jlongArray out, jlong _scalars) {
-	std::string operator_name("unknown");
+int launch_spoof_operator([[maybe_unused]] JNIEnv *jenv, [[maybe_unused]] jclass jobj, jlong _ctx) {
+	std::string operator_name("launch_spoof_operator jni-bridge");
 	try {
 		// retrieve data handles from JVM
-		auto *metacast = reinterpret_cast<size_t *>(GET_ARRAY(jenv, _meta));
 		auto *ctx = reinterpret_cast<SpoofCUDAContext *>(_ctx);
-		auto *inputs = reinterpret_cast<size_t *>(GET_ARRAY(jenv, in));
-		auto *sides = reinterpret_cast<size_t *>(GET_ARRAY(jenv, _sides));
-		auto *output = reinterpret_cast<size_t *>(GET_ARRAY(jenv, out));
-//		auto *scalars = reinterpret_cast<T *>(GET_ARRAY(jenv, _scalars));
-		auto *scalars = reinterpret_cast<T *>(_scalars);
-		LaunchMetadata<size_t> meta(metacast);
-		
+
+#ifndef NDEBUG
+		uint32_t opID = *reinterpret_cast<uint32_t*>(&ctx->staging_buffer[sizeof(uint32_t)]);
 		// this implicitly checks if op exists
-		operator_name = ctx->getOperatorName(meta.opID);
-		
-		// wrap/cast inputs
-		std::vector<Matrix<T>> mats_in;
-		for(auto i = 0ul; i < meta.num_inputs; i+=meta.entry_size)
-			mats_in.emplace_back(&inputs[i]);
-		
-		// wrap/cast sides
-		std::vector<Matrix<T>> mats_sides;
-		for(auto i = 0ul; i < meta.num_sides; i+=meta.entry_size)
-			mats_sides.emplace_back(&sides[i]);
-		
-		// wrap/cast output
-		Matrix<T> mat_out(output);
-		
-		// wrap/cast scalars
-//		std::unique_ptr<Matrix<T>> mat_scalars = scalars == nullptr ? 0 : std::make_unique<Matrix<T>>(scalars);
-		
+		operator_name = ctx->getOperatorName(opID);
+		std::cout << "executing op=" << operator_name << " id=" << opID << std::endl;
+#endif
 		// transfers resource pointers to GPU and calls op->exec()
-		ctx->launch<T, TEMPLATE>(meta.opID, mats_in, mats_sides, mat_out, scalars, meta.grix);
-		
-		// release data handles from JVM
-		RELEASE_ARRAY(jenv, _meta, metacast);
-		RELEASE_ARRAY(jenv, in, inputs);
-		RELEASE_ARRAY(jenv, _sides, sides);
-		RELEASE_ARRAY(jenv, out, output);
-//		RELEASE_ARRAY(jenv, _scalars, scalars);
-		
+		ctx->launch<T, TEMPLATE>();
+
 		return 0;
 	}
-	catch (std::exception& e) {
+	catch (std::exception &e) {
 		printException(operator_name, e);
 	}
 	catch (...) {
@@ -171,26 +156,26 @@ int launch_spoof_operator(JNIEnv *jenv, [[maybe_unused]] jclass jobj, jlong _ctx
 	return -1;
 }
 
-[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1f
-		(JNIEnv *jenv, jclass jobj, jlong ctx, jlongArray meta, jlongArray in, jlongArray sides, jlongArray out,
-		 jlong scalars) {
-	return launch_spoof_operator<float, SpoofCellwise<float>>(jenv, jobj, ctx, meta, in, sides, out, scalars);
-}
 
 [[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1d
-		(JNIEnv *jenv, jclass jobj, jlong ctx, jlongArray meta, jlongArray in, jlongArray sides, jlongArray out,
-		 jlong scalars) {
-	return launch_spoof_operator<double, SpoofCellwise<double>>(jenv, jobj, ctx, meta, in, sides, out, scalars);
+	(JNIEnv *jenv, jclass jobj, jlong ctx) {
+	return launch_spoof_operator<double, SpoofCellwise<double>>(jenv, jobj, ctx);
 }
 
-[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1f
-		(JNIEnv *jenv, jclass jobj, jlong ctx, jlongArray meta, jlongArray in, jlongArray sides, jlongArray out,
-		 jlong scalars) {
-	return launch_spoof_operator<float, SpoofRowwise<float>>(jenv, jobj, ctx, meta, in, sides, out, scalars);
+
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1f
+	(JNIEnv *jenv, jclass jobj, jlong ctx) { // single-precision entry point: must run the float operator
+	return launch_spoof_operator<float, SpoofCellwise<float>>(jenv, jobj, ctx);
 }
 
+
 [[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1d
-		(JNIEnv *jenv, jclass jobj, jlong ctx, jlongArray meta, jlongArray in, jlongArray sides, jlongArray out,
-		 jlong scalars) {
-	return launch_spoof_operator<double, SpoofRowwise<double>>(jenv, jobj, ctx, meta, in, sides, out, scalars);
+	(JNIEnv *jenv, jclass jobj, jlong ctx) {
+	return launch_spoof_operator<double, SpoofRowwise<double>>(jenv, jobj, ctx);
 }
+
+
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1f
+	(JNIEnv *jenv, jclass jobj, jlong ctx) { // single-precision entry point: must run the float operator
+	return launch_spoof_operator<float, SpoofRowwise<float>>(jenv, jobj, ctx);
+}
\ No newline at end of file
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.h b/src/main/cuda/spoof-launcher/jni_bridge.h
index 1e9ef20d1a2..0f25936c1a6 100644
--- a/src/main/cuda/spoof-launcher/jni_bridge.h
+++ b/src/main/cuda/spoof-launcher/jni_bridge.h
@@ -17,7 +17,7 @@
  * under the License.
  */
 
-/* DO NOT EDIT THIS FILE - it is machine generated */
+/* DO EDIT THIS FILE - it is not machine generated */
 
 #pragma once
 #ifndef JNI_BRIDGE_H
@@ -32,20 +32,26 @@ extern "C" {
 /*
  * Class:     org_apache_sysds_hops_codegen_SpoofCompiler
  * Method:    initialize_cuda_context
- * Signature: (I)J
+ * Signature: (ILjava/lang/String;)J
  */
-[[maybe_unused]] JNIEXPORT jlong JNICALL
-Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context(
-    JNIEnv *, [[maybe_unused]] jobject, jint, jstring);
+[[maybe_unused]] JNIEXPORT jlong JNICALL Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context
+	(JNIEnv *, [[maybe_unused]] jobject, jint, jstring);
 
 /*
  * Class:     org_apache_sysds_hops_codegen_SpoofCompiler
  * Method:    destroy_cuda_context
  * Signature: (JI)V
  */
-[[maybe_unused]] JNIEXPORT void JNICALL
-Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
-		[[maybe_unused]] JNIEnv *, [[maybe_unused]] jobject, jlong, jint);
+[[maybe_unused]] JNIEXPORT void JNICALL Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context
+	([[maybe_unused]] JNIEnv *, [[maybe_unused]] jobject, jlong, jint);
+
+/*
+ * Class:     org_apache_sysds_runtime_codegen_SpoofOperator
+ * Method:    getNativeStagingBuffer
+ * Signature: (Ljcuda/Pointer;JI)I
+ */
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofOperator_getNativeStagingBuffer
+	(JNIEnv *, jclass, jobject, jlong, jint);
 
 /*
  * Class:     org_apache_sysds_hops_codegen_cplan_CNodeCell
@@ -64,37 +70,36 @@ Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
 		(JNIEnv *, [[maybe_unused]] jobject, jlong, jstring, jstring, jint, jint, jint, jboolean);
 
 /*
- * Class:     org_apache_sysds_runtime_codegen_SpoofCUDACellwiseOperator
- * Method:    execute_f
- * Signature: (J[J[J[J[JJ)I
- */
-[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1f
-		(JNIEnv *, jclass, jlong, jlongArray, jlongArray, jlongArray, jlongArray, jlong);
-
-/*
- * Class:     org_apache_sysds_runtime_codegen_SpoofCUDACellwiseOperator
+ * Class:     org_apache_sysds_runtime_codegen_SpoofCUDACellwise
  * Method:    execute_d
- * Signature: (J[J[J[J[JJ)I
+ * Signature: (J)I
  */
 [[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1d
-		(JNIEnv *, jclass, jlong, jlongArray, jlongArray, jlongArray, jlongArray, jlong);
-
+	(JNIEnv *, jclass, jlong);
 
 /*
- * Class:     org_apache_sysds_runtime_codegen_SpoofCUDARowwise
+ * Class:     org_apache_sysds_runtime_codegen_SpoofCUDACellwise
  * Method:    execute_f
- * Signature: (J[J[J[J[JJ)I
+ * Signature: (J)I
  */
-[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1f
-		(JNIEnv *, jclass, jlong, jlongArray, jlongArray, jlongArray, jlongArray, jlong);
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1f
+	(JNIEnv *, jclass, jlong);
 
 /*
  * Class:     org_apache_sysds_runtime_codegen_SpoofCUDARowwise
  * Method:    execute_d
- * Signature: (J[J[J[J[JJ)I
+ * Signature: (J)I
  */
 [[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1d
-		(JNIEnv *, jclass, jlong, jlongArray, jlongArray, jlongArray, jlongArray, jlong);
+	(JNIEnv *, jclass, jlong);
+
+/*
+ * Class:     org_apache_sysds_runtime_codegen_SpoofCUDARowwise
+ * Method:    execute_f
+ * Signature: (J)I
+ */
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1f
+	(JNIEnv *, jclass, jlong);
 
 #ifdef __cplusplus
 }
diff --git a/src/main/cuda/spoof/cellwise.cu b/src/main/cuda/spoof/cellwise.cu
index 951f70901bd..6751b1db849 100644
--- a/src/main/cuda/spoof/cellwise.cu
+++ b/src/main/cuda/spoof/cellwise.cu
@@ -42,7 +42,8 @@ struct SpoofCellwiseOp {
 		uint32_t _grix;
 
 	SpoofCellwiseOp(Matrix<T>* _A, Matrix<T>* _B, Matrix<T>* _C, T* scalars, uint32_t grix) :
-			n(_A->cols), scalars(scalars), _grix(grix) {
+		n(_A->cols), scalars(scalars), _grix(grix)
+	{
 		A.init(_A);
 		c.init(_C);
 		alen = A.row_len(grix);
diff --git a/src/main/cuda/spoof/rowwise.cu b/src/main/cuda/spoof/rowwise.cu
index 7d80d54e82a..b31ce0c2ce5 100644
--- a/src/main/cuda/spoof/rowwise.cu
+++ b/src/main/cuda/spoof/rowwise.cu
@@ -55,6 +55,8 @@ struct SpoofRowwiseOp //%HAS_TEMP_VECT%
 
 	__device__  __forceinline__ void exec_dense(uint32_t ai, uint32_t ci, uint32_t rix) {
 //%BODY_dense%
+		if (debug_row() && debug_thread()) // NOTE(review): debug trace sits in the kernel template with no NDEBUG guard visible here — confirm it should ship
+			printf("c[0]=%4.3f\n", c.vals(0)[0]);
 	}
 
 	__device__  __forceinline__ void exec_sparse(uint32_t ai, uint32_t ci, uint32_t rix) {
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
index 47a7178f376..f8030007288 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
@@ -156,10 +156,12 @@ public String codegen(boolean sparse, GeneratorAPI _api) {
 		if(tmpDense.contains("grix"))
 			tmp = tmp.replace("//%NEED_GRIX%", "\t\tuint32_t grix=_grix + rix;");
 		else
-			tmp = tmp.replace("//%NEED_GRIX%", "");
-		tmp = tmp.replace("//%NEED_RIX%", "");
-		tmp = tmp.replace("//%NEED_CIX%", "");
-		
+			tmp = tmp.replace("//%NEED_GRIX%\n", "");
+
+		// remove empty lines
+//		if(!api.isJava())
+//			tmpDense = tmpDense.replaceAll("(?m)^[ \t]*\r?\n", "");
+
 		tmp = tmp.replace("%BODY_dense%", tmpDense);
 		
 		//Return last TMP. Square it for CUDA+SUM_SQ
@@ -171,10 +173,7 @@ public String codegen(boolean sparse, GeneratorAPI _api) {
 		tmp = tmp.replace("%AGG_OP_NAME%", (_aggOp != null) ? "AggOp." + _aggOp.name() : "null");
 		tmp = tmp.replace("%SPARSE_SAFE%", String.valueOf(isSparseSafe()));
 		tmp = tmp.replace("%SEQ%", String.valueOf(containsSeq()));
-		
-		// maybe empty lines
-		//tmp = tmp.replaceAll("(?m)^[ \t]*\r?\n", "");
-		
+
 		if(api == GeneratorAPI.CUDA) {
 			// ToDo: initial_value is misused to pass VT (values per thread) to no_agg operator
 			String agg_op = "IdentityOp";
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java
index 0e21017e2b5..f2405d5b5c5 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java
@@ -191,7 +191,7 @@ public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
 				case MULT2:
 					return "	T %TMP% = %IN1% + %IN1%;\n";
 				case ABS:
-					return "	T %TMP% = fabs(%IN1%);\n";
+					return "\t\tT %TMP% = fabs(%IN1%);\n";
 				case SIN:
 					return "	T %TMP% = sin(%IN1%);\n";
 				case COS:
@@ -217,7 +217,7 @@ public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
 				case LOG:
 					return "		T %TMP% = log(%IN1%);\n";
 				case ROUND:
-					return "	T %TMP% = round(%IN1%);\n";
+					return "\t\tT %TMP% = round(%IN1%);\n";
 				case CEIL:
 					return "	T %TMP% = ceil(%IN1%);\n";
 				case FLOOR:
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
index 05109413ce0..5a3211435a7 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
@@ -23,11 +23,11 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.api.DMLScript;
+import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.cp.DoubleObject;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
-import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 
@@ -38,101 +38,75 @@ public class SpoofCUDACellwise extends SpoofCellwise implements SpoofCUDAOperato
 	private static final Log LOG = LogFactory.getLog(SpoofCUDACellwise.class.getName());
 	private final int ID;
 	private final PrecisionProxy call;
-	private Pointer ptr;
 	private final SpoofCellwise fallback_java_op;
+	private final long ctx;
 	
 	public SpoofCUDACellwise(CellType type, boolean sparseSafe, boolean containsSeq, AggOp aggOp, int id,
-			PrecisionProxy ep, SpoofCellwise fallback) {
+		PrecisionProxy ep, SpoofCellwise fallback)
+	{
 		super(type, sparseSafe, containsSeq, aggOp);
 		ID = id;
 		call = ep;
-		ptr = null;
 		fallback_java_op = fallback;
+		ctx = SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA);
 	}
 	
 	@Override
-	public ScalarObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs, ArrayList<ScalarObject> scalarObjects) {
+	public ScalarObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
+		ArrayList<ScalarObject> scalarObjects)
+	{
 		double[] result = new double[1];
-		// ToDo: this is a temporary "solution" before perf opt
-		int NT=256;
-		long N = inputs.get(0).getNumRows() * inputs.get(0).getNumColumns();
-		long num_blocks = ((N + NT * 2 - 1) / (NT * 2));
-		Pointer ptr = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks);
-		long[] out = {1,1,1, 0, 0, GPUObject.getPointerAddress(ptr)};
-		int offset = 1;
-		if(call.exec(ec, this, ID, prepareInputPointers(ec, inputs, offset), 
-			prepareSideInputPointers(ec, inputs, offset, false), out, scalarObjects, 0 ) != 0) {
-			LOG.error("SpoofCUDA " + getSpoofType() + " operator failed to execute. Trying Java fallback.\n");
-			// ToDo: java fallback
+		Pointer[] ptr = new Pointer[1];
+		packDataForTransfer(ec, inputs, scalarObjects, null, 1, ID, 0,false, ptr);
+		if(NotEmpty(inputs.get(0).getGPUObject(ec.getGPUContext(0)))) {
+			if(call.exec(this) != 0)
+				LOG.error("SpoofCUDA " + getSpoofType() + " operator " + ID + " failed to execute!\n");
 		}
-		LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr, result, getName(), false);
-		
+		LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr[0], result, getName(), false);
+		ec.getGPUContext(0).cudaFreeHelper(getSpoofType(), ptr[0], DMLScript.EAGER_CUDA_FREE);
 		return new DoubleObject(result[0]);
 	}
 	
 	@Override public String getName() {
 		return getSpoofType();
 	}
-	
-	@Override public void setScalarPtr(Pointer _ptr) {
-		ptr = _ptr;
-	}
-	
-	@Override public Pointer getScalarPtr() {
-		return ptr;
-	}
-	
-	@Override public void releaseScalarGPUMemory(ExecutionContext ec) {
-		if(ptr != null) {
-			ec.getGPUContext(0).cudaFreeHelper(getSpoofType(), ptr, DMLScript.EAGER_CUDA_FREE);
-			ptr = null;
-		}
-	}
-	
+
 	@Override
-	public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs, ArrayList<ScalarObject> scalarObjects,
-			String outputName) {
-		
+	public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
+		ArrayList<ScalarObject> scalarObjects, String outputName)
+	{
 		long out_rows = ec.getMatrixObject(outputName).getNumRows();
 		long out_cols = ec.getMatrixObject(outputName).getNumColumns();
-		MatrixObject a = inputs.get(0);
-		GPUContext gctx = ec.getGPUContext(0);
-		int m = (int) a.getNumRows();
-		int n = (int) a.getNumColumns();
-		double[] scalars = prepInputScalars(scalarObjects);
+
 		if(_type == CellType.COL_AGG)
 			out_rows = 1;
 		else if(_type == SpoofCellwise.CellType.ROW_AGG)
 			out_cols = 1;
-		
+
+		double[] scalars = prepInputScalars(scalarObjects);
 		boolean sparseSafe = isSparseSafe() || ((inputs.size() < 2) && 
-				genexec( 0, new SideInput[0], scalars, m, n, 0, 0 ) == 0);
-		
-//		ec.setMetaData(outputName, out_rows, out_cols);
-		GPUObject g = a.getGPUObject(gctx);
-		boolean sparseOut = _type == CellType.NO_AGG && sparseSafe && g.isSparse();
-		
-		long nnz = g.getNnz("spoofCUDA" + getSpoofType(), false);
-		if(sparseOut)
-			LOG.warn("sparse out");
+				genexec( 0, new SideInput[0], scalars, (int) inputs.get(0).getNumRows(),
+					(int) inputs.get(0).getNumColumns(), 0, 0 ) == 0);
+
+		GPUObject in_obj = inputs.get(0).getGPUObject(ec.getGPUContext(0));
+		boolean sparseOut = _type == CellType.NO_AGG && sparseSafe && in_obj.isSparse();
+		long nnz = in_obj.getNnz("spoofCUDA" + getSpoofType(), false);
 		MatrixObject out_obj = sparseOut ?
 				(ec.getSparseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols, (isSparseSafe() && nnz > 0) ?
 						nnz : out_rows * out_cols).getKey()) :
 				(ec.getDenseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols).getKey());
-		
-		int offset = 1;
-		if(!inputIsEmpty(a.getGPUObject(gctx)) || !sparseSafe) {
-			if(call.exec(ec, this, ID, prepareInputPointers(ec, inputs, offset), prepareSideInputPointers(ec, inputs, offset, false),
-				prepareOutputPointers(ec, out_obj, sparseOut), scalarObjects, 0) != 0) {
-				LOG.error("SpoofCUDA " + getSpoofType() + " operator failed to execute. Trying Java fallback.(ToDo)\n");
-				// ToDo: java fallback
-			}
+
+		packDataForTransfer(ec, inputs, scalarObjects, out_obj, 1, ID, 0,false, null);
+		if(NotEmpty(in_obj) || !sparseSafe) {
+			if(call.exec(this) != 0)
+				LOG.error("SpoofCUDA " + getSpoofType() + " operator " + ID + " failed to execute!\n");
 		}
 		return out_obj;
 	}
 	
-	private static boolean inputIsEmpty(GPUObject g) {
-		return g.getDensePointer() == null && g.getSparseMatrixCudaPointer() == null;
+	private static boolean NotEmpty(GPUObject g) {
+		// ToDo: check if that check is sufficient
+		return g.getDensePointer() != null || g.getSparseMatrixCudaPointer() != null;
 	}
 	
 	// used to determine sparse safety
@@ -140,15 +114,11 @@ private static boolean inputIsEmpty(GPUObject g) {
 	protected double genexec(double a, SideInput[] b, double[] scalars, int m, int n, long gix, int rix, int cix) {
 		return fallback_java_op.genexec(a, b, scalars, m, n, 0, 0, 0);
 	}
-	
-	public int execute_sp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars) {
-		return execute_f(ctx, meta, in, sides, out, scalars);	
-	}
-	
-	public int execute_dp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars) {
-		return execute_d(ctx, meta, in, sides, out, scalars);
-	}
-	
-	public static native int execute_f(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
-	public static native int execute_d(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
+
+	public int execute_dp(long ctx) { return execute_d(ctx); } // double-precision launch
+	public int execute_sp(long ctx) { return execute_f(ctx); } // single-precision launch
+	public long getContext() { return ctx; }
+
+	public static native int execute_d(long ctx);
+	public static native int execute_f(long ctx); // name must match the exported JNI symbol ..._SpoofCUDACellwise_execute_1f
 }
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java
index 118f97a0e13..02fbf961aad 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java
@@ -19,7 +19,10 @@
 
 package org.apache.sysds.runtime.codegen;
 
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import jcuda.Pointer;
+
 import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
@@ -27,146 +30,112 @@
 import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 
-import java.util.ArrayList;
-
+import static jcuda.runtime.cudaError.cudaSuccess;
 import static org.apache.sysds.runtime.matrix.data.LibMatrixCUDA.sizeOfDataType;
 
 public interface SpoofCUDAOperator  {
-	int JNI_MAT_ENTRY_SIZE = 6;
+	// byte sizes of one serialized matrix descriptor and of the transfer header; both have equivalents in native code:
+	int JNI_MAT_ENTRY_SIZE = 40;
+	int TRANSFERRED_DATA_HEADER_SIZE = 32;
+
 	abstract class PrecisionProxy {
 		protected final long ctx;
 		
 		public PrecisionProxy() { ctx = SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA);	}
 		
-		public abstract int exec(ExecutionContext ec, SpoofCUDAOperator op, int opID, long[] in, long[] sides, long[] out,
-				ArrayList<ScalarObject> scalarObjects, long grix);
-		
-		protected Pointer transferScalars(ExecutionContext ec, SpoofCUDAOperator op, int sizeOfDataType,
-				ArrayList<ScalarObject> scalarObjects) {
-			double[] s = SpoofOperator.prepInputScalars(scalarObjects);
-			Pointer ptr = ec.getGPUContext(0).allocate(op.getName(), (long) scalarObjects.size() * sizeOfDataType);
-			LibMatrixCUDA.cudaSupportFunctions.hostToDevice(ec.getGPUContext(0), s, ptr, op.getName());
-			return ptr;
-		}
+		public abstract int exec(SpoofCUDAOperator op);
 	}
 	
 	String getName();
-	
-	void setScalarPtr(Pointer ptr);
-	
-	Pointer getScalarPtr();
-	
-	void releaseScalarGPUMemory(ExecutionContext ec);
-	
-	default long [] prepareInputPointers(ExecutionContext ec, ArrayList<MatrixObject> inputs, int offset) {
-		long [] in = new long[offset * JNI_MAT_ENTRY_SIZE];
-		for(int i = 0; i < offset; i++) {
-			int j = i  * JNI_MAT_ENTRY_SIZE;
-			
-			if(inputs.get(i).getGPUObject(ec.getGPUContext(0)).isSparse()) {
-				in[j] = ec.getGPUSparsePointerAddress(inputs.get(i)).nnz;
-				in[j + 1] = inputs.get(i).getNumRows();
-				in[j + 2] = inputs.get(i).getNumColumns();
-				in[j + 3] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).rowPtr);
-				in[j + 4] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).colInd);
-				in[j + 5] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).val);
-			}
-			else {
-				in[j] = inputs.get(i).getNnz();
-				in[j + 1] = inputs.get(i).getNumRows();
-				in[j + 2] = inputs.get(i).getNumColumns();
-				in[j + 5] = ec.getGPUDensePointerAddress(inputs.get(i));
-			}
-		}
-		return in;
+
+	default void writeMatrixDescriptorToBuffer(ByteBuffer dst, int rows, int cols, long row_ptr,
+		long col_idx_ptr, long data_ptr, long nnz)
+	{ // appends one matrix descriptor to dst: 8 + 4 + 4 + 8 + 8 + 8 = 40 bytes (JNI_MAT_ENTRY_SIZE)
+		dst.putLong(nnz); // 8 bytes
+		dst.putInt(rows); // 4 bytes
+		dst.putInt(cols); // 4 bytes
+		dst.putLong(row_ptr); // 8 bytes, 0 when the caller has no row pointer (dense case)
+		dst.putLong(col_idx_ptr); // 8 bytes, 0 when the caller has no column index pointer (dense case)
+		dst.putLong(data_ptr); // 8 bytes, address of the values
 	}
-	
-	default long [] prepareSideInputPointers(ExecutionContext ec, ArrayList<MatrixObject> inputs, int offset, boolean tB1) {
-		long[] sides = new long[(inputs.size() - offset) * JNI_MAT_ENTRY_SIZE];
-		for(int i = offset; i < inputs.size(); i++) {
-			int j = (i - offset)  * JNI_MAT_ENTRY_SIZE;
-			if(inputs.get(i).getGPUObject(ec.getGPUContext(0)).isSparse()) {
-				sides[j] = ec.getGPUSparsePointerAddress(inputs.get(i)).nnz;
-				sides[j + 1] = inputs.get(i).getNumRows();
-				sides[j + 2] = inputs.get(i).getNumColumns();
-				sides[j + 3] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).rowPtr);
-				sides[j + 4] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).colInd);
-				sides[j + 5] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).val);
-			}
-			else {
-				if(tB1 && j == 0) {
-					long rows = inputs.get(i).getNumRows();
-					long cols = inputs.get(i).getNumColumns();
-					Pointer b1 = inputs.get(i).getGPUObject(ec.getGPUContext(0)).getDensePointer();
-					Pointer ptr = ec.getGPUContext(0).allocate(getName(), rows * cols * sizeOfDataType);
-					
-//					double[] tmp1 = new double[(int) (rows * cols)];
-//					LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), b1, tmp1, getName(), false);
-//
-//					System.out.println("Mat before transpose: rows=" + rows + " cols=" + cols + "\n");
-//					for(int m = 0; m < rows; m++) {
-//						StringBuilder sb = new StringBuilder();
-//						for(int n = 0; n < cols; n++)
-//							sb.append(" " + tmp1[(int) (cols * m + n)]);
-//						System.out.println(sb.toString());
-//					}
-					
-					LibMatrixCUDA.denseTranspose(ec, ec.getGPUContext(0), getName(),
-						b1, ptr, rows, cols);
-					
-//					double[] tmp2 = new double[(int) (rows * cols)];
-//					LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr, tmp2, getName(), false);
-//
-//					System.out.println("Mat after transpose: rows=" + cols + " cols=" + rows + "\n");
-//					for(int m = 0; m < cols; m++) {
-//						StringBuilder sb = new StringBuilder();
-//						for(int n = 0; n < rows; n++)
-//							sb.append(" " + tmp2[(int) (rows * m + n)]);
-//						System.out.println(sb.toString());
-//					}
 
-					sides[j] = inputs.get(i).getNnz();
-					sides[j + 1] = cols;
-					sides[j + 2] = rows;
-					sides[j + 5] = GPUObject.getPointerAddress(ptr);
-					
-				} else {
-					sides[j] = inputs.get(i).getNnz();
-					sides[j + 1] = inputs.get(i).getNumRows();
-					sides[j + 2] = inputs.get(i).getNumColumns();
-					sides[j + 5] = ec.getGPUDensePointerAddress(inputs.get(i));
-				}
+	default void prepareMatrixPointers(ByteBuffer buf, ExecutionContext ec, MatrixObject mo, boolean tB1) { // writes the descriptor of mo (on GPU context 0) into buf
+		if(mo.getGPUObject(ec.getGPUContext(0)).isSparse()) { // sparse: pass row pointer, column index and value addresses plus the sparse nnz
+			writeMatrixDescriptorToBuffer(buf, (int)mo.getNumRows(), (int)mo.getNumColumns(),
+				GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(mo).rowPtr),
+					GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(mo).colInd),
+						GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(mo).val),
+							ec.getGPUSparsePointerAddress(mo).nnz);
+		}
+		else {
+			if(tB1) { // dense + tB1: describe a freshly allocated transposed copy instead of the original buffer
+				int rows = (int)mo.getNumRows();
+				int cols = (int)mo.getNumColumns();
+				Pointer b1 = mo.getGPUObject(ec.getGPUContext(0)).getDensePointer();
+				Pointer ptr = ec.getGPUContext(0).allocate(getName(), (long) rows * cols * sizeOfDataType); // NOTE(review): ptr is not released in this method - confirm who frees it
+				LibMatrixCUDA.denseTranspose(ec, ec.getGPUContext(0), getName(), b1, ptr, rows, cols);
+				writeMatrixDescriptorToBuffer(buf, rows, cols, 0, 0, GPUObject.getPointerAddress(ptr), mo.getNnz()); // NOTE(review): the replaced code reported (cols, rows) for the transposed copy - confirm native side expects (rows, cols)
+			} else {
+				writeMatrixDescriptorToBuffer(buf, (int)mo.getNumRows(), (int)mo.getNumColumns(), 0, 0,
+					ec.getGPUDensePointerAddress(mo), mo.getNnz());
 			}
 		}
-		return sides;
 	}
-	
-	default long[] prepareOutputPointers(ExecutionContext ec, MatrixObject output, boolean sparseOut) {
-		long[] out = {0,0,0,0,0,0};
 
-		if(sparseOut) {
-			out[0] = ec.getGPUSparsePointerAddress(output).nnz;
-			out[1] = output.getNumRows();
-			out[2] = output.getNumColumns();
-			out[3] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(output).rowPtr);
-			out[4] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(output).colInd);
-			out[5] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(output).val);
+	default void packDataForTransfer(ExecutionContext ec, ArrayList<MatrixObject> inputs,
+		ArrayList<ScalarObject> scalarObjects, MatrixObject out_obj, int num_inputs, int ID, long grix, boolean tB1,
+			Pointer[] ptr)
+	{
+		int op_data_size = (inputs.size() + 1) * JNI_MAT_ENTRY_SIZE + scalarObjects.size() * Double.BYTES + TRANSFERRED_DATA_HEADER_SIZE;
+		Pointer staging = new Pointer();
+		if(SpoofOperator.getNativeStagingBuffer(staging, this.getContext(), op_data_size) != cudaSuccess)
+			throw new RuntimeException("Failed to get native staging buffer from spoof operator");
+		ByteBuffer buf = staging.getByteBuffer();
+		buf.putInt(op_data_size);
+		buf.putInt(ID);
+		buf.putInt((int)grix);
+		buf.putInt(num_inputs);
+		buf.putInt(inputs.size() - num_inputs);
+		buf.putInt(out_obj == null ? 0 : 1);
+		buf.putInt(scalarObjects.size());
+		buf.putInt(-1); // padding
+
+		// copy input & side input pointers
+		for(int i=0; i < inputs.size(); i++) {
+			if(i == num_inputs)
+				prepareMatrixPointers(buf, ec, inputs.get(i), tB1);
+			else
+				prepareMatrixPointers(buf, ec, inputs.get(i), false);
 		}
-		else {
-			out[0] = output.getNnz();
-			out[1] = output.getNumRows();
-			out[2] = output.getNumColumns();
-			out[5] = ec.getGPUDensePointerAddress(output);
+
+		// copy output pointers or allocate buffer for reduction
+ 		if(out_obj == null) {
+			long num_blocks = 1;
+			if(this instanceof SpoofCUDACellwise) {
+				int NT = 256;
+				long N = inputs.get(0).getNumRows() * inputs.get(0).getNumColumns();
+				num_blocks = ((N + NT * 2 - 1) / (NT * 2));
+			}
+			ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks);
+			writeMatrixDescriptorToBuffer(buf, 1, 1, 0, 0, GPUObject.getPointerAddress(ptr[0]), 1);
+		}
+ 		else {
+			prepareMatrixPointers(buf, ec, out_obj, false);
+		}
+
+ 		// copy scalar values (no pointers)
+		for(ScalarObject scalarObject : scalarObjects) {
+			buf.putDouble(scalarObject.getDoubleValue());
 		}
-		return out;
 	}
-	
-	MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs, 
+
+	MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
 			ArrayList<ScalarObject> scalarObjects, String outputName);
 	
 	ScalarObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
 		ArrayList<ScalarObject> scalarObjects);
-	
-	int execute_sp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
-	int execute_dp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
+
+	int execute_dp(long ctx);
+	int execute_sp(long ctx);
+	long getContext();
 }
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
index b5686c7dc85..2632faeb84f 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
@@ -23,11 +23,11 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.api.DMLScript;
+import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.cp.DoubleObject;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
-import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 
 import java.util.ArrayList;
@@ -37,55 +37,40 @@ public class SpoofCUDARowwise extends SpoofRowwise implements SpoofCUDAOperator
 	private static final Log LOG = LogFactory.getLog(SpoofCUDARowwise.class.getName());
 	private final int ID;
 	private final PrecisionProxy call;
-	private Pointer ptr;
+	private final long ctx;
 	
 	public SpoofCUDARowwise(RowType type,  long constDim2, boolean tB1, int reqVectMem, int id,
-		PrecisionProxy ep) {
+		PrecisionProxy ep)
+	{
 		super(type, constDim2, tB1, reqVectMem);
 		ID = id;
 		call = ep;
-		ptr = null;
+		ctx = SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA);
 	}
 	
 	@Override public String getName() {
 		return getSpoofType();
 	}
-	
-	@Override public void setScalarPtr(Pointer _ptr) {
-		ptr = _ptr;
-	}
-	
-	@Override public Pointer getScalarPtr() {
-		return ptr;
-	}
-	
-	@Override public void releaseScalarGPUMemory(ExecutionContext ec) {
-		if(ptr != null) {
-			ec.getGPUContext(0).cudaFreeHelper(getSpoofType(), ptr, DMLScript.EAGER_CUDA_FREE);
-			ptr = null;
-		}
-	}
-	
+
 	@Override
 	public ScalarObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
-		ArrayList<ScalarObject> scalarObjects) {
+		ArrayList<ScalarObject> scalarObjects)
+	{
 		double[] result = new double[1];
-		Pointer ptr = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType);
-		long[] out = {1,1,1, 0, 0, GPUObject.getPointerAddress(ptr)};
-		int offset = 1;
-		if(call.exec(ec, this, ID, prepareInputPointers(ec, inputs, offset), prepareSideInputPointers(ec, inputs, offset, _tB1),
-				out, scalarObjects, 0) != 0) {
-			LOG.error("SpoofCUDA " + getSpoofType() + " operator failed to execute. Trying Java fallback.\n");
-			// ToDo: java fallback
-		}
-		LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr, result, getName(), false);
+		Pointer[] ptr = new Pointer[1];
+		packDataForTransfer(ec, inputs, scalarObjects, null, 1, ID, 0,_tB1, ptr);
+		if(call.exec(this) != 0)
+			LOG.error("SpoofCUDA " + getSpoofType() + " operator " + ID + " failed to execute!\n");
+
+		LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr[0], result, getName(), false);
+		ec.getGPUContext(0).cudaFreeHelper(getSpoofType(), ptr[0], DMLScript.EAGER_CUDA_FREE);
 		return new DoubleObject(result[0]);
 	}
 	
 	@Override
 	public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
-		ArrayList<ScalarObject> scalarObjects, String outputName) {
-		
+		ArrayList<ScalarObject> scalarObjects, String outputName)
+	{
 		int m = (int) inputs.get(0).getNumRows();
 		int n = (int) inputs.get(0).getNumColumns();
 		final int n2 = _type.isConstDim2(_constDim2) ? (int)_constDim2 : _type.isRowTypeB1() ||
@@ -93,13 +78,12 @@ public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
 		OutputDimensions out_dims = new OutputDimensions(m, n, n2);
 		ec.setMetaData(outputName, out_dims.rows, out_dims.cols);
 		MatrixObject out_obj = ec.getDenseMatrixOutputForGPUInstruction(outputName, out_dims.rows, out_dims.cols).getKey();
-		
-		int offset = 1;
-		if(call.exec(ec,this, ID, prepareInputPointers(ec, inputs, offset), prepareSideInputPointers(ec, inputs, 
-				offset, _tB1), prepareOutputPointers(ec, out_obj, false), scalarObjects, 0) != 0) {
-			LOG.error("SpoofCUDA " + getSpoofType() + " operator failed to execute. Trying Java fallback.\n");
-			// ToDo: java fallback
-		}
+
+		packDataForTransfer(ec, inputs, scalarObjects, out_obj, 1, ID, 0,_tB1, null);
+
+		if(call.exec(this) != 0)
+			LOG.error("SpoofCUDA " + getSpoofType() + " operator " + ID + " failed to execute!\n");
+
 		return out_obj;
 	}
 	
@@ -110,15 +94,11 @@ public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
 	// unused
 	@Override protected void genexec(double[] avals, int[] aix, int ai, SideInput[] b, double[] scalars, double[] c,
 		int ci, int alen, int n, long grix, int rix) { }
-	
-	public int execute_sp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars) {
-		return execute_f(ctx, meta, in, sides, out, scalars);
-	}
-	
-	public int execute_dp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars) {
-		return execute_d(ctx, meta, in, sides, out, scalars);
-	}
-	
-	public static native int execute_f(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
-	public static native int execute_d(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
+
+	public int execute_dp(long ctx) { return execute_d(ctx); }
+	public int execute_sp(long ctx) { return execute_s(ctx); } // single precision must use the native execute_s, not the double-precision execute_d
+	public long getContext() { return ctx; }
+
+	public static native int execute_d(long ctx);
+	public static native int execute_s(long ctx);
 }
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
index fe8d932c448..f2700911fab 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
@@ -23,6 +23,7 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 
+import jcuda.Pointer;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.DMLRuntimeException;
@@ -321,4 +322,6 @@ public void reset() {
 			currColPos = 0;
 		}
 	}
+
+	public static native int getNativeStagingBuffer(Pointer ptr, long context, int size);
 }
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
index 9fd1bcfa481..b54d6fe4f94 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
@@ -35,7 +35,6 @@
 import org.apache.sysds.runtime.instructions.InstructionUtils;
 import org.apache.sysds.runtime.instructions.cp.CPOperand;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
-import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.lineage.LineageItem;
 import org.apache.sysds.runtime.lineage.LineageItemUtils;
 import org.apache.sysds.utils.GPUStatistics;
@@ -52,21 +51,14 @@ public class SpoofCUDAInstruction extends GPUInstruction {
 	public final CPOperand _out;
 	
 	public static class SinglePrecision extends SpoofCUDAOperator.PrecisionProxy {
-		public int exec(ExecutionContext ec, SpoofCUDAOperator op, int opID, long[] in, long[] sides, long[] out,
-				ArrayList<ScalarObject> scalarObjects, long grix) {
-			op.setScalarPtr(transferScalars(ec, op, Sizeof.FLOAT, scalarObjects));
-			long[] _metadata = { opID, grix, in.length, sides.length, out.length, scalarObjects.size() };
-			return op.execute_sp(ctx, _metadata, in, sides, out, GPUObject.getPointerAddress(op.getScalarPtr()));
+		public int exec(SpoofCUDAOperator op) {
+			return op.execute_sp(ctx);
 		}
 	}
 	
 	public static class DoublePrecision extends SpoofCUDAOperator.PrecisionProxy {
-		public int exec(ExecutionContext ec, SpoofCUDAOperator op, int opID, long[] in, long[] sides, long[] out,
-				ArrayList<ScalarObject> scalarObjects, long grix) {
-			if(!scalarObjects.isEmpty())
-				op.setScalarPtr(transferScalars(ec, op, Sizeof.DOUBLE, scalarObjects));
-			long[] _metadata = { opID, grix, in.length, sides.length, out.length, scalarObjects.size() };
-			return op.execute_dp(ctx, _metadata, in, sides, out, GPUObject.getPointerAddress(op.getScalarPtr()));
+		public int exec(SpoofCUDAOperator op) {
+			return op.execute_dp(ctx);
 		}
 	}
 	
@@ -101,7 +93,6 @@ public static SpoofCUDAInstruction parseInstruction(String str) {
 		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
 
 		ArrayList<CPOperand> inlist = new ArrayList<>();
-//		Integer op_id =  CodegenUtils.getCUDAopID(parts[2].split("\\.")[1]);
 		Integer op_id = CodegenUtils.getCUDAopID(parts[2]);
 		Class<?> cla = CodegenUtils.getClass(parts[2]);
 		SpoofOperator fallback_java_op = CodegenUtils.createInstance(cla);
@@ -141,11 +132,9 @@ else if(_out.getDataType() == Types.DataType.SCALAR) {
 				ScalarObject out = _op.execute(ec, inputs, scalars);
 				ec.setScalarOutput(_out.getName(), out);
 			}
-			
-			_op.releaseScalarGPUMemory(ec);
 		}
 		catch(Exception ex) {
-			LOG.error("SpoofCUDAInstruction: " + _op.getName() + " operator failed to execute. Trying Java fallback.(ToDo)\n");
+			LOG.error("SpoofCUDAInstruction: " + _op.getName() + " operator failed to execute :(\n");
 			
 			throw new DMLRuntimeException(ex);
 		}
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index efbf2011bd2..633e3fc7d26 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -1168,4 +1168,5 @@ public long getDensePointerAddress() {
 	
 	public static long getPointerAddress(Pointer p) {
 		return (p == null) ?  0 : getPointerAddressInternal(p);
-	}}
+	}
+}